Author: 小项-怪物猪

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#-- Author: 小项 --
#-- Preview: http://www.20hotel.com/news --

import sys
import os
import re
import random
import urllib2
import time
import datetime
#import socket
import MySQLdb as mysql

# Python 2 only: reload() re-exposes setdefaultencoding().
reload(sys)

sys.setdefaultencoding('utf-8')
   
#-- Change into the image download directory --
os.chdir('img')

#urllib2.socket.setdefaulttimeout(15)

User = 'username'
Passwd = 'password'
Host = 'localhost'
Db = 'dbname'

home = "http://www.8264.com/"

#-- Connect to the database --
contents = mysql.connect(user=User, passwd=Passwd, host=Host, db=Db, charset='utf8').cursor()

lsid = []

pnext = []

#-- Split the list-page numbers into segments of ten --
for sid in xrange(1, 100, 10):
    lsid.append(str(sid))

print "List segmentation", lsid, "done."
for tid in reversed(xrange(2, len(lsid) + 1)):
    for i in reversed(xrange(int(lsid[tid - 2]), int(lsid[tid - 1]))):
        #print i
        #== Fetch the list page ==#
        request = urllib2.Request("http://www.8264.com/portal-list-catid-251-page-" + str(i) + ".html")
        request.add_header('User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
        # The inner findall isolates the article-list block; the outer one
        # extracts each article URL from it.
        for u in reversed(re.findall('<h2><a href=\"(.*?)\" title=\'', re.findall('<div class=\"title_8264\">(.*?)<div class=\"pg\">', urllib2.urlopen(request).read(), re.DOTALL)[0], re.DOTALL)):
            #print u
            #-- Fetch the article page --
            newsurl = urllib2.Request(u)
            newsurl.add_header('User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
            news = urllib2.urlopen(newsurl).read()
            time.sleep(int(random.uniform(1, 5)))
            #-- Extract the title --
            title = re.findall('<div class=\"newstitle\">(.*?)<\/div>', news, re.DOTALL)
            #-- Extract the post time --
            # ('\xcc\xed\xbc\xd3\xca\xb1\xbc\xe4\xa3\xba' is GBK for "添加时间：",
            #  i.e. "added at:"; the '<' closing the capture group is an assumed
            #  terminator, since a trailing non-greedy group matches nothing.)
            dates = list(eval(re.sub('\,0', ',', re.sub(':| |-', ',', re.findall('<td align=\"center\" valign=\"middle\">.*?<div style=\"line-height:1.8;text-align:center;\">\xcc\xed\xbc\xd3\xca\xb1\xbc\xe4\xa3\xba(.*?)<', news, re.DOTALL)[0]))))
            #-- Normalize the time --
            #-- e.g. 2011-05-10 08:19 to 1305010787.029 --
            ttime = datetime.datetime(dates[0], dates[1], dates[2], dates[3], dates[4])
            ptime = time.mktime(ttime.timetuple())
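            # Worked example of the pipeline above, assuming the page shows
            # '2011-05-10 08:19':
            #   re.sub(':| |-', ',', ...) -> '2011,05,10,08,19'
            #   re.sub('\,0', ',', ...)   -> '2011,5,10,8,19' (a leading zero,
            #     e.g. '08', would be an invalid octal literal under eval())
            #   eval() -> (2011, 5, 10, 8, 19) -> datetime -> mktime -> epoch seconds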
   
            #-- Extract the author --
            # ('\xd7\xf7\xd5\xdf\xa3\xba' is GBK for "作者：", i.e. "author:")
            author = re.sub('<.*?>', '', re.findall('\xd7\xf7\xd5\xdf\xa3\xba(.*?)<br\/><a', news, re.DOTALL)[0])

            #-- Collect pagination links --
            page = re.findall('<div class=\"pg\">(.*?)<\/div>', news, re.DOTALL)
            if page != []:
                pnext = re.findall('<a href=\"(.*?)\">[0-9]*<\/a>', page[0], re.DOTALL)
                one_img = []
                one_txt = re.sub('<[aA].*?>|<\/[aA]>', '', re.findall('<div class=\"newstext\">(.*?)<\/div>', news, re.DOTALL)[0])
                # Drop the remote image-URL prefixes so the stored text
                # references the locally downloaded filenames.
                newstxt = re.sub('http:\/\/image\.8264\.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image\.8264\.com\/portal\/photo\/[0-9]*\/[0-9]*\/', '', one_txt)
                one_img.extend(re.findall('<IMG src=\"(.*?)\">', one_txt, re.DOTALL))
                for one_dimg in one_img:
                    #-- Download images embedded in the article --
                    one_syscurl = 'wget -q ' + one_dimg
                    os.system(one_syscurl)
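                    # NOTE: os.system() receives the URL unquoted, so one with
                    # shell metacharacters such as '&' would break; wrapping the
                    # URL in pipes.quote() would harden all three wget calls.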
                for p in pnext:
                    #print p, "\n"
                    more_img = []
                    morepage = urllib2.Request(p)
                    morepage.add_header('User-Agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
                    pnewtxt = urllib2.urlopen(morepage).read()
                    txt = re.sub('<[aA].*?>|<\/[aA]>', '', re.findall('<div class=\"newstext\">(.*?)<\/div>', pnewtxt, re.DOTALL)[0])
                    #-- Content to be stored --
                    ntxt = re.sub('http:\/\/image\.8264\.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image\.8264\.com\/portal\/photo\/[0-9]*\/[0-9]*\/', '', txt)
                    #-- Handle images in this page of the content --
                    more_img.extend(re.findall('<IMG src=\"(.*?)\">', txt, re.DOTALL))
                    for more_dimg in more_img:
                        more_syscurl = 'wget -q ' + more_dimg
                        os.system(more_syscurl)

                    newstxt += ntxt
                texts = title[0].decode('gbk', 'ignore').encode('utf-8'), newstxt.decode('gbk', 'ignore').encode('utf-8'), author.decode('gbk', 'ignore').encode('utf-8'), ptime
                #-- Insert the row into the database --
                contents.execute("INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, `content`, `author`, `order`, `state_radio`, `time`, `view_num`, `img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', '', '53');", texts)
                print "Published:", title[0].decode('gbk', 'ignore').encode('utf-8'), "by", author.decode('gbk', 'ignore').encode('utf-8'), "posted at", tuple(dates)
                time.sleep(int(random.uniform(30, 90)))
            else:
                #pass
                only_img = []
                only_txt = re.sub('<[aA].*?>|<\/[aA]>', '', re.findall('<div class=\"newstext\">(.*?)<\/div>', news, re.DOTALL)[0])
                newstxt = re.sub('http:\/\/image\.8264\.com\/portal\/[0-9]*\/[0-9]*\/|http:\/\/image\.8264\.com\/portal\/photo\/[0-9]*\/[0-9]*\/', '', only_txt)
                only_img.extend(re.findall('<IMG src=\"(.*?)\">', only_txt, re.DOTALL))
                for only_dimg in only_img:
                    only_syscurl = 'wget -q ' + only_dimg
                    os.system(only_syscurl)
                texts = title[0].decode('gbk', 'ignore').encode('utf-8'), newstxt.decode('gbk', 'ignore').encode('utf-8'), author.decode('gbk', 'ignore').encode('utf-8'), ptime
                contents.execute("INSERT INTO `dbname`.`table_name` (`aid`, `class_id`, `title`, `content`, `author`, `order`, `state_radio`, `time`, `view_num`, `img`, `CityID`) VALUES (NULL, '2', %s, %s, %s, '0', '2', %s, '0', '', '53');", texts)
                print "Published:", title[0].decode('gbk', 'ignore').encode('utf-8'), "by", author.decode('gbk', 'ignore').encode('utf-8'), "posted at", tuple(dates)
                time.sleep(int(random.uniform(30, 90)))
   
   print"第",i,"页采集完成.休息一下,进入下一页采集."
   #--停顿一会--
   time.sleep(int(random.uniform(1200,3200)))
#--关闭数据库连接--
contents.close();
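For reference, the INSERT above assumes `dbname`.`table_name` already exists with matching columns. A rough, untested guess at such a table (every column type here is an assumption; the real schema belongs to whatever CMS the target site runs) could be created once beforehand:

import MySQLdb as mysql

setup = mysql.connect(user='username', passwd='password', host='localhost', db='dbname', charset='utf8').cursor()
setup.execute("""
CREATE TABLE IF NOT EXISTS `dbname`.`table_name` (
    `aid`         INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    `class_id`    INT,
    `title`       VARCHAR(255),
    `content`     TEXT,
    `author`      VARCHAR(100),
    `order`       INT,
    `state_radio` INT,
    `time`        INT,
    `view_num`    INT,
    `img`         VARCHAR(255),
    `CityID`      INT
) DEFAULT CHARSET=utf8
""")
setup.close()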
Category: Ubuntu | Tags: scraping
There is 1 comment so far »
  1. Simon

    I'm a Python beginner. I'm about to buy a house, so I want to do some research first and read the relevant information.

    My question is this:

    If I search Baidu for articles with the keyword "开发商猫腻" (developer tricks) and need to save them as text, how would I write that script? Or is there a ready-made library I could use?

    Thanks
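    A minimal sketch of one way to start, using the same toolkit as the script above (urllib2 plus regexes). The Baidu query parameter 'wd', the User-Agent, and the output filename are assumptions here, and a real HTML parser such as BeautifulSoup would be more robust than tag-stripping regexes:

    # -*- coding: utf-8 -*-
    import re
    import urllib
    import urllib2

    keyword = '开发商猫腻'
    # 'wd' is Baidu's query parameter (an assumption based on its public
    # search URLs; confirm one in a browser first).
    url = 'http://www.baidu.com/s?' + urllib.urlencode({'wd': keyword})

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0')
    html = urllib2.urlopen(request).read()

    # Strip tags the same crude way the script above does, then save.
    text = re.sub(re.compile('<.*?>', re.DOTALL), '', html)
    open('baidu_results.txt', 'w').write(text)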
