首页 > 代码库 > 一个简单的Python网络爬虫(抓图),针对某论坛.

一个简单的Python网络爬虫(抓图),针对某论坛.

 1 #coding:utf-8 2 import urllib2 3 import re 4 import threading 5  6 #图片下载 7 def loadImg(addr,x,y,artName): 8     data =http://www.mamicode.com/ urllib2.urlopen(addr).read() 9     f = open(artName.decode("utf-8")+str(y)+.jpg, wb)10     f.write(data)11     f.close()12     13 #具体帖子页面解析,得到图片链接地址,并使用loadImg下载 artName为帖子名14 def getImgLink(html,x,artName):15     relink = <img src="http://www.mamicode.com/.*" file="(.*)" width=".*" id=".*" alt=".*.jpg" />16     cinfo = re.findall(relink,html)17     y = 018     for lin in cinfo:19         imgAddr =  http://www.xxx.com/+lin20         print "LoadImg:"+str(x),imgAddr+\n21         t = threading.Thread(target=loadImg(imgAddr,x,y,artName)) #使用threading 多线程下载22         t.start()23         y = y+124         25 #论坛版块页面解析,得到具体帖子链接        26 def getArticleLink(html,page):27     relink = <a href="http://www.mamicode.com/(viewthread/.php/?tid=.*3D.*)">(.*)</a>28     cinfo = re.findall(relink,html)29     x = 130     for lin in cinfo:31         #print lin,‘\n‘32         url="http://www.xxx.com/"+lin[0]33         headers={"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}34         req = urllib2.Request(url,headers=headers)35         response= urllib2.urlopen(req)36         html = response.read()37         getImgLink(html,x,lin[1])38         x = x+139         40 start = 1 #起始页41 end = 100 #终止页42 for page in range(end):43     url="http://www.xxx.com/forumdisplay.php?fid=19&page="+str(page+start)44     headers={"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}45     req = urllib2.Request(url,headers=headers)46     response= urllib2.urlopen(req)47     html = response.read()48     printStart49     getArticleLink(html,page)

 

一个简单的Python网络爬虫(抓图),针对某论坛.