Multi-threaded Web Crawler in Python (Part 2)

Full source of the multi-threaded crawler: a Mycrawler driver, a crawlerThread worker class, and a CrawQueue frontier, with shared depth counters guarded by a global lock.
#!/usr/bin/env python
# coding=utf-8
import threading
import urllib
import re

# Shared crawl state, guarded by t_mutex:
# cur        - number of URLs processed so far
# last       - value totalcount had when the current depth's frontier was fixed
# totalcount - total URLs ever enqueued
# depth      - current crawl depth
cur = 0
last = 0
totalcount = 0
depth = 0
t_mutex = threading.Condition()


def getLinks(url):
    """Download url and return every quoted http:// link in its HTML,
    or None if the download fails."""
    try:
        page = urllib.urlopen(url)
        html = page.read()
        regob = re.compile(r'"(http://.+?)"', re.DOTALL)
        return regob.findall(html)
    except Exception:
        print 'Failed downloading and saving', url
        return None


class Mycrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = open('log2.txt', 'w')

    def initQueue(self, seeds):
        # Accept a single seed URL or a list of them.
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        # Single-threaded breadth-first crawl, kept for comparison.
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            if url is None:
                continue
            self.log(url)
            self.crawqueue.addToVisited(url)
            links = getLinks(url)
            if links is None:
                self.crawqueue.addToFailed(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            if cur == last:
                # Every URL of the current depth has been processed.
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # Multi-threaded crawl: launch up to threadnum threads per round,
        # then wait for them before starting the next round.
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            for i in range(self.threadnum):
                # Take the shared lock: worker threads also touch the queue.
                t_mutex.acquire()
                url = self.crawqueue.pop()
                t_mutex.release()
                if url is None:
                    break
                crawthread = crawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for crawthread in self.threadpools:
                # Threads from earlier rounds have already finished,
                # so re-joining them returns immediately.
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")


class crawlerThread(threading.Thread):
    # Fetches one URL, then merges the discovered links and the depth
    # bookkeeping back into the shared state under t_mutex.
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global cur
        global last
        global totalcount
        global depth
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = getLinks(self.url)
        if links is None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()


class CrawQueue:
    # URL frontier plus visited/failed bookkeeping.
    def __init__(self):
        self.queue = []
        self.visited = []
        self.failed = []

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        # Skip empty strings and URLs already queued or visited.
        if url != "" and url not in self.queue and url not in self.visited:
            self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            return None
        return self.queue.pop()

    def isEmpty(self):
        return len(self.queue) == 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)


if __name__ == "__main__":
    seeds = "http://www.douban.com/"
    threadnum = int(raw_input("Number of threads: "))
    crawlername = "little crawler"
    mycrawler = Mycrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
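How the depth counters work: totalcount is the number of URLs ever enqueued, cur the number processed, and last the value totalcount had when the current depth's frontier was fixed. When cur reaches last, every URL of the current level has been handled, so depth advances and last moves up to totalcount. With one seed that yields 10 links, for example: totalcount and last start at 1; after the seed is processed, totalcount is 11 and cur == last == 1, so depth becomes 1 and last becomes 11.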
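The same pattern ports naturally to Python 3, where a thread-safe queue.Queue can replace the hand-rolled lock around the frontier. Below is a minimal sketch under that assumption; fetch_links, crawl, and MAX_PAGES are hypothetical names, not part of the original code:

# Minimal Python 3 sketch (assumed port, not the original code): a fixed
# pool of workers shares a thread-safe queue.Queue frontier, so only the
# visited set and the page counter need an explicit lock.
import queue
import re
import threading
import urllib.request

LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)
MAX_PAGES = 50  # stop enqueueing after this many fetches so the sketch terminates

def fetch_links(url):
    """Return every quoted http:// link in url's HTML, or [] on failure."""
    try:
        with urllib.request.urlopen(url, timeout=10) as page:
            html = page.read().decode("utf-8", errors="replace")
        return LINK_RE.findall(html)
    except Exception:
        return []

def crawl(seed, threadnum):
    frontier = queue.Queue()
    frontier.put(seed)
    seen = {seed}
    lock = threading.Lock()
    fetched = [0]  # mutable counter shared by the workers

    def worker():
        while True:
            url = frontier.get()
            if url is None:          # sentinel: shut this worker down
                frontier.task_done()
                return
            links = fetch_links(url)
            with lock:
                fetched[0] += 1
                if fetched[0] < MAX_PAGES:
                    for link in links:
                        if link not in seen:
                            seen.add(link)
                            frontier.put(link)
            frontier.task_done()

    threads = [threading.Thread(target=worker) for _ in range(threadnum)]
    for t in threads:
        t.start()
    frontier.join()                  # block until every queued URL is processed
    for _ in threads:
        frontier.put(None)           # one sentinel per worker
    for t in threads:
        t.join()
    return seen

if __name__ == "__main__":
    pages = crawl("http://www.douban.com/", 4)
    print(len(pages), "URLs discovered")

queue.Queue.join() plus one None sentinel per worker gives the clean-shutdown behavior that crawling2 approximates with its per-round join(30).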