
A Multithreaded Web Crawler in Python (Part 2)

The full source follows. The code targets Python 2 (urllib.urlopen, raw_input, print statements). Global counters cur, last, totalcount and depth track breadth-first progress, and a single threading.Condition, t_mutex, serializes every access to the shared queue, log file and counters.

#!/usr/bin/env python
# coding=utf-8
import threading
import urllib
import re

# Shared crawl state, guarded by t_mutex in the threaded version:
# cur        -- number of pages crawled so far
# last       -- total count at which the current depth level ends
# totalcount -- total number of URLs seen so far
# depth      -- current breadth-first depth
cur = 0
last = 0
totalcount = 0
depth = 0
t_mutex = threading.Condition()


def getLinks(url):
    """Download url and return every http:// URL quoted in the page,
    or None if the download fails."""
    try:
        page = urllib.urlopen(url)
        html = page.read()
        reg = r'"(http://.+?)"'
        regob = re.compile(reg, re.DOTALL)
        links = regob.findall(html)
        return links
    except:
        print 'Failed downloading and saving', url
        return None


class Mycrawler:
    def __init__(self, crawlername, seeds, threadnum):
        self.crawlername = crawlername
        self.seeds = seeds
        self.crawqueue = CrawQueue()
        self.initQueue(self.seeds)
        self.threadnum = threadnum
        self.threadpools = []
        self.logfile = open('log2.txt', 'w')

    def initQueue(self, seeds):
        # Accept a single seed URL or a list of them.
        if isinstance(seeds, str):
            self.crawqueue.push(seeds)
        elif isinstance(seeds, list):
            for seed in seeds:
                self.crawqueue.push(seed)
        global last
        global totalcount
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount

    def crawling(self):
        # Single-threaded breadth-first crawl.
        global cur
        global depth
        global last
        global totalcount
        self.log(">>>Depth " + str(depth) + ":\n")
        while self.crawqueue.getQueueCount() != 0:
            url = self.crawqueue.pop()
            if url is None:
                continue
            self.log(url)
            self.crawqueue.addToVisited(url)
            links = getLinks(url)
            if links is None:
                print 'None'
                self.crawqueue.addToFailed(url)
                continue
            beforenum = self.crawqueue.getQueueCount()
            self.crawqueue.addLinks(links)
            afternum = self.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            cur += 1
            # Once every URL of the current level has been crawled,
            # the next depth level begins.
            if cur == last:
                depth += 1
                self.log(">>>Depth " + str(depth) + ":\n")
                last = totalcount

    def crawling2(self):
        # Multithreaded crawl: start up to threadnum worker threads per
        # round, then wait for all of them before starting the next round.
        global last
        global totalcount
        global depth
        self.log(">>>Depth " + str(depth) + ":\n")
        totalcount = self.crawqueue.getQueueCount()
        last = totalcount
        while self.crawqueue.getQueueCount() != 0:
            self.threadpools = []
            for i in range(self.threadnum):
                url = self.crawqueue.pop()
                if url is None:
                    break
                crawthread = crawlerThread(url, i, self)
                self.threadpools.append(crawthread)
                crawthread.start()
            for crawthread in self.threadpools:
                crawthread.join(30)

    def log(self, content):
        self.logfile.write(content + "\n")


class crawlerThread(threading.Thread):
    def __init__(self, url, tid, mycrawler):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
        self.mycrawler = mycrawler

    def run(self):
        global t_mutex
        global cur
        global last
        global totalcount
        global depth
        # Every access to the shared queue, log file and counters is
        # serialized through t_mutex; only the download runs unlocked.
        t_mutex.acquire()
        self.mycrawler.log(self.url)
        t_mutex.release()
        links = getLinks(self.url)  # network I/O happens outside the lock
        if links is None:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            self.mycrawler.crawqueue.addToFailed(self.url)
            t_mutex.release()
        else:
            t_mutex.acquire()
            self.mycrawler.crawqueue.addToVisited(self.url)
            beforenum = self.mycrawler.crawqueue.getQueueCount()
            self.mycrawler.crawqueue.addLinks(links)
            afternum = self.mycrawler.crawqueue.getQueueCount()
            totalcount += afternum - beforenum
            t_mutex.release()
        t_mutex.acquire()
        cur += 1
        if cur == last:
            depth += 1
            self.mycrawler.log(">>>Depth " + str(depth) + ":\n")
            last = totalcount
        t_mutex.release()


class CrawQueue:
    def __init__(self):
        self.queue = []    # URLs waiting to be crawled
        self.visited = []  # URLs already crawled
        self.failed = []   # URLs that could not be downloaded

    def getQueue(self):
        return self.queue

    def getVisited(self):
        return self.visited

    def getFailed(self):
        return self.failed

    def push(self, url):
        # FIFO: insert at the front, pop() takes from the back.
        if url != "" and url not in self.queue and url not in self.visited:
            self.queue.insert(0, url)

    def pop(self):
        if len(self.queue) == 0:
            # print 'failed to pop: queue is empty'
            return None
        else:
            return self.queue.pop()

    def isEmpty(self):
        if len(self.queue) == 0:
            return 1
        else:
            return 0

    def addToVisited(self, url):
        self.visited.append(url)

    def addToFailed(self, url):
        self.failed.append(url)

    def remove(self, url):
        self.queue.remove(url)

    def getVisitedCount(self):
        return len(self.visited)

    def getQueueCount(self):
        return len(self.queue)

    def addLinks(self, links):
        for link in links:
            self.push(link)


if __name__ == "__main__":
    seeds = "http://www.douban.com/"
    threadnum = int(raw_input("Number of threads: "))
    crawlername = "little crawler"
    mycrawler = Mycrawler(crawlername, seeds, threadnum)
    mycrawler.crawling2()
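For comparison, the same fetch-parse-enqueue round can be written with the Python 3 standard library, where concurrent.futures.ThreadPoolExecutor replaces the hand-rolled thread pool and a plain threading.Lock plays the role of t_mutex. This is a minimal sketch, not part of the original post: the names crawl, worker, max_pages and the 10-second timeout are illustrative assumptions, not anything the original code defines.

# Python 3 sketch of the same idea (illustrative, not from the original post).
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)

def get_links(url):
    """Fetch a page and return the http:// URLs quoted in it, or None on failure."""
    try:
        html = urlopen(url, timeout=10).read().decode("utf-8", errors="ignore")
        return LINK_RE.findall(html)
    except Exception:
        return None

def crawl(seed, threadnum=4, max_pages=50):
    visited = set()        # pages already fetched
    frontier = [seed]      # pages still to fetch
    lock = threading.Lock()

    def worker(url):
        links = get_links(url) or []
        with lock:  # one writer at a time, like t_mutex above
            for link in links:
                if link not in visited and link not in frontier:
                    frontier.append(link)

    with ThreadPoolExecutor(max_workers=threadnum) as pool:
        while frontier and len(visited) < max_pages:
            batch = []
            while frontier and len(batch) < threadnum:
                url = frontier.pop()
                if url not in visited:
                    visited.add(url)
                    batch.append(url)
            # Block until the whole batch finishes, mirroring the
            # join() loop in crawling2() above.
            list(pool.map(worker, batch))
    return visited

if __name__ == "__main__":
    for page in sorted(crawl("http://www.douban.com/", threadnum=4, max_pages=20)):
        print(page)

Because the executor joins its workers when the with block exits, no explicit threadpools list or join(30) loop is needed; the Lock is only held while the frontier is mutated, so downloads still overlap just as they do in the original.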

