
A multi-threaded web crawler based on Python's urllib2 module

import urllib2
from threading import Thread, Lock
from Queue import Queue
from gzip import GzipFile
from StringIO import StringIO
import time
import socket
import zlib


class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests."""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(fileobj=StringIO(resp.read()), mode="r")
            # addinfourl: class to add info() and geturl() methods to an open file
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO(deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp


# deflate support
def deflate(data):  # zlib only provides the zlib compress format, not the deflate format;
    try:            # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)


# set a global socket timeout so stuck requests do not block worker threads forever
socket.setdefaulttimeout(10)

# build an opener with transparent gzip/deflate decoding;
# note: the worker threads below call urllib2.urlopen directly, so this opener
# is only used if wired in (see the commented-out lines in threadget)
encoding_support = ContentEncodingProcessor
opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)


class Fetcher:
    def __init__(self, threads):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = Lock()      # thread lock
        self.q_req = Queue()    # request (task) queue
        self.q_ans = Queue()    # answer (result) queue
        self.threads = threads
        for i in range(threads):
            t = Thread(target=self.threadget)
            t.setDaemon(True)
            t.start()
        self.running = 0

    def __del__(self):  # wait for both queues to drain before tearing down
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        self.q_req.put(req)

    def pop(self):
        return self.q_ans.get()

    def threadget(self):
        while True:
            ans = ''
            req = self.q_req.get()

            with self.lock:  # keep the counter update atomic (critical section)
                self.running += 1

            try:
                # ans = self.opener.open(req).read()
                # content = opener.open(req).read()   # gzip-aware alternative
                content = urllib2.urlopen(req).read()
                ans = str(content)
            except Exception, what:
                print what

            self.q_ans.put((ans, req))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            time.sleep(0.01)  # don't spam the server


if __name__ == "__main__":
    a = [0] * 3600000
    links = ['http://www.songtaste.com/song/%d/' % i for i in range(1, 3600000)]
    f = Fetcher(threads=50)
    for url in links:
        f.push(url)
    while f.taskleft():
        the_page, x = f.pop()
        try:
            npos = the_page.index('chart#fav')
        except ValueError:
            pass
        else:
            # walk backwards from 'chart#fav' to the preceding comma
            k = 0
            for j in range(npos, 1, -1):
                if the_page[j] == ',':
                    k = j
                    break
            # read the digits before the comma, least significant first
            total = 0
            t = 1
            for j in range(k - 1, 1, -1):
                if '0' <= the_page[j] <= '9':
                    total += int(the_page[j]) * t
                    t *= 10
                else:
                    break
            p = int(x[30:-1])   # numeric song id taken from the URL
            if p % 10000 <= 5:  # only record/report a small sample of ids
                a[p] = total
                if total != 0:
                    print p
                    print total
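
For clarity, here is a minimal usage sketch of the Fetcher class on its own, assuming the code above is in the same module; the short URL list and small thread count are illustrative placeholders, not values from the original script.

# Minimal usage sketch of Fetcher (assumes the class above is defined in this module).
# The URL list and thread count are placeholders chosen for illustration.
links = ['http://www.songtaste.com/song/%d/' % i for i in range(1, 6)]
f = Fetcher(threads=4)
for url in links:
    f.push(url)              # enqueue requests
while f.taskleft():
    page, url = f.pop()      # blocks until a worker has finished one request
    print url, len(page)     # e.g. report how many bytes came back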
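Note that the script builds a gzip-aware opener at module level but the worker threads call urllib2.urlopen directly, so responses are fetched without the decoding handler. The sketch below shows one way to route urlopen through the handler via urllib2.install_opener; this wiring is an assumption based on the commented-out opener.open lines, not necessarily what the original author ran.

# Sketch: make urllib2.urlopen use the gzip/deflate-aware opener built above.
opener = urllib2.build_opener(ContentEncodingProcessor, urllib2.HTTPHandler)
urllib2.install_opener(opener)   # from here on, urllib2.urlopen(...) goes through this opener
html = urllib2.urlopen('http://www.songtaste.com/song/1/').read()  # decoded transparently if gzipped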