首页 > 代码库 > 基于Python的urllib2模块的多线程网络爬虫程序
基于Python的urllib2模块的多线程网络爬虫程序
# Multithreaded page fetcher built on urllib2, plus a small driver that scans
# song pages for the number preceding the 'chart#fav' marker.
#
# The code targets Python 2 (urllib2 / Queue); the import fallback below lets
# the module be imported on Python 3 as well.
import socket
import time
import zlib
from gzip import GzipFile
from io import BytesIO
from threading import Lock, Thread

try:  # Python 2 module names
    import urllib2
    from Queue import Queue
except ImportError:  # Python 3 equivalents exposing the same attributes
    import urllib.request as urllib2
    from queue import Queue

# URL pattern of the pages requested by main(); %d is the numeric song id.
SONG_URL = 'http://www.songtaste.com/song/%d/'
# Text searched for in each page; the count is read from just before it.
FAV_MARKER = 'chart#fav'


class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip/deflate capabilities to urllib2 requests."""

    def http_request(self, req):
        """Advertise gzip and deflate support on the outgoing request."""
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    def http_response(self, req, resp):
        """Return *resp* with a gzip/deflate body transparently decoded.

        Responses with any other (or no) Content-Encoding are returned
        unchanged.
        """
        encoding = resp.headers.get("content-encoding")
        if encoding == "gzip":
            # The body is bytes, so it must be buffered in BytesIO.
            body = GzipFile(fileobj=BytesIO(resp.read()), mode="r")
        elif encoding == "deflate":
            body = BytesIO(deflate(resp.read()))
        else:
            return resp
        # addinfourl re-attaches headers, URL and status code to the new body.
        decoded = urllib2.addinfourl(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded


def deflate(data):
    """Decompress *data* sent with ``Content-Encoding: deflate``.

    Raw deflate (no zlib header) is tried first; if that fails the data is
    decoded as a regular zlib stream.  Raises ``zlib.error`` if neither works.
    """
    try:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)


# Set a global socket timeout (seconds) so a stalled request cannot block a
# worker indefinitely.
socket.setdefaulttimeout(10)

encoding_support = ContentEncodingProcessor
opener = urllib2.build_opener(encoding_support, urllib2.HTTPHandler)


class Fetcher(object):
    """Pool of daemon threads that download URLs pushed onto a queue.

    Requests go in through push(); each finished download comes out of pop()
    as a ``(page, request)`` tuple, where ``page`` is '' if the fetch failed.
    """

    def __init__(self, threads):
        """Start *threads* daemon worker threads."""
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = Lock()    # guards self.running
        self.q_req = Queue()  # pending requests
        self.q_ans = Queue()  # finished (page, request) results
        self.threads = threads
        # Must exist before any worker starts: workers update it right away.
        self.running = 0
        for _ in range(threads):
            worker = Thread(target=self.threadget)
            worker.daemon = True
            worker.start()

    def __del__(self):
        """On destruction, wait until both queues have been fully processed."""
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        """Return queued requests + unread results + downloads in flight.

        The value is a snapshot: a worker removes a request from the queue
        slightly before it increments ``running``, so the sum can briefly
        under-count.
        """
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        """Queue *req* (a URL or request object) for download."""
        self.q_req.put(req)

    def pop(self):
        """Block until a result is available and return ``(page, request)``."""
        result = self.q_ans.get()
        # Mark the result as consumed so q_ans.join() in __del__ can return.
        self.q_ans.task_done()
        return result

    def threadget(self):
        """Worker loop: fetch requests forever and queue their results."""
        while True:
            req = self.q_req.get()

            with self.lock:  # the counter update must be atomic
                self.running += 1

            ans = ''
            try:
                resp = self.opener.open(req)
                try:
                    content = resp.read()
                finally:
                    resp.close()
                if not isinstance(content, str):
                    # Python 3 returns bytes.  NOTE(review): the page charset
                    # is not known here; only ASCII text is searched later, so
                    # undecodable bytes are simply replaced.
                    content = content.decode('utf-8', 'replace')
                ans = content
            except Exception as what:
                # Best effort: report the failure and hand back an empty page.
                print(what)

            # Queue the result before decrementing so taskleft() does not drop
            # to zero while this result is still on its way.
            self.q_ans.put((ans, req))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            time.sleep(0.01)  # don't spam


def parse_fav_count(page, marker=FAV_MARKER):
    """Return the integer written just before the last ',' preceding *marker*.

    Returns None when *marker* is absent from *page* or no comma precedes it,
    and 0 when the comma is not directly preceded by digits.
    """
    marker_pos = page.find(marker)
    if marker_pos < 0:
        return None
    comma_pos = page.rfind(',', 0, marker_pos)
    if comma_pos < 0:
        return None
    start = comma_pos
    while start > 0 and '0' <= page[start - 1] <= '9':
        start -= 1
    digits = page[start:comma_pos]
    return int(digits) if digits else 0


def main(first_id=1, last_id=3599999, threads=50):
    """Fetch every song page in [first_id, last_id] and print non-zero counts.

    Returns a dict mapping song id to count for the ids whose value modulo
    10000 is at most 5 (the ids the original script recorded).
    """
    fetcher = Fetcher(threads=threads)
    for song_id in range(first_id, last_id + 1):
        fetcher.push(SONG_URL % song_id)

    sampled = {}
    # Exactly one result is produced per pushed URL, so count results instead
    # of polling taskleft(), which can momentarily read zero too early.
    for _ in range(last_id - first_id + 1):
        the_page, url = fetcher.pop()
        count = parse_fav_count(the_page)
        if count is None:
            continue
        song_id = int(url.rstrip('/').rsplit('/', 1)[-1])
        if song_id % 10000 <= 5:
            sampled[song_id] = count
        if count != 0:
            print(song_id)
            print(count)
    return sampled


if __name__ == "__main__":
    main()
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。