首页 > 代码库 > Python实例 -- 爬虫

Python实例 -- 爬虫

 1 #coding="utf-8" 2  3 import urllib2 4 import re 5 import threading 6 import time 7  8 """ 9 抓取代理发布页的ip和port10 http://www.xici.net.co/nn/%d11 """12 13 proxylist = []14 15 16 def get_proxy_from_cnproxy():17     global proxylist18     19     p = re.compile(r<td><img alt="(.+?)" src="http://www.mamicode.com/.+?" /></td>[\s\S]*?<td>(.+?)</td>[\s\S]*?<td>(.+?)</td>[\s\S]*?<td>[\s\S]*?<a href="http://www.mamicode.com/.+?">.+?</a>[\s\S]*?</td>[\s\S]*?<td>.+?</td>[\s\S]*?<td>(.+?)</td>)20     21     for i in range(1,2):22         target = r"http://www.xici.net.co/nn/%d" %i23         print target24         req = urllib2.urlopen(target)25         result = req.read()26         matchs = p.findall(result)27         for record in matchs:28             addr = record[0]29             ip = record[1]30             port = record[2]31             protocol = record[3]32             l = [ip, port, protocol, addr]33             #print l34             proxylist.append(l)    35         print proxylist36 37 38 class ProxyCheck(threading.Thread):39     def __init__(self, proxylist, fname):40         threading.Thread.__init__(self)41         self.proxylist = proxylist42         self.timeout = 543         self.test_url = "http://www.baidu.com/"44         self.test_str = "030173"45         self.checkedPProxyList = []46         self.fname = fname47         48     def checkProxy(self):49         cookies = urllib2.HTTPCookieProcessor()50         for proxy in self.proxylist:51             proxy_handler = urllib2.ProxyHandler({"http":rhttp://%s:%s%(proxy[0],proxy[1])})52             opener = urllib2.build_opener(cookies, proxy_handler)53             opener.addheaders = [(user-agent, mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3)]54             urllib2.install_opener(opener)55             t1 = time.time()56             try:57                 req = urllib2.urlopen(self.test_url, timeout = self.timeout)58                 result = req.read()59                 timeused = time.time() - t160                 pos = result.find(self.test_str)61                 if pos > 1:62                     self.checkedPProxyList.append([proxy[0],proxy[1],proxy[2],proxy[3],timeused])63                 else:64                     continue;65             except Exception,e:66                 print e.message67                 continue;68             69     def sort(self):70         sorted(self.checkedPProxyList,cmp=lambda x,y:cmp(x[4],y[4]))71     72     def save(self):73         f = open(self.fname, w+)74         for proxy in self.checkedPProxyList:75             f.write("%s:%s\t%s\t%s\t%s\n"%(proxy[0],proxy[1],proxy[2],proxy[3],str(proxy[4])))76         f.close()77         78     def run(self):79         self.checkProxy()80         self.sort()81         self.save()82         83 84 if __name__ == "__main__":85     get_proxy_from_cnproxy()86     t1 = ProxyCheck(proxylist,"test.txt")87     t1.start()