首页 > 代码库 > Python实例 -- 爬虫
Python实例 -- 爬虫
# -*- coding: utf-8 -*-
"""Scrape ip/port entries from a proxy listing page and check which ones work.

Listing page template: http://www.xici.net.co/nn/%d

The module keeps the scraped entries in the global ``proxylist`` and offers
``ProxyCheck``, a thread that probes every entry, sorts the working ones by
response time and writes them to a file.
"""

import re
import threading
import time

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request provides the same names used below
    import urllib.request as urllib2

# True on Python 2, where ``str`` is the byte-string type.
_PY2 = str is bytes

# Scraped entries; each item is [ip, port, protocol, addr].
proxylist = []

# One table row of the listing page.  Groups: addr (img alt), ip, port, protocol.
# NOTE(review): the "http://www.mamicode.com/" fragments look like an artifact
# of the site this snippet was copied from (probably originally src=".+?" and
# href=".+?") -- confirm against the real listing markup before relying on it.
_ROW_PATTERN = re.compile(
    r'<td><img alt="(.+?)" src="http://www.mamicode.com/.+?" /></td>[\s\S]*?'
    r'<td>(.+?)</td>[\s\S]*?'
    r'<td>(.+?)</td>[\s\S]*?'
    r'<td>[\s\S]*?<a href="http://www.mamicode.com/.+?">.+?</a>[\s\S]*?</td>[\s\S]*?'
    r'<td>.+?</td>[\s\S]*?'
    r'<td>(.+?)</td>'
)


def _to_native_str(data):
    """Return *data* as the interpreter's native ``str`` type.

    On Python 2 the response body already is ``str`` and is returned untouched;
    on Python 3 the bytes are decoded (assumes UTF-8 pages; undecodable bytes
    are replaced rather than raising).
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


def get_proxy_from_cnproxy(first_page=1, last_page=1):
    """Download listing pages and append every proxy found to ``proxylist``.

    Args:
        first_page: number of the first listing page to fetch (inclusive).
        last_page: number of the last listing page to fetch (inclusive).
            The defaults fetch page 1 only, as the original script did.

    Each appended entry is ``[ip, port, protocol, addr]``.  Network errors
    raised by ``urlopen`` propagate to the caller.
    """
    # ``proxylist`` is only mutated (never rebound), so no ``global`` is needed.
    for page in range(first_page, last_page + 1):
        target = r"http://www.xici.net.co/nn/%d" % page
        print(target)
        response = urllib2.urlopen(target)
        try:
            html = _to_native_str(response.read())
        finally:
            response.close()
        for addr, ip, port, protocol in _ROW_PATTERN.findall(html):
            proxylist.append([ip, port, protocol, addr])
    print(proxylist)


class ProxyCheck(threading.Thread):
    """Thread that probes proxies and saves the working ones, fastest first."""

    def __init__(self, proxylist, fname):
        """Store the work list and output path.

        Args:
            proxylist: iterable of ``[ip, port, protocol, addr]`` entries.
            fname: path of the file ``save()`` writes the results to.
        """
        super(ProxyCheck, self).__init__()
        self.proxylist = proxylist
        self.timeout = 5  # seconds allowed for each probe request
        self.test_url = "http://www.baidu.com/"
        # Marker that must appear in the fetched page for the proxy to count as
        # working (presumably a string from the genuine Baidu page -- verify).
        self.test_str = "030173"
        # Working proxies as [ip, port, protocol, addr, seconds_used].
        self.checkedPProxyList = []
        self.fname = fname

    def checkProxy(self):
        """Fetch ``test_url`` through every proxy and record the ones that work.

        A proxy is recorded in ``checkedPProxyList`` when the request succeeds
        within ``timeout`` and the response contains ``test_str``.  Failures
        are printed and skipped.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_url = r'http://%s:%s' % (proxy[0], proxy[1])
            proxy_handler = urllib2.ProxyHandler({"http": proxy_url})
            opener = urllib2.build_opener(cookies, proxy_handler)
            opener.addheaders = [('user-agent', 'mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3')]
            started = time.time()
            try:
                # Use the opener directly instead of install_opener(): installing
                # it would route every later urlopen() in the process through
                # the last proxy that was tested.
                response = opener.open(self.test_url, timeout=self.timeout)
                try:
                    body = _to_native_str(response.read())
                finally:
                    response.close()
            except Exception as e:
                # Best-effort probe: any failure just means "proxy unusable".
                print(e)
                continue
            timeused = time.time() - started
            # ``in`` also accepts a marker at the very start of the body, which
            # the former ``find(...) > 1`` test rejected.
            if self.test_str in body:
                self.checkedPProxyList.append(
                    [proxy[0], proxy[1], proxy[2], proxy[3], timeused])

    def sort(self):
        """Sort ``checkedPProxyList`` in place by response time, fastest first."""
        # The former sorted(...) call built a sorted copy and threw it away.
        self.checkedPProxyList.sort(key=lambda entry: entry[4])

    def save(self):
        """Write one ``ip:port<TAB>protocol<TAB>addr<TAB>seconds`` line per proxy."""
        # Python 3 needs an explicit encoding for non-ASCII addr values;
        # Python 2's open() takes no such argument and writes the bytes as-is.
        open_kwargs = {} if _PY2 else {"encoding": "utf-8"}
        with open(self.fname, 'w+', **open_kwargs) as f:
            for proxy in self.checkedPProxyList:
                f.write("%s:%s\t%s\t%s\t%s\n" % (
                    proxy[0], proxy[1], proxy[2], proxy[3], str(proxy[4])))

    def run(self):
        """Thread entry point: check, sort, then save."""
        self.checkProxy()
        self.sort()
        self.save()


if __name__ == "__main__":
    get_proxy_from_cnproxy()
    t1 = ProxyCheck(proxylist, "test.txt")
    t1.start()
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉(投诉/举报)。工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。