首页 > 代码库 > ip代理池-基于mongodb数据库
ip代理池-基于mongodb数据库
代码使用 Python 2.7 编写:抓取西刺(xici)免费代理,逐个检测可用性后存入 MongoDB 数据库,为以后的爬虫做准备。下面直接上代码:
# -*- coding: utf-8 -*-
"""Free-proxy pool backed by MongoDB.

Scrapes HTTP proxies from the xicidaili listing pages, upserts them into the
``xici.xiciipinfo`` collection, then probes each stored proxy and deletes the
ones that fail the probe.

The code is written to run unchanged on Python 2.7 and Python 3.
"""

import time
from multiprocessing import Pool  # unused by this script; import retained from the original

import pymongo
import requests
from lxml import etree


class Getproxy(object):
    """Scrape proxies into MongoDB and prune the ones that do not work."""

    def __init__(self):
        """Set up the request headers, the listing URL and the Mongo handles."""
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.115 Safari/537.36'
            ),
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    @staticmethod
    def _cell_text(row, path):
        """Return the first node matched by ``path`` under ``row``, or ''."""
        found = row.xpath(path)
        return found[0] if found else ''

    def getip(self, num):
        """Scrape listing page ``num`` and upsert every proxy row found.

        Only ``<tr class="odd">`` rows are read. Each row is stored as a
        document with the keys ``ip``, ``port``, ``protocol`` and ``area``,
        keyed on ``ip`` so re-scraping updates instead of duplicating.

        Args:
            num: Page number appended to the listing URL.
        """
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers)
        html = etree.HTML(wb_data.text)
        # Read the cells row by row. Collecting each column separately and
        # zipping them misaligns every following row as soon as one row lacks
        # a cell (for example a missing <a> inside td[4]).
        for row in html.xpath('//tr[@class="odd"]'):
            ip = self._cell_text(row, 'td[2]/text()')
            port = self._cell_text(row, 'td[3]/text()')
            if not ip or not port:
                # Without both an address and a port the row is unusable.
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': self._cell_text(row, 'td[6]/text()'),
                'area': self._cell_text(row, 'td[4]/a/text()'),
            }
            print(data)
            # update_one(..., upsert=True) replaces the deprecated
            # Collection.update(filter, update, True) call.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape pages ``1`` to ``num - 1``, pausing 2 seconds after each.

        Note that the upper bound is exclusive: ``count(2)`` scrapes page 1
        only.

        Args:
            num: Exclusive upper bound of the page range.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        Returns:
            A list of ``{'http': 'http://<ip>:<port>'}`` dictionaries, one per
            document in the collection.
        """
        return [
            {'http': 'http://' + doc['ip'] + ':' + doc['port']}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe ``proxy`` and delete it from the collection when it fails.

        Args:
            proxy: A ``{'http': 'http://<ip>:<port>'}`` dictionary as produced
                by ``getiplist``.
        """
        # Drop the leading 'http://' (7 characters) and the ':<port>' suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except requests.RequestException:
            # Only request failures mark the proxy as dead. A bare ``except``
            # would also swallow KeyboardInterrupt and delete a proxy on Ctrl-C.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            # delete_many replaces the deprecated Collection.remove(filter),
            # which removed every matching document.
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


if __name__ == '__main__':
    proxy = Getproxy()
    try:
        proxy.count(2)
        # Explicit loop: map() is lazy on Python 3 and would run no checks.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the connection even if scraping or checking raised.
        proxy.dbclose()
ip代理池-基于mongodb数据库
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。