Scrapy in Practice 1: Distributed Crawling of Youyuan.com
Straight to the code:
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YouyuanwangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # avatar URL
    header_url = scrapy.Field()
    # username
    username = scrapy.Field()
    # inner monologue
    monologue = scrapy.Field()
    # photo album image URLs
    pic_urls = scrapy.Field()
    # place of origin
    place_from = scrapy.Field()
    # education
    education = scrapy.Field()
    # age
    age = scrapy.Field()
    # height
    height = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hobbies
    hobby = scrapy.Field()
    # source site: youyuan
    source = scrapy.Field()
    # source URL of the profile page
    source_url = scrapy.Field()
    # spider name
    spider = scrapy.Field()
spiders > youyuan.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
# from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


class YouyuanSpider(CrawlSpider):
# class YouyuanSpider(RedisCrawlSpider):
    name = 'youyuan'
    allowed_domains = ['www.youyuan.com']
    # Youyuan list (search result) page
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # redis_key = 'YouyuanSpider:start_urls'

    # Build the allowed-domains list dynamically (used in the RedisCrawlSpider variant)
    # def __init__(self, *args, **kwargs):
    #     # Dynamically define the allowed domains list.
    #     domain = kwargs.pop('domain', '')
    #     self.allowed_domains = filter(None, domain.split(','))
    #     super(YouyuanSpider, self).__init__(*args, **kwargs)

    # Match the whole country
    # list_page = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))

    # Only match search pages for Beijing, women aged 18-25; links are extracted from the response
    page_links = LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Profile page rule; links are extracted from the response
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        # List pages are only followed (used as a springboard), no callback
        Rule(page_links),
        # Profile links become requests saved to redis and wait to be scheduled;
        # once a response arrives, parse_profile_page() handles it; do not follow further
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    # Parse a profile page and extract the data we want
    def parse_profile_page(self, response):
        item = YouyuanwangItem()
        # avatar URL
        item['header_url'] = self.get_header_url(response)
        # username
        item['username'] = self.get_username(response)
        # place of origin
        item['place_from'] = self.get_place_from(response)
        # education
        item['education'] = self.get_education(response)
        # age
        item['age'] = self.get_age(response)
        # height
        item['height'] = self.get_height(response)
        # salary
        item['salary'] = self.get_salary(response)
        # hobbies
        item['hobby'] = self.get_hobby(response)
        # photo album image URLs
        item['pic_urls'] = self.get_pic_urls(response)
        # inner monologue
        item['monologue'] = self.get_monologue(response)
        # source URL of the profile page
        item['source_url'] = response.url
        # source site: youyuan
        item['source'] = "youyuan"
        # spider name
        item['spider'] = "youyuan"
        yield item

    # Extract the avatar URL
    def get_header_url(self, response):
        header = response.xpath('//dl[@class="personal_cen"]/dt/img/@src').extract()
        if len(header):
            header_url = header[0]
        else:
            header_url = ""
        return header_url.strip()

    # Extract the username
    def get_username(self, response):
        username = response.xpath('//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()').extract()
        if len(username):
            username = username[0]
        else:
            username = ""
        return username.strip()

    # Extract the age
    def get_age(self, response):
        age = response.xpath('//dl[@class="personal_cen"]//p[@class="local"]/text()').extract()
        if len(age):
            age = age[0].split()[1]
        else:
            age = ""
        return age

    # Extract the height
    def get_height(self, response):
        height = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()').extract()
        if len(height):
            height = height[0]
        else:
            height = ""
        return height.strip()

    # Extract the salary
    def get_salary(self, response):
        salary = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()').extract()
        if len(salary):
            salary = salary[0]
        else:
            salary = ""
        return salary.strip()

    # Extract hobbies
    def get_hobby(self, response):
        hobby = response.xpath('//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
        if len(hobby):
            hobby = ",".join(hobby).replace(" ", "")
        else:
            hobby = ""
        return hobby.strip()

    # Extract album image URLs
    def get_pic_urls(self, response):
        pic_urls = response.xpath('//div[@class="ph_show"]/ul/li/a/img/@src').extract()
        if len(pic_urls):
            # join the list of album URLs into a single string
            pic_urls = ",".join(pic_urls)
        else:
            pic_urls = ""
        return pic_urls

    # Extract the inner monologue
    def get_monologue(self, response):
        monologue = response.xpath('//div[@class="pre_data"]/ul/li/p/text()').extract()
        if len(monologue):
            monologue = monologue[0]
        else:
            monologue = ""
        return monologue.strip()

    # Extract the place of origin
    def get_place_from(self, response):
        place_from = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()').extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = ""
        return place_from.strip()

    # Extract education
    def get_education(self, response):
        education = response.xpath('//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()').extract()
        if len(education):
            education = education[0]
        else:
            education = ""
        return education.strip()
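As written, the spider runs as a standalone CrawlSpider. For the distributed run the title refers to, the commented-out RedisCrawlSpider base class and redis_key are swapped in, start_urls is removed, and every worker then waits for URLs pushed to that Redis key. A minimal sketch of seeding the key, assuming Redis runs locally on the default port:

# seed_start_url.py - push the first list-page URL into the redis_key the spider listens on
# (assumes a local Redis on the default port; adjust host/port for a shared instance)
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
r.lpush('YouyuanSpider:start_urls',
        'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/')

Each worker started with "scrapy crawl youyuan" then pulls requests from the shared Redis queue instead of its own start_urls.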
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# import json
#
# class YouyuanwangPipeline(object):
#     def __init__(self):
#         self.filename = open("youyuanwang.json", "wb")
#     def process_item(self, item, spider):
#         jsontext = json.dumps(dict(item), ensure_ascii=False) + ",\n"
#         self.filename.write(jsontext.encode("utf-8"))
#         return item
#     def close_spider(self, spider):
#         self.filename.close()

import pymysql
from .models.es_types import YouyuanType


class XiciPipeline(object):
    def process_item(self, item, spider):
        # DBKWARGS = spider.settings.get('DBKWARGS')
        # a new connection is opened per item, which is fine for small crawls
        con = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', db='yunyuan', charset='utf8')
        cur = con.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item['header_url'], item['username'], item['monologue'], item['pic_urls'], item['place_from'],
               item['education'], item['age'], item['height'], item['salary'], item['hobby'], item['source'])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()
        return item


class ElasticsearchPipeline(object):
    def process_item(self, item, spider):
        youyuan = YouyuanType()
        youyuan.header_url = item["header_url"]
        youyuan.username = item["username"]
        youyuan.age = item["age"]
        youyuan.salary = item["salary"]
        youyuan.monologue = item["monologue"]
        youyuan.pic_urls = item["pic_urls"]
        youyuan.place_from = item["place_from"]

        youyuan.save()

        return item
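The ElasticsearchPipeline imports YouyuanType from models/es_types.py, which the post does not include. A minimal sketch of what that module might look like, assuming the elasticsearch_dsl library (5.x-style DocType), a local Elasticsearch instance, and an assumed index name:

# models/es_types.py - hypothetical reconstruction, not part of the original post
from elasticsearch_dsl import DocType, Text, Keyword
from elasticsearch_dsl.connections import connections

# assumed local Elasticsearch instance
connections.create_connection(hosts=["127.0.0.1"])


class YouyuanType(DocType):
    header_url = Keyword()
    username = Text()
    age = Keyword()
    salary = Keyword()
    monologue = Text()
    pic_urls = Keyword()
    place_from = Keyword()

    class Meta:
        index = "youyuan"  # assumed index name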
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for youyuanwang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'youyuanwang'

SPIDER_MODULES = ['youyuanwang.spiders']
NEWSPIDER_MODULE = 'youyuanwang.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'youyuanwang (+http://www.yourdomain.com)'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'youyuanwang.middlewares.YouyuanwangSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'youyuanwang.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'youyuanwang.pipelines.XiciPipeline': 300,
    'youyuanwang.pipelines.ElasticsearchPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
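Two things are worth noting about these settings for a distributed run. First, scrapy_redis defaults to a Redis on localhost:6379, so REDIS_HOST / REDIS_PORT (or REDIS_URL) should point every worker at the shared instance. Second, the scrapy_redis RedisPipeline is commented out above, yet the two scripts below read items from the youyuan:items list that this pipeline populates. A possible addition to settings.py, with the host value assumed:

# Optional additions (not in the original settings.py): point scrapy_redis at the
# shared Redis instance and enable RedisPipeline so serialized items are pushed
# to the "youyuan:items" list consumed by the scripts below.
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

ITEM_PIPELINES = {
    'youyuanwang.pipelines.ElasticsearchPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}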
To save items from Redis into MongoDB, create a file process_item_mongo.py in the project directory (any name will do):
# coding=utf-8

import pymongo
import redis
import json


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    Mongo_conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = Mongo_conn["youyuan"]
    table = db["beijing_18_25"]
    while True:
        # block until an item appears in the redis list, then store it in MongoDB
        source, data = Redis_conn.blpop(["youyuan:items"])
        data = json.loads(data.decode("utf-8"))
        table.insert(data)


if __name__ == "__main__":
    process_item()
To save items from Redis into MySQL, create a file process_item_mysql.py in the project directory (any name will do):
# coding=utf-8

import pymysql
import redis
import json


def process_item():
    Redis_conn = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    MySql_conn = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', port=3306, db='yunyuan')
    while True:
        # block until an item appears in the redis list, then insert it into MySQL
        source, data = Redis_conn.blpop("youyuan:items")
        data = json.loads(data.decode("utf-8"))
        cur = MySql_conn.cursor()
        sql = ("insert into youyuanwang(header_url,username,monologue,pic_urls,place_from,education,age,height,salary,hobby,source)"
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (data['header_url'], data['username'], data['monologue'], data['pic_urls'], data['place_from'],
               data['education'], data['age'], data['height'], data['salary'], data['hobby'], data['source'])
        cur.execute(sql, lis)
        MySql_conn.commit()
        cur.close()
    MySql_conn.close()


if __name__ == "__main__":
    process_item()
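Both the MySQL pipeline and this script insert into a youyuanwang table whose schema the post never shows. A one-off helper that would create a compatible table; all column types are assumptions derived from the fields being inserted:

# create_table.py - hypothetical helper, not part of the original post
import pymysql

con = pymysql.connect(host='127.0.0.1', user='root', passwd='229801', db='yunyuan', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS youyuanwang (
        id INT AUTO_INCREMENT PRIMARY KEY,
        header_url VARCHAR(255),
        username VARCHAR(100),
        monologue TEXT,
        pic_urls TEXT,
        place_from VARCHAR(100),
        education VARCHAR(100),
        age VARCHAR(20),
        height VARCHAR(20),
        salary VARCHAR(50),
        hobby VARCHAR(255),
        source VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()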
Data:
Disclaimer: the above is for learning and reference only!