首页 > 代码库 > scrapy 知乎关键字爬虫spider代码

scrapy 知乎关键字爬虫spider代码

以下是 spider 部分的代码。爬取知乎需要登录,建议直接使用 cookie 即可。如果预计需要爬取的数量不多,请不要使用过大的线程数量,否则账号很快会被封禁,需要等十几个小时才能恢复使用,得不偿失。
  1 # -*- coding: utf-8 -*-
  2 import scrapy
  3 from scrapy import Request
  4 from scrapy import log
  5 import logging
  6 #from zhihu.items import ZhihuItem
  7 from zhihu.items import ZhihuItem
  8 from scrapy_redis.spiders import RedisSpider
  9 import re
 10 import json
 11 import time
 12 
 13 class BaoxianSpider(RedisSpider):
 14 
 15     name = "baoxian"
 16     allowed_domains = ["zhihu.com"]
 17     #redis_key=‘baoxian:start_urls‘
 18     keywords=软件测试
 19     from urllib import quote
 20     urlencode_keywords=quote(keywords)
 21 
 22     start_urls = [https://www.zhihu.com/r/search?q=+urlencode_keywords+&type=content&offset=0] #‘https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0‘
 23     def start_requests(self):
 24         for url in self.start_urls:
 25             yield Request(url=url, callback=self.parse,dont_filter=True)
 26 
 27     def parse(self, response):
 28         body=response.body  #{"paging":{"next":"\/r\/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=50"},"htmls"
 29         #print body
 30 
 31         #获取问题链接
 32         question_href_reg=r<div class=\\"title\\"><a target=\\"_blank\\" href=http://"\\/question\\/(.*?)\\"
 33         all_question_href=http://www.mamicode.com/re.findall(question_href_reg,body)
 34         print all_question_href:,all_question_href
 35         for aqh in all_question_href:
 36             question_href=http://www.mamicode.com/https://www.zhihu.com/question/+str(aqh)
 37             yield Request(url=question_href, callback=self.parse_question,dont_filter=True)
 38             print question_href
 39 
 40             log.msg("question_href:%s \n list_question_page:%s"%(question_href,response.url), level=log.INFO)
 41             #self.log
 42         #获取下一页的链接
 43 
 44         reg=r{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"
 45         next_page=re.findall(reg,body)
 46         print 下一页问题:,next_page
 47         if len(next_page):
 48             #print next_page[0]   #https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=10
 49             next_page_url=https://www.zhihu.com+ next_page[0].replace(\\,‘‘)
 50             print next_page_url:,next_page_url
 51             yield Request(url=next_page_url, callback=self.parse,dont_filter=True)
 52             log.msg("next_page_url:%s"%next_page_url, level=log.INFO)
 53 
 54                                            #data-type=\"Answer\"><div class=\"title\"><a target=\"_blank\" href=http://www.mamicode.com/"\/question\/22316395\"
 55 
 56 
 57     def parse_question(self,response):                             ####问题详情页面
 58         #print response.body
 59 
 60         print response.url:,response.url
 61         title=response.xpath(//h1[@class="QuestionHeader-title"]/text()).extract_first()
 62         print time.strftime(%Y-%m-%d %H:%M:%S,time.localtime(time.time()))
 63         print title:,title
 64         #editableDetail&quot;:&quot;,国内的保险员说风险太大,不受法律保护什么的。大神推荐我赴港买保险吗?&quot;,&quot;visitCount&quot
 65         reg=editableDetail&quot;:&quot;([\s\S]*?)&quot;,&quot;visitCount&quot
 66         content_match=re.findall(reg,response.body)
 67         if  content_match:
 68             content=content_match[0]
 69         else:
 70             content=‘‘               #有可能问题无具体描述
 71         print content:,content
 72         question={}
 73         question[url]=response.url
 74         question[title]=title
 75 
 76         question[content]=content
 77         #https://www.zhihu.com/question/19904068
 78         question[comment]=[]
 79         #https://www.zhihu.com/api/v4/questions/20214716/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=3&offset=3
 80         answer_json=https://www.zhihu.com/api/v4/questions/+re.findall((\d+),response.url)[0]+/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0
 81         print answer_json:,answer_json
 82         yield Request(url=answer_json, callback=self.parse_json,meta=question,dont_filter=False)
 83         """
 84         item=ZhihuItem()
 85         item[‘title‘]=question[‘title‘]
 86         item[‘url‘]=question[‘url‘]
 87         item[‘content‘]=question[‘content‘]
 88         yield item
 89         print item
 90         """
 91 
 92     def parse_json(self,response):                           ####答案列表
 93         meta=response.meta
 94         dict=json.loads(response.body)
 95 
 96         #print ‘dict:‘,dict
 97         print dcit to json:,json.dumps(dict,ensure_ascii=False)
 98         comment_list=meta[comment]
 99         for data  in  dict[data]:                    # dict[‘data‘]是列表,每个元素是字典
100             try:
101                 comment_dict={}
102                 comment_dict[comment_content]=data[content]
103                 if data[author][name]:
104                     comment_dict[author]=data[author][name]
105                 else:
106                     comment_dict[author]=‘‘
107                 comment_dict[voteup_count]=data[voteup_count]
108                 comment_dict[comment_count]=data[comment_count]
109                 comment_dict[comment_time]=time.strftime(%Y-%m-%d,time.localtime(data[created_time]))
110                 comment_list.append(comment_dict)
111             except Exception,e:
112                 print e
113         meta[comment]=comment_list
114         meta[answer_num]=dict[paging][totals]
115 
116 
117 
118         if dict[paging][is_end]==False:
119             yield Request(url=dict[paging][next], callback=self.parse_json,meta=meta,dont_filter=False)
120         else:
121             #log.msg("last:%s"%next_page_url, level=log.INFO)
122             print last:,meta[title],meta[url] ,meta[content],meta[answer_num],len(meta[comment])#,meta[‘comment‘]
123             item=ZhihuItem()
124             item[title]=meta[title]
125             item[url]=meta[url]
126             item[content]=meta[content]
127             item[answer_num]=meta[answer_num]
128             item[comment]=meta[comment]
129             yield item

 

scrapy 知乎关键字爬虫spider代码