Scrapy spider code for a Zhihu keyword crawler
Below is the spider part of the code. Crawling Zhihu requires a logged-in session; reusing the cookie from a browser login is enough. If the amount of data you plan to crawl is modest, do not raise the concurrency too high, otherwise the account gets blocked quickly and you have to wait a dozen or more hours before it works again, which is not worth it.
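For reference, a minimal settings.py sketch for the cookie and throttling setup described above might look like the following; the cookie string, user agent, and the exact numbers are placeholders, not values from the original project:

# settings.py -- minimal sketch; cookie value, user agent and numbers are placeholders
CONCURRENT_REQUESTS = 2      # keep concurrency low so the account is not banned
DOWNLOAD_DELAY = 2           # wait a couple of seconds between requests
COOKIES_ENABLED = False      # send the raw Cookie header below instead of Scrapy's cookie middleware

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # paste the Cookie header copied from a logged-in browser session here
    'Cookie': 'z_c0=...; d_c0=...; _zap=...',
}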
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy import log          # old-style Scrapy logging used below
import logging
#from zhihu.items import ZhihuItem
from zhihu.items import ZhihuItem
from scrapy_redis.spiders import RedisSpider
import re
import json
import time


class BaoxianSpider(RedisSpider):

    name = "baoxian"
    allowed_domains = ["zhihu.com"]
    #redis_key = 'baoxian:start_urls'
    keywords = '软件测试'            # search keyword ("software testing")
    from urllib import quote       # Python 2 urllib
    urlencode_keywords = quote(keywords)

    start_urls = ['https://www.zhihu.com/r/search?q=' + urlencode_keywords + '&type=content&offset=0']
    # e.g. 'https://www.zhihu.com/r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=0'

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # search result JSON, e.g. {"paging":{"next":"\/r\/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=50"},"htmls"...
        body = response.body
        #print body

        # extract question links from the escaped HTML inside the search JSON, e.g.
        # data-type=\"Answer\"><div class=\"title\"><a target=\"_blank\" href=\"\/question\/22316395\"
        question_href_reg = r'<div class=\\"title\\"><a target=\\"_blank\\" href=\\"\\/question\\/(.*?)\\"'
        all_question_href = re.findall(question_href_reg, body)
        print 'all_question_href:', all_question_href
        for aqh in all_question_href:
            question_href = 'https://www.zhihu.com/question/' + str(aqh)
            yield Request(url=question_href, callback=self.parse_question, dont_filter=True)
            print question_href
            log.msg("question_href:%s \n list_question_page:%s" % (question_href, response.url), level=log.INFO)
            #self.log

        # extract the link to the next page of search results
        reg = r'{"paging":{"next":"(\\/r\\/search\?q=.*?&type=content&offset=.*?)"},"htmls"'
        next_page = re.findall(reg, body)
        print '下一页问题:', next_page
        if len(next_page):
            #print next_page[0]  # e.g. /r/search?q=%E4%BF%9D%E9%99%A9&type=content&offset=10
            next_page_url = 'https://www.zhihu.com' + next_page[0].replace('\\', '')
            print 'next_page_url:', next_page_url
            yield Request(url=next_page_url, callback=self.parse, dont_filter=True)
            log.msg("next_page_url:%s" % next_page_url, level=log.INFO)

    def parse_question(self, response):  # question detail page
        #print response.body
        print 'response.url:', response.url
        title = response.xpath('//h1[@class="QuestionHeader-title"]/text()').extract_first()
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print 'title:', title
        # the question description sits in the page source like:
        # editableDetail":",国内的保险员说风险太大,不受法律保护什么的。大神推荐我赴港买保险吗?","visitCount"
        reg = 'editableDetail":"([\s\S]*?)","visitCount"'
        content_match = re.findall(reg, response.body)
        if content_match:
            content = content_match[0]
        else:
            content = ''  # the question may have no detailed description
        print 'content:', content
        question = {}
        question['url'] = response.url
        question['title'] = title
        question['content'] = content
        # https://www.zhihu.com/question/19904068
        question['comment'] = []

        # v4 answers API, e.g.
        # https://www.zhihu.com/api/v4/questions/20214716/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=3&offset=3
        answer_json = ('https://www.zhihu.com/api/v4/questions/' + re.findall('(\d+)', response.url)[0] +
                       '/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0')
        print 'answer_json:', answer_json
        yield Request(url=answer_json, callback=self.parse_json, meta=question, dont_filter=False)
        """
        item = ZhihuItem()
        item['title'] = question['title']
        item['url'] = question['url']
        item['content'] = question['content']
        yield item
        print item
        """

    def parse_json(self, response):  # answer list (paged answers API)
        meta = response.meta
        dict = json.loads(response.body)
        #print 'dict:', dict
        print 'dict to json:', json.dumps(dict, ensure_ascii=False)
        comment_list = meta['comment']
        for data in dict['data']:  # dict['data'] is a list; each element is a dict for one answer
            try:
                comment_dict = {}
                comment_dict['comment_content'] = data['content']
                if data['author']['name']:
                    comment_dict['author'] = data['author']['name']
                else:
                    comment_dict['author'] = ''
                comment_dict['voteup_count'] = data['voteup_count']
                comment_dict['comment_count'] = data['comment_count']
                comment_dict['comment_time'] = time.strftime('%Y-%m-%d', time.localtime(data['created_time']))
                comment_list.append(comment_dict)
            except Exception, e:
                print e
        meta['comment'] = comment_list
        meta['answer_num'] = dict['paging']['totals']

        if dict['paging']['is_end'] == False:
            # more answers: follow the "next" link and keep accumulating answers in meta
            yield Request(url=dict['paging']['next'], callback=self.parse_json, meta=meta, dont_filter=False)
        else:
            #log.msg("last:%s" % next_page_url, level=log.INFO)
            print 'last:', meta['title'], meta['url'], meta['content'], meta['answer_num'], len(meta['comment'])  #, meta['comment']
            item = ZhihuItem()
            item['title'] = meta['title']
            item['url'] = meta['url']
            item['content'] = meta['content']
            item['answer_num'] = meta['answer_num']
            item['comment'] = meta['comment']
            yield item
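The spider imports ZhihuItem from zhihu.items, but the post does not include items.py. Judging from the fields the spider fills in, the item definition would look roughly like this (an inferred sketch, not the author's original file):

# items.py -- inferred from the fields used in the spider above, not the author's original
import scrapy

class ZhihuItem(scrapy.Item):
    url = scrapy.Field()         # question URL
    title = scrapy.Field()       # question title
    content = scrapy.Field()     # question description (editableDetail)
    answer_num = scrapy.Field()  # total number of answers (paging['totals'])
    comment = scrapy.Field()     # list of per-answer dicts collected in parse_json

With items.py in place the spider can be started with scrapy crawl baoxian using the hard-coded start_urls; if the commented-out redis_key line is enabled instead (the class inherits from RedisSpider) and the start_requests override is removed, the start URL has to be pushed into Redis first, e.g. redis-cli lpush baoxian:start_urls '<search url>'.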