
Scraping Baidu Tieba with requests + XPath + map

# Scraping Baidu Tieba with requests + XPath + map
# Target fields: the user name, body, and timestamp of each reply
# Plan:
#   - fetch each page with requests
#   - extract the fields with XPath
#   - parallelize the crawl with map over a thread pool
import requests
from requests.exceptions import RequestException
from lxml import etree
import json
from multiprocessing.dummy import Pool as ThreadPool


def get_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None


def parse_html(html):
    selector = etree.HTML(html)
    # Every reply lives in a div whose data-field attribute carries a JSON
    # blob with the author and content metadata.
    data = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    for each in data:
        rs = json.loads(each.xpath('@data-field')[0])
        author = rs.get('author').get('user_name')
        post_id = rs.get('content').get('post_id')
        content = each.xpath('div/div/cc/div[@id="post_content_%s"]/text()' % post_id)[0].strip()
        date = rs.get('content').get('date')
        yield {
            'author': author,
            'content': content,
            'date': date
        }


def save_to_txt(result):
    print('Saving:', result)
    with open('tieba.txt', 'a', encoding='utf-8') as f:
        f.write('Reply author: ' + result['author'] + '\n')
        f.write('Reply content: ' + result['content'] + '\n')
        f.write('Reply time: ' + result['date'] + '\n')
        f.write('\n')


def main(url):
    html = get_html(url)
    if html:
        for result in parse_html(html):
            save_to_txt(result)


if __name__ == '__main__':
    # multiprocessing.dummy clones the multiprocessing.Pool API on top of
    # threads, so this is a 4-thread pool, not 4 processes.
    pool = ThreadPool(4)
    urls = []
    base_url = 'http://tieba.baidu.com/p/3522395718?pn='
    for page_num in range(1, 21):
        urls.append(base_url + str(page_num))

    pool.map(main, urls)
    pool.close()
    pool.join()
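To see what parse_html is doing, here is a minimal, self-contained sketch of the extraction on a single reply node. The HTML fragment is hypothetical, modelled on the data-field JSON attribute the script relies on; the live Tieba markup may differ or have changed since this was written.

import json
from lxml import etree

# Hypothetical fragment shaped like one Tieba reply node (an assumption,
# not captured from the live site).
snippet = '''
<div class="l_post j_l_post l_post_bright  "
     data-field='{"author": {"user_name": "demo_user"},
                  "content": {"post_id": 123, "date": "2015-01-11 16:52"}}'>
  <div><div><cc><div id="post_content_123">hello tieba</div></cc></div></div>
</div>
'''

node = etree.HTML(snippet).xpath('//div[@class="l_post j_l_post l_post_bright  "]')[0]
field = json.loads(node.xpath('@data-field')[0])
post_id = field['content']['post_id']
text = node.xpath('div/div/cc/div[@id="post_content_%s"]/text()' % post_id)[0].strip()
print(field['author']['user_name'], field['content']['date'], text)
# -> demo_user 2015-01-11 16:52 hello tieba

The data-field attribute is why the script barely needs to walk the DOM: the author name, post id, and date all come from one json.loads call, and the post id is only needed to build the XPath for the reply body.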

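A note on the map step: multiprocessing.dummy exposes the multiprocessing.Pool interface on top of threads, so pool.map(main, urls) fans the 20 page URLs out over 4 threads rather than 4 processes. That also means save_to_txt can run concurrently, and appends to tieba.txt from different threads may interleave. A hedged guard, assuming an extra import threading, would serialize the writes:

import threading

write_lock = threading.Lock()  # assumed addition, not in the original script

def save_to_txt(result):
    # Hold the lock for the whole record so its lines stay together.
    with write_lock:
        with open('tieba.txt', 'a', encoding='utf-8') as f:
            f.write('Reply author: ' + result['author'] + '\n')
            f.write('Reply content: ' + result['content'] + '\n')
            f.write('Reply time: ' + result['date'] + '\n')
            f.write('\n')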
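Finally, get_html swallows every failure into None, which keeps main simple, but requests.get without a timeout can block a pool thread indefinitely. A slightly hardened variant is sketched below; the timeout value and User-Agent string are assumptions, not part of the original script.

import requests
from requests.exceptions import RequestException

def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # assumed; some pages reject non-browser clients
    try:
        # timeout bounds the request so a stalled connection
        # cannot hold one of the pool's threads forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None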