首页 > 代码库 > python 爬虫004-使用urllib2与正则表达式扒取糗事百科新鲜页首页帖子
Python 爬虫004-使用urllib2与正则表达式抓取糗事百科新鲜页首页帖子
面向过程的方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Procedural scraper for the first page of Qiushibaike's "new text posts".

Steps: download one page, extract (post id, post text) pairs with a regular
expression, and save each post to ``qiubai/<post id>.txt``.

Written for Python 2 (urllib2); the import fallback below also lets it run on
Python 3.
"""
import io
import os
import re
import sys

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same Request / urlopen / URLError
    # names that this script uses from urllib2.
    import urllib.request as urllib2

# Encoding of the saved files. sys.getfilesystemencoding() may return None on
# Python 2, so fall back to UTF-8 instead of failing later in encode().
OUTPUT_ENCODING = sys.getfilesystemencoding() or 'utf-8'

URL = 'http://www.qiushibaike.com/textnew/'
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')
OUTPUT_DIR = 'qiubai'

# Group 1: raw value of the id attribute (still quoted).
# Group 2: inner HTML of the post's <div class="content">.
POST_PATTERN = re.compile(
    '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
    re.S)


def fetch_html(url, user_agent=USER_AGENT):
    """Download *url* and return its body decoded from UTF-8 as text.

    Raises:
        urllib2.URLError: on network/HTTP failure (HTTPError is a subclass).
    """
    req = urllib2.Request(url=url, headers={'User-Agent': user_agent})
    res = urllib2.urlopen(req)
    try:
        return res.read().decode("UTF-8")
    finally:
        # Python 2 responses are not context managers; close explicitly.
        res.close()


def clean_content(raw):
    """Turn the inner HTML of a content div into plain text.

    Removes one wrapping ``<span>``/``</span>`` pair, drops literal newlines
    and converts ``<br/>`` into newlines.
    """
    text = raw.strip()
    # str.lstrip/rstrip take a *set of characters*, so lstrip('<span>') would
    # also eat leading 's', 'p', 'a', 'n' of the post itself. Remove the exact
    # prefix/suffix instead.
    if text.startswith('<span>'):
        text = text[len('<span>'):]
    if text.endswith('</span>'):
        text = text[:-len('</span>')]
    return text.replace('\n', '').replace('<br/>', '\n')


def parse_posts(html):
    """Return a list of ``(post_id, text)`` tuples found in *html*."""
    return [(raw_id.strip('\'"'), clean_content(raw_content))
            for raw_id, raw_content in POST_PATTERN.findall(html)]


def save_posts(posts, path=OUTPUT_DIR):
    """Write every post to ``<path>/<post_id>.txt`` (directory is created)."""
    if not os.path.isdir(path):
        os.makedirs(path)
    for post_id, text in posts:
        file_path = os.path.join(path, post_id + '.txt')
        # Text is encoded exactly once, here at the I/O boundary.
        with io.open(file_path, 'w', encoding=OUTPUT_ENCODING) as fp:
            fp.write(text)


def main():
    """Fetch the page, extract the posts and store them on disk."""
    # 1. Fetch the page source.
    try:
        html = fetch_html(URL)
    except urllib2.URLError as e:
        print(e)
        # Non-zero status so callers/shell scripts can detect the failure.
        sys.exit(1)
    # 2. Extract post id and post text.
    posts = parse_posts(html)
    # 3. Save the extracted posts to files.
    save_posts(posts, OUTPUT_DIR)


if __name__ == '__main__':
    main()
面向对象的方式
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Object-oriented scraper for Qiushibaike's "new text posts" pages.

``Spider.run()`` downloads pages 1 and 2, extracts (post id, post text) pairs
with a regular expression, and saves each post to ``qiubai/<post id>.txt``.

Written for Python 2 (urllib2); the import fallback below also lets it run on
Python 3.
"""
import io
import os
import re
import sys

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same Request / urlopen / URLError
    # names that this module uses from urllib2.
    import urllib.request as urllib2

# Encoding of the saved files. sys.getfilesystemencoding() may return None on
# Python 2, so fall back to UTF-8 instead of failing later in encode().
OUTPUT_ENCODING = sys.getfilesystemencoding() or 'utf-8'


def _as_text(value):
    """Return *value* as text, decoding byte strings with OUTPUT_ENCODING."""
    if isinstance(value, bytes):
        return value.decode(OUTPUT_ENCODING)
    return value


def _strip_wrapper(text, prefix, suffix):
    """Remove one exact leading *prefix* and one exact trailing *suffix*."""
    if text.startswith(prefix):
        text = text[len(prefix):]
    if text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


class Spider(object):
    """Downloads listing pages and stores every post in its own text file."""

    # Group 1: raw value of the id attribute (still quoted).
    # Group 2: inner HTML of the post's <div class="content">.
    _POST_PATTERN = re.compile(
        '<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
        re.S)

    def __init__(self):
        # '%s' is replaced by the page index in get_page().
        self.url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315'
        self.user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')

    def get_page(self, page_index):
        """Fetch the source of page *page_index* and return it as text.

        The body is decoded from UTF-8. On a network/HTTP error the error is
        printed and the process exits (SystemExit, status 1).
        """
        headers = {'User-Agent': self.user_agent}
        req = urllib2.Request(url=self.url % str(page_index), headers=headers)
        try:
            res = urllib2.urlopen(req)
            try:
                return res.read().decode("UTF-8")
            finally:
                # Python 2 responses are not context managers; close explicitly.
                res.close()
        except urllib2.URLError as e:
            # HTTPError is a subclass of URLError, so one clause covers both.
            print(e)
            sys.exit(1)

    def analysis(self, html):
        """Parse *html* and return a list of ``(raw_id, raw_content)`` tuples.

        ``raw_id`` is the id attribute exactly as written in the markup
        (quotes included); ``raw_content`` is the unprocessed inner HTML of
        the content div.
        """
        return self._POST_PATTERN.findall(html)

    def save(self, items, path):
        """Write each item from analysis() to ``<path>/<id>.txt``.

        *path* is created when missing. Items may hold text or byte strings;
        byte strings are decoded with OUTPUT_ENCODING first.
        """
        if not os.path.isdir(path):
            os.makedirs(path)
        for item in items:
            file_name = _as_text(item[0]).strip('\'"')
            content = _as_text(item[1]).strip()
            # str.lstrip/rstrip take a *set of characters* and would also eat
            # leading/trailing letters of the post; remove the exact tags.
            content = _strip_wrapper(content, '<span>', '</span>')
            content = content.replace('\n', '').replace('<br/>', '\n')
            file_path = os.path.join(path, file_name + '.txt')
            # Text is encoded exactly once, here at the I/O boundary.
            with io.open(file_path, 'w', encoding=OUTPUT_ENCODING) as fp:
                fp.write(content)

    def run(self):
        """Scrape pages 1 and 2 and store their posts under ``qiubai``."""
        print(u'开始抓取内容...')
        for i in range(1, 3):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, 'qiubai')
        print(u'内容抓取完毕...')


if __name__ == '__main__':
    sp = Spider()
    sp.run()
Python 爬虫004-使用urllib2与正则表达式抓取糗事百科新鲜页首页帖子
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉(投诉/举报),工作人员会在5个工作日内联系你;一经查实,本站将立刻删除涉嫌侵权内容。