
Python crawler 004: scraping posts from the front page of Qiushibaike's "fresh" (textnew) section with urllib2 and regular expressions

The procedural approach

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import sys
import re
import os

# Encoding of the local filesystem, used when re-encoding the page text
fs_encoding = sys.getfilesystemencoding()

if __name__ == '__main__':
    # 1. Request the listing page and read its source
    url = 'http://www.qiushibaike.com/textnew/'
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')
    headers = {'User-Agent': user_agent}
    try:
        req = urllib2.Request(url=url, headers=headers)
        res = urllib2.urlopen(req)
        html = res.read().decode('utf-8').encode(fs_encoding)
    except urllib2.HTTPError as e:   # must come first: HTTPError subclasses URLError
        print e
        sys.exit(1)
    except urllib2.URLError as e:
        print e
        sys.exit(1)

    # 2. Pull the post id and post body out of the page source
    regex_content = re.compile(
        r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
        re.S)
    items = re.findall(regex_content, html)
    for item in items:
        file_name = item[0].strip('\'"')              # id attribute with its quotes removed
        content = (item[1].strip()
                   .replace('<span>', '')             # drop the wrapping <span> tags
                   .replace('</span>', '')
                   .replace('\n', '')                 # collapse newlines in the source
                   .replace('<br/>', '\n'))           # turn <br/> into real line breaks
        # 3. Save each post to qiubai/<post id>.txt
        path = 'qiubai'
        if not os.path.exists(path):
            os.makedirs(path)
        file_path = path + '/' + file_name + '.txt'
        with open(file_path, 'w') as fp:              # with closes the file; no fp.close() needed
            fp.write(content)
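
To see what the two capture groups in the regex actually return, you can exercise it against a small hand-written fragment. This is a minimal sketch; the sample HTML below is a hypothetical simplification of the real page markup:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

# Hypothetical fragment shaped like the markup the regex targets
sample = '''
<div class="article block untagged mb15" id='qiushi_tag_123456'>
<div class="author">...</div>
<div class="content"><span>First line<br/>Second line</span></div>
</div>
'''

regex_content = re.compile(
    r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
    re.S)

for post_id, body in re.findall(regex_content, sample):
    print post_id   # 'qiushi_tag_123456' -- quotes included, hence the strip() above
    print body      # <span>First line<br/>Second line</span>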

The object-oriented approach

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import re
import os
import sys

# Encoding of the local filesystem, used when re-encoding the page text
fs_encoding = sys.getfilesystemencoding()


class Spider:
    def __init__(self):
        self.url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315'
        self.user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')

    # Fetch the source of one listing page
    def get_page(self, page_index):
        headers = {'User-Agent': self.user_agent}
        try:
            req = urllib2.Request(url=self.url % str(page_index), headers=headers)
            res = urllib2.urlopen(req)
            html = res.read().decode('utf-8').encode(fs_encoding)
            return html
        except urllib2.HTTPError as e:   # must come first: HTTPError subclasses URLError
            print e
            sys.exit(1)
        except urllib2.URLError as e:
            print e
            sys.exit(1)

    # Extract (post id, post body) pairs from the page source
    def analysis(self, html):
        regex_content = re.compile(
            r'<div class="article block untagged mb15" id=(.*?)>(?:.*?)<div class="content">(.*?)</div>',
            re.S)
        items = re.findall(regex_content, html)
        return items

    # Save the scraped posts, one file per post id
    def save(self, items, path):
        if not os.path.exists(path):
            os.makedirs(path)
        for item in items:
            file_name = item[0].strip('\'"')
            content = (item[1].strip()
                       .replace('<span>', '')
                       .replace('</span>', '')
                       .replace('\n', '')
                       .replace('<br/>', '\n'))
            file_path = path + '/' + file_name + '.txt'
            with open(file_path, 'w') as fp:
                fp.write(content)

    # Entry point: scrape the first two listing pages
    def run(self):
        print u'Start scraping...'
        for i in range(1, 3):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, 'qiubai')
        print u'Scraping finished.'


if __name__ == '__main__':
    sp = Spider()
    sp.run()
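
Both versions are Python 2 code (urllib2 and the bare print statement do not exist in Python 3). In Python 3 the same functionality lives in urllib.request and urllib.error; below is a minimal sketch of just the page-fetching step under that assumption, keeping the same URL and headers:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
import urllib.error

def get_page(page_index):
    url = 'http://www.qiushibaike.com/textnew/page/%s/?s=4979315' % page_index
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # read() returns bytes in Python 3; decode to get str
        return urllib.request.urlopen(req).read().decode('utf-8')
    except urllib.error.HTTPError as e:
        print(e)
    except urllib.error.URLError as e:
        print(e)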
