首页 > 代码库 > 运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中

运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中

原文地址:运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中



依赖包:

1.jieba

2.pymongo

3.HTMLParser(Python 2 标准库自带,无需单独安装)

# -*- coding: utf-8 -*-
"""
Crawl the cnblogs.com front page and store newly published posts in MongoDB.

@author: jiangfuqiang
"""

from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba

# Force the process-wide default encoding to UTF-8 (Python 2 only) so the
# scraped Chinese text can be mixed with str without explicit decode calls.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Module-level stop flag: set to True once a post no newer than the last
# stored id is seen, which ends the paging loop in __main__.
isExist = False

class FetchCnblog(HTMLParser):
    """HTMLParser subclass that extracts post entries from one cnblogs.com
    listing page: title, summary, "read more" link and the numeric post id.

    Only posts whose id is greater than `id` (the newest id stored by a
    previous run) are collected; an older post sets the module-level
    `isExist` flag so the caller stops paging.
    """

    def __init__(self, id):
        HTMLParser.__init__(self)
        self.result = []            # parsed post dicts, page order (newest first)
        self.data = {}              # post currently being assembled
        self.isTitleLink = False    # inside an <a class="titlelnk">
        self.id = id                # newest post id stored by a previous run
        self.isSummary = False      # inside the post summary paragraph
        self.isPostItem = False     # inside a post item container
        self.isArticleView = False  # inside the "article view" span

    def handle_starttag(self, tag, attrs):
        # NOTE(review): the handlers for <div>/<p>/<span> (which set
        # isPostItem / isSummary / isArticleView — read below but never set
        # in the surviving code) were lost when the article was scraped;
        # they are reconstructed from the cnblogs.com listing markup of the
        # time — verify against the live page before relying on them.
        global isExist
        if tag == 'div':
            for key, value in attrs:
                if key == 'class' and value == 'post_item':
                    self.isPostItem = True
        elif tag == 'p':
            for key, value in attrs:
                if key == 'class' and value == 'post_item_summary':
                    self.isSummary = True
        elif tag == 'span':
            for key, value in attrs:
                if key == 'class' and value == 'article_view':
                    self.isArticleView = True
        elif tag == 'a':
            for key, value in attrs:
                if key == 'class':
                    if value == 'titlelnk':
                        self.isTitleLink = True
                    elif value == 'gray' and self.isArticleView:
                        self.isArticleView = False
                        for key, value in attrs:
                            if key == 'href':
                                self.data['readmoreLink'] = value
                                # BUGFIX: original pattern was 'd+', which
                                # matches the letter "d", never the numeric
                                # post id embedded in the href.
                                result = re.search(r'\d+', value)
                                self.isPostItem = False
                                if result:
                                    self.data['id'] = int(result.group())
                                else:
                                    self.data = {}
                                    return
                                if self.data['id'] <= self.id:
                                    # Already stored on a previous run: tell
                                    # the main loop to stop paging.
                                    # BUGFIX: original assigned a useless
                                    # local here; `global isExist` (above)
                                    # makes the flag visible to __main__.
                                    self.data = {}
                                    isExist = True
                                    return
                                else:
                                    # NOTE(review): 'srouce' is a typo for
                                    # 'source', kept as-is because stored
                                    # documents already use this key.
                                    self.data['srouce'] = "www.cnblogs.com"
                                    self.data['source_key'] = 'cnblogs'
                                    self.data['fetchTime'] = str(date.today())
                                    self.data['keyword'] = ",".join(jieba.cut(self.data['title']))
                                    self.result.append(self.data)
                                    self.data = {}

    def handle_data(self, data):
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def getResult(self):
        """Return the list of post dicts parsed so far.

        NOTE(review): reconstructed — the original accessor was lost in the
        scraped article, but __main__ needs a way to read the parsed posts.
        """
        return self.result
    con = pymongo.Connection(‘localhost‘, 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    flag = False
    headers={
             ‘User-Agent‘:‘Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US。 rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6‘}
    reco = record.find_one({"type":‘cnblogs‘})
    id = 0
    if reco:
        id = reco[‘maxId‘]
    while isExist == False:
        try:
            req = urllib2.Request(url%count,headers=headers)
            request = urllib2.urlopen(req)
            data = http://www.mamicode.com/request.read()"type":‘cnblogs‘},{"$set":{‘maxId‘:id}},True,False)
                result.reverse()
                for doc in result:
                    fetchblog.insert(doc)
                print "page is %d"%count
                count += 1

                time.sleep(5)
        except Exception, e:
            traceback.print_exc()
            print "parse error",e

程序若在 Linux、Mac 下运行,可在 crontab -e 中设置定时任务;若在 Windows 下运行,则需自己在程序里加个定时器即可。


运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中