首页 > 代码库 > 运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中
运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中
原文地址:运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中
# Dependencies: 1. jieba  2. pymongo  3. HTMLParser (Python 2 standard library)
# -*- coding: utf-8 -*-
"""
@author: jiangfuqiang
"""
from HTMLParser import HTMLParser
import re
import time
from datetime import date
import pymongo
import urllib2
import sys
import traceback
import jieba

# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str/unicode conversions of the Chinese page content do not fail.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Module-level stop flag: becomes True once a post that is already stored
# (id <= last recorded max id) is seen, or once a page yields no new posts.
isExist = False


class FetchCnblog(HTMLParser):
    """Collect post entries from a cnblogs site-home listing page.

    Feed the page HTML with ``feed()``; every post whose numeric id is greater
    than ``id`` is appended to ``self.result`` as a dict with the keys
    ``title``, ``desc`` (when a summary was seen), ``readmoreLink``, ``id``,
    ``srouce`` (sic), ``source_key``, ``fetchTime`` and ``keyword``.
    """

    def __init__(self, id):
        """Create a parser that keeps only posts whose id is greater than *id*."""
        HTMLParser.__init__(self)
        self.result = []            # finished post dicts, in page order
        self.data = {}              # post currently being assembled
        self.isTitleLink = False    # inside <a class="titlelnk">
        self.id = id                # highest post id already stored
        self.isSummary = False      # inside the post summary element
        self.isPostItem = False     # inside a post entry
        self.isArticleView = False  # next <a class="gray"> is the "view" link

    def handle_data(self, data):
        """Capture the title text and the non-empty summary text of a post."""
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        """Track which part of a post entry the parser is in.

        The ``<a class="gray">`` link that follows the article-view marker
        closes the current post: its href supplies the link and numeric id.
        """
        attr_map = dict(attrs)
        css_class = attr_map.get('class')
        if tag == 'a':
            if css_class == 'titlelnk':
                self.isTitleLink = True
            elif css_class == 'gray' and self.isArticleView:
                self.isArticleView = False
                href = attr_map.get('href')
                if href is not None:
                    self._finish_post(href)
        # NOTE(review): the source text that set isPostItem / isSummary /
        # isArticleView to True was lost when this page was scraped. The three
        # branches below are a reconstruction based on the cnblogs home-page
        # markup (div.post_item, p.post_item_summary, span.article_view) --
        # confirm against the original article before relying on them.
        elif tag == 'div' and css_class == 'post_item':
            self.isPostItem = True
        elif tag == 'p' and css_class == 'post_item_summary':
            self.isSummary = True
        elif tag == 'span' and css_class == 'article_view':
            self.isArticleView = True

    def handle_endtag(self, tag):
        """Stop collecting summary text when the summary paragraph closes.

        NOTE(review): reconstructed -- this handler is not present in the
        surviving source text; confirm against the original article.
        """
        if tag == 'p':
            self.isSummary = False

    def _finish_post(self, href):
        """Finalize the post being assembled, using *href* as its link."""
        # The original assigned ``isExist`` without this declaration, which
        # only created a local and never signalled the main loop.
        global isExist
        self.data['readmoreLink'] = href
        self.isPostItem = False
        # The backslash of this pattern was stripped in the scraped text
        # ('d+'); a literal "d" match could never be converted by int().
        match = re.search(r'\d+', href)
        if not match:
            self.data = {}
            return
        self.data['id'] = int(match.group())
        if self.data['id'] <= self.id:
            # Already stored on a previous run: drop it and ask the caller
            # to stop paging.
            self.data = {}
            isExist = True
            return
        # 'srouce' (sic) is the field name written to MongoDB; it is kept
        # unchanged so existing documents and queries stay compatible.
        self.data['srouce'] = "www.cnblogs.com"
        self.data['source_key'] = 'cnblogs'
        self.data['fetchTime'] = str(date.today())
        # .get() avoids a KeyError when no title text was captured.
        self.data['keyword'] = ",".join(jieba.cut(self.data.get('title', '')))
        self.result.append(self.data)
        self.data = {}


def main():
    """Crawl listing pages until no new posts remain and store them in MongoDB."""
    global isExist
    # NOTE(review): pymongo.Connection, Collection.update and
    # Collection.insert are the legacy PyMongo 2.x API used by the original;
    # newer PyMongo releases replace them with MongoClient / update_one /
    # insert_one -- confirm the installed version.
    con = pymongo.Connection('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record
    url = "http://www.cnblogs.com/sitehome/p/%d"
    count = 1
    flag = False  # True once the new max id has been recorded for this run
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                      'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    }
    reco = record.find_one({"type": 'cnblogs'})
    last_id = 0
    if reco:
        last_id = reco['maxId']
    while not isExist:
        try:
            req = urllib2.Request(url % count, headers=headers)
            request = urllib2.urlopen(req)
            data = request.read()
            # NOTE(review): the statements between ``data = request.read()``
            # and the ``record.update(...)`` call were lost in the scraped
            # text; parser construction, the empty-page stop and the
            # once-per-run max-id update are reconstructed -- confirm.
            parser = FetchCnblog(last_id)
            parser.feed(data)
            result = parser.result
            if not result:
                isExist = True
                continue
            if not flag:
                flag = True
                # Pages list newest posts first, so the first entry of the
                # first page carries the highest id of this run.
                newest_id = result[0]['id']
                record.update({"type": 'cnblogs'},
                              {"$set": {'maxId': newest_id}}, True, False)
            # Insert oldest first so storage order follows publication order.
            result.reverse()
            for doc in result:
                fetchblog.insert(doc)
            print("page is %d" % count)
            count += 1
            time.sleep(5)
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % e)
            # Pause before retrying the same page so a persistent failure
            # does not hammer the site in a tight loop.
            time.sleep(5)


if __name__ == "__main__":
    main()

# If the program runs on Linux or macOS, schedule it as a periodic job with
# `crontab -e`; on Windows, add a timer inside the program yourself.
运用python抓取博客园首页的所有数据,而且定时持续抓取新公布的内容存入mongodb中
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。