nodejs crawler
I referred to write-ups from several other people and then put together my own crawler.
Modules used:
utils.js --- moment
module_url.js
var http = require("http"); // fetch the page data
var cheerio = require("cheerio"); // parse the page and extract content
var sanitize = require("validator"); // strip useless data such as whitespace
var fs = require('fs'); // file operations, used to save the result
app.js
var async = require("async"); // async helpers such as each and filter
var ts = require("timespans"); // measure the time spent
var sanitize = require("validator"); // strip useless data such as whitespace
Fetch the topic list of every page -- in parallel.
Fetch the detailed content of each topic from that list -- also in parallel, but the final output keeps the original page order.
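The order is preserved simply by storing each result in a slot keyed by its index instead of pushing results in completion order. Below is a minimal standalone sketch of that pattern (not the crawler code itself; fetchDetail is a hypothetical stand-in for the real HTTP request, and forEachOf is available in recent versions of async):

var async = require("async");

// hypothetical stand-in for an HTTP request: replies after a random delay
function fetchDetail(topic, done) {
    setTimeout(function() { done(null, "content of " + topic); }, Math.random() * 100);
}

var topics = ["a", "b", "c"];
var results = [];

// requests run in parallel, but each result is written to its own index,
// so the output order always matches the input order
async.forEachOf(topics, function(topic, index, callback) {
    fetchDetail(topic, function(err, content) {
        if (err) { return callback(err); }
        results[index] = content;
        callback();
    });
}, function(err) {
    console.log(results); // ["content of a", "content of b", "content of c"]
});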
utils.js was copied from elsewhere; it wraps console.log so every line of output is prefixed with the current time.
var moment = require('moment');

exports.inc = function(n, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(null, n + 1);
    }, timeout);
};

exports.fire = function(obj, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(null, obj);
    }, timeout);
};

exports.err = function(errMsg, callback, timeout) {
    timeout = timeout || 200;
    setTimeout(function() {
        callback(errMsg);
    }, timeout);
};

// utils
// console.log replacement: prefixes every line with the current time (ss.SSS)
exports.log = function(msg, obj) {
    process.stdout.write(moment().format('ss.SSS') + '> ');
    if (obj !== undefined) {
        process.stdout.write(msg);
        console.log(obj);
    } else {
        console.log(msg);
    }
};

// busy-wait for the given number of milliseconds
exports.wait = function(mils) {
    var now = new Date;
    while (new Date - now <= mils);
};
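For reference, the timestamped logger is used like this (a trivial usage sketch, assuming the file above is saved as utils.js):

var utils = require("./utils");

utils.log("crawler started");        // prints e.g. "12.345> crawler started"
utils.log("got items: ", [1, 2, 3]); // prints the message followed by the object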
Fetching the page data
// fetch the page data
var http = require("http");
// parse the page data and extract content
var cheerio = require("cheerio");
// strip useless data such as whitespace
var sanitize = require("validator");
// file operations, used to save the result
var fs = require('fs');

var scrapy = {};

scrapy.get = function(url, callback) {
    http.get(url, function(res) {
        var size = 0;
        var chunks = [];
        res.on('data', function(chunk) {
            size += chunk.length;
            chunks.push(chunk);
        });
        res.on('end', function() {
            var data = Buffer.concat(chunks, size);
            callback(null, data);
        });
    }).on('error', function(e) {
        callback(e, null);
    });
};

var getPage = function(pageUrl, callback) {
    scrapy.get(pageUrl, function(err, data) {
        if (err) {
            return callback(err);
        }
        var html = data.toString();
        $ = cheerio.load(html); // deliberately global: app.js reuses this $ for the list items
        // title links, pointing to the detail pages
        var news = $('.cell .topic_title_wrapper a');
        callback(null, news);
    });
};

var getDetail = function(detailUrl, callback) {
    scrapy.get(detailUrl, function(err, data) {
        if (err) {
            return callback(err);
        }
        var html = data.toString();
        $ = cheerio.load(html);
        var item = {};
        item.href = detailUrl;
        $('.header .topic_full_title .put_top').remove(); // drop the "置顶" (pinned) label
        item.title = sanitize.escape(sanitize.trim($('.header .topic_full_title').text()));
        item.content = sanitize.escape(sanitize.trim($('.inner.topic .topic_content').text()));
        callback(null, item);
    });
};

var save = function(fileName, data) {
    var result = JSON.stringify(data);
    fs.writeFileSync(fileName, result);
};

exports.getUrl = scrapy.get;
exports.getPage = getPage;
exports.getDetail = getDetail;
exports.save = save;
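A quick way to smoke-test this module by itself (a usage sketch assuming the file is saved as module_url.js and cnodejs.org is reachable; the output file name is made up):

var url = require("./module_url");

// fetch the first list page and print how many topic links were found
url.getPage("http://cnodejs.org/?page=1", function(err, news) {
    if (err) { return console.log("page error", err); }
    console.log("found " + news.length + " topic links");
    // follow the first link to its detail page
    var detailUrl = "http://cnodejs.org" + news[0].attribs["href"];
    url.getDetail(detailUrl, function(err, item) {
        if (err) { return console.log("detail error", err); }
        console.log(item.title);
        url.save("one_topic.json", item); // writes the item to disk as JSON
    });
});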
Main file
// custom console.log that prefixes the output time
var utils = require("./utils");
var log = utils.log;
// async helpers such as each and filter
var async = require("async");
// measure the time spent
var ts = require("timespans");
// strip useless data such as whitespace
var sanitize = require("validator");
var url = require("./module_url");

var baseUrl = 'http://cnodejs.org';
var pageUrl = baseUrl + '/?page=';
var isOnlyTitle = true;

var pages = [];
for (var i = 1; i < 4; i++) {
    pages.push(i);
}

ts.start();
var titles = {};

// the pages are fetched in parallel
async.forEach(pages, function(page, callback_each) {
    titles[page] = [];
    url.getPage(pageUrl + page, function(err, news) {
        if (err) {
            log("page error");
            return;
        }
        if (news.length === 0) {
            log("no data for the page:" + page);
            return;
        }
        // iterate over indexes so every result lands in its ordered slot
        var indexes = [];
        for (var j = 0; j < news.length; j++) {
            indexes.push(j);
        }
        async.filter(indexes, function(index, callback) {
            var detailUrl = baseUrl + news[index].attribs['href'];
            if (isOnlyTitle) {
                var curNew = news[index];
                var item = {};
                item.href = detailUrl;
                // $ is the cheerio instance left on the global scope by module_url.getPage
                $(curNew).find(".put_top").remove(); // drop the "置顶" (pinned) label
                item.title = sanitize.escape(sanitize.trim($(curNew).text()));
                titles[page][index] = item;
                callback(true);
            } else {
                url.getDetail(detailUrl, function(err, item) {
                    if (err) {
                        log("detail error");
                        return;
                    }
                    titles[page][index] = item;
                    //titles[page].push(item);
                    callback(true);
                });
            }
        }, function(result) {
            //log("filter news:", result);
            callback_each(null);
        });
    });
}, function(err) {
    ts.stop();
    //ts.pause(); --- ts.continue();
    console.log('total: %s pause: %s used: %s', ts.elapsedtime(), ts.pausetime(), ts.usedtime());
    log(titles);
    //url.save("cnodejs.json", titles);
});
Also: I would like to make it crawl only the topics from a given time range; still working on that...
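One possible direction is to filter items by their creation time with moment (only a rough sketch of the idea, assuming every item carries a create_at date string, which getDetail would still have to extract from the topic page):

var moment = require("moment");

// keep only items whose create_at falls inside [from, to]
function inRange(item, from, to) {
    var t = moment(item.create_at);
    return t.isValid() && !t.isBefore(from) && !t.isAfter(to);
}

// usage sketch with made-up data
var items = [
    { title: "old topic", create_at: "2014-01-01" },
    { title: "new topic", create_at: "2014-05-10" }
];
var from = moment("2014-05-01");
var to = moment("2014-05-31");
var picked = items.filter(function(item) { return inRange(item, from, to); });
console.log(picked); // only "new topic" survives the filter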