首页 > 代码库 > nodejs写的一个网页爬虫例子(坏链率)

nodejs写的一个网页爬虫例子(坏链率)

    因为工作需要,用nodejs写了个简单的爬虫例子,之前也没用过nodejs,连搭环境加写大概用了5天左右,so。。。要多简陋有多简陋,放这里给以后的自己看~~

    整体需求是:给一个有效的URL地址,返回该网页上所有无效链接的百分比(坏链率)

    第一个文件:计算坏链率 urlSpider.js

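  /*================================================
  @author MissUU
  Link-extraction approach:

  1. Fetch the page content
  2. Use a regex to collect every <a> tag
  3. Pull out each href value; strip a leading quote if present, and prefix the
     domain when the value does not start with http (values starting with
     javascript are left alone)
  4. Validate the result against a common-URL regex
  ================================================*/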
 var http = require('http');
 var https = require('https');
 var async = require('async');
 var dbHandle = require('./dbHandle.js');

 //Main crawl step for a single site
 /**
  * Download one page and hand its HTML to matchURL, which counts the bad links.
  *
  * @param {Object}   obj       site row; `obj.url` is fetched and `obj.id` is copied onto the result record
  * @param {Function} callback  called with no arguments, once, when this site is finished
  *                             (after matchURL completes, or after the request fails or times out)
  */
 var runUrlSpider = function (obj, callback) {
     var urlBadLink = new UrlBadLink();
     var html = '';
     var settled = false;
     var request_timer = null;
     var req = null;

     // Claims the single terminal outcome for this request; returns false when
     // another path (end / error / timeout) has already claimed it.
     var settle = function () {
         if (settled) {
             return false;
         }
         settled = true;
         clearTimeout(request_timer);
         return true;
     };

     var fail = function (message) {
         if (settle()) {
             console.log('problem with request: ' + message);
             callback();
         }
     };

     try {
         // http.get throws synchronously on a malformed URL or an unsupported
         // protocol, so pick the matching client and guard the call.
         var client = /^https:/i.test(obj.url) ? https : http;

         req = client.get(obj.url, function (res) {
             res.setEncoding('utf8');
             res.on('data', function (chunk) {
                 html += chunk;
             }).on('end', function () {
                 if (!settle()) {
                     return;
                 }
                 console.log('*******开始提取有效链接地址******');
                 console.log(new Date().toLocaleString());
                 console.log(obj.url);
                 urlBadLink.host = obj.url;
                 urlBadLink.id = obj.id;
                 matchURL(html, urlBadLink, function () {
                     callback();
                 });
             }).on('error', function (e) {
                 fail(e.message);
             });
         });
     } catch (e) {
         fail(e.message);
         return;
     }

     req.on('error', function (e) {
         fail(e.message);
     });

     //10s timeout covering the whole download, so a stalled body cannot hang the run
     request_timer = setTimeout(function () {
         console.log('Request Timeout.');
         req.abort();
         fail('timeout');
     }, 10000);
 };

 //this is the entrance of code
 /**
  * Entry point: asks dbHandle.showUrls for the sites of group 1 and runs
  * runUrlSpider on them one after another.
  */
 var main = function () {
     dbHandle.showUrls(1, function (result) {
         // showUrls passes '' instead of an array when it has no rows.
         var sites = Array.isArray(result) ? result : [];

         async.eachSeries(sites, runUrlSpider, function (err) {
             if (err) {
                 console.log(err);
             }
             console.log('******this is the end, haha*******');
         });
     });
 };

 main();

 /*
 * Requests every link found in a page, one at a time, and stores the page's
 * bad-link statistics through dbHandle.updateBadLink.
 *
 * @param {string}     content     raw HTML of the page
 * @param {UrlBadLink} urlBadLink  record for the page; `host` is read as the base URL,
 *                                 `total` and `badCounts` are filled in here
 * @param {Function}   callend     called with no arguments after the record has been
 *                                 handed to dbHandle.updateBadLink
 */
 function matchURL(content, urlBadLink, callend) {
    var host = urlBadLink.host;
    var anchor = /<a\s[^>]*>/g;
    var matches = content.match(anchor);
    var badLink = 0;
    var flag = 0;

    // Requests one URL and counts it as bad when the request fails, times out,
    // or answers with a status outside 200-399.
    var HttpGet = function (url, callback) {
        var settled = false;
        var request_timer = null;
        var req = null;

        // Records the single outcome of this link; later events are ignored so
        // the series callback can never fire twice.
        var finish = function (isBad) {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(request_timer);
            if (isBad) {
                badLink++;
            }
            callback();
        };

        var fail = function (message) {
            if (settled) {
                return;
            }
            console.log(++flag + ": " + 'problem with request: ' + message);
            finish(true);
        };

        try {
            // http.get throws synchronously on a malformed URL or an
            // unsupported protocol (e.g. ftp://); treat that as a failed link.
            var client = /^https:/i.test(url) ? https : http;

            req = client.get(url, function (res) {
                res.on('data', function () {
                    // Body is discarded; the listener only drains the stream so 'end' fires.
                }).on('end', function () {
                    if (settled) {
                        return;
                    }
                    console.log(++flag + ": " + url + ' response status: ' + res.statusCode);

                    var isBad = !(res.statusCode >= 200 && res.statusCode < 400);
                    if (isBad) {
                        console.log('-----------------------');
                    }
                    finish(isBad);
                }).on('error', function (err) {
                    fail(err.message);
                });
            });
        } catch (e) {
            fail(e.message);
            return;
        }

        req.on('error', function (err) {
            fail(err.message);
        });

        //10s timeout covering the whole exchange
        request_timer = setTimeout(function () {
            console.log('Request Timeout.');
            req.abort();
            fail('timeout');
        }, 10000);
    };

    var urls = filterUrl(matches, host);

    // null means the page had no <a> tags; an empty array means none of them
    // produced a usable URL. Either way there is nothing to request.
    if (urls === null || urls.length === 0) {
        console.log('no links found');
        // Non-zero placeholder total so getRate() divides by a non-zero value
        // and reports a 0% rate.
        urlBadLink.total = 10;
        urlBadLink.badCounts = 0;
        dbHandle.updateBadLink(urlBadLink);
        callend();
        return;
    }

    async.eachSeries(urls, HttpGet, function (err) {
        urlBadLink.total = urls.length;
        urlBadLink.badCounts = badLink;
        //data store puts here
        dbHandle.updateBadLink(urlBadLink);
        callend();
    });
 }

//Extract the href value of an <a ...> tag and make it absolute
/**
 * @param {string} strUrl  text of one opening <a ...> tag
 * @param {string} host    URL of the page the tag came from; used as the base for relative links
 * @returns {string|null}  the link target, or null when the tag has no href attribute.
 *                         Values that already carry a scheme (http:, javascript:, mailto:, ...)
 *                         are returned unchanged; everything else is resolved against `host`.
 */
function URLFommat(strUrl, host) {
   // Accepts double-quoted, single-quoted and unquoted attribute values.
   var hrefPattern = /(?:^|\s)href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
   var schemePattern = /^[a-z][a-z0-9+.\-]*:/i;
   var found = hrefPattern.exec(strUrl);

   if (found === null) {
      return null;
   }

   var url = (found[1] || found[2] || found[3] || '').trim();

   if (schemePattern.test(url)) {
      return url;
   }

   // Proper resolution handles "/root", "//other-host/x" and "relative" links
   // against a page URL that itself has a path.
   if (typeof URL === 'function') {
      try {
         return new URL(url, host).href;
      } catch (e) {
         // `host` is not an absolute URL; fall back to plain joining below.
      }
   }

   // Fallback: join with exactly one slash between host and path.
   return String(host).replace(/\/+$/, '') + '/' + url.replace(/^\/+/, '');
}

//Variant of URLFommat for callers that hold a bare host name
/**
 * Extracts the href value of an <a ...> tag and turns it into an http:// URL
 * on the given host.
 *
 * @param {string} strUrl  text of one opening <a ...> tag
 * @param {string} host    host name without scheme, e.g. "www.sina.com.cn"
 * @returns {string|null}  the link target, or null when the tag has no href attribute.
 *                         Values that already carry a scheme are returned unchanged.
 */
function URLFommat1(strUrl, host) {
   // Accepts double-quoted, single-quoted and unquoted attribute values.
   var hrefPattern = /(?:^|\s)href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
   var schemePattern = /^[a-z][a-z0-9+.\-]*:/i;
   var found = hrefPattern.exec(strUrl);

   if (found === null) {
      return null;
   }

   var url = (found[1] || found[2] || found[3] || '').trim();

   if (schemePattern.test(url)) {
      return url;
   }

   // Protocol-relative link: it names its own host, only the scheme is missing.
   if (url.slice(0, 2) === '//') {
      return 'http:' + url;
   }

   var origin = 'http://' + String(host).replace(/\/+$/, '');

   if (url.charAt(0) === '/') {
      return origin + url;
   }

   return origin + '/' + url;
}
//test URLFommat
//var test = "http://baidu.com";
//var test1 = " \"http://baidu.com";
//var test2 = "/wenhao";
//console.log(URLFommat(test,"www.sina.com.cn"));
//console.log(URLFommat(test1,"www.sina.com.cn"));
//console.log(URLFommat(test2,"www.sina.com.cn"));


//Test whether a string has a common URL shape
/**
 * @param {string|null|undefined} strUrl  candidate URL
 * @returns {boolean} true when `strUrl` matches the pattern below: an optional
 *                    scheme, a dotted host ending in a listed TLD, any two-letter
 *                    TLD or a number, and an optional path
 */
function IsURL(strUrl) {
   if (strUrl == null) {
      return false;
   }

   // NOTE(review): the scheme group is `http?`, which matches "htt" or "http"
   // but not "https", so https:// URLs are rejected. The pattern is kept as-is
   // on purpose: only widen it to `https?` together with fetch code that can
   // request https URLs.
   var regular = /^\b(((http?|ftp):\/\/)?[-a-z0-9]+(\.[-a-z0-9]+)*\.(?:com|edu|gov|int|mil|net|org|biz|info|name|museum|asia|coop|aero|[a-z][a-z]|((25[0-5])|(2[0-4]\d)|(1\d\d)|([1-9]\d)|\d))\b(\/[-a-z0-9_:\@&?=+,.!\/~%\$]*)?)$/i;

   return regular.test(strUrl);
}


//Value object
/**
 * Bad-link statistics for one page.
 *
 * @param {*}      id         identifier of the site row
 * @param {string} host       URL of the page that was scanned
 * @param {number} total      number of links checked
 * @param {number} badCounts  number of those links counted as bad
 */
function UrlBadLink(id, host, total, badCounts) {
    this.id = id;
    this.host = host;
    this.total = total;
    this.badCounts = badCounts;
}

/**
 * @returns {string} badCounts / total as a percentage with two decimals,
 *                   e.g. "25.00%"; "0.00%" when the ratio is not a finite
 *                   number (total of 0 or missing fields)
 */
UrlBadLink.prototype.getRate = function () {
    var ratio = this.badCounts / this.total;

    // 0/0 or undefined fields give NaN; report 0 instead of "NaN%".
    if (!isFinite(ratio)) {
        ratio = 0;
    }

    return (Math.round(ratio * 10000) / 100).toFixed(2) + '%';
};

/**
 * Turns a list of <a ...> tags into the list of URLs worth requesting.
 *
 * @param {string[]|null} arr   opening <a ...> tags, or null when the page had none
 * @param {string}        host  base URL passed through to URLFommat
 * @returns {string[]|null} null when `arr` is null/undefined; otherwise every
 *                          URLFommat result that IsURL accepts, in input order
 */
function filterUrl(arr, host) {
     if (arr == null) {
        return null;
     }

     var output = [];

     arr.forEach(function (item) {
        var formatURL = URLFommat(item, host);

        if (IsURL(formatURL)) {
           output.push(formatURL);
        }
     });

     return output;
}

      第二个文件:将数据存库,dbHandle.js

/** 
 * @author MissUU
 * @des MySql基本操作 
 * API:     https://github.com/felixge/node-mysql 
 */  
  
var mysql = require('mysql');

// Connection pool shared by the exported functions of this module.
// A separate mysql.createConnection(...) call used to sit here; its return
// value was discarded, so nothing could use that connection, and it has been
// dropped along with the extra credentials it embedded.
// NOTE(review): host and credentials are hard-coded; consider reading them
// from configuration or the environment.
var pool  = mysql.createPool({
  host     : '10.102.1.00',
  user     : 'root',
  password : 'root',
  database : 'test',
  connectionLimit: 15
});


//读取urls
exports.showUrls = function (groupId, callback){
  
  console.log(‘this is showUrl()‘);
  pool.getConnection(function(err, conn){

      if (err) {
        console.log("connection error!");
        console.log(err);
      }

      conn.query(‘SELECT id,realurl as url FROM t_site WHERE siteGroupId = ?‘,groupId, function(err, result){
          if(err){
             console.log(err.message);
          }

          conn.release();
          if(result.length){
            // console.log(result instanceof Array);          
              callback(result);
              return true;           
          }else{
             callback(‘‘);
             return false;
          }
      });
  });
}; 

exports.updateBadLink = function (urlBadLink){
  //若不含数据则不插入
  if (!!urlBadLink) {

     pool.getConnection(function(err, conn){
       
     if (err) {
        console.log("connection error!");
        console.log(err);
      }
      
      var updateSql = "UPDATE a_qualityinfo SET brokenRate = ‘"+ urlBadLink.getRate() +"‘ WHERE siteId = " + urlBadLink.id;
     
      console.log(updateSql);

      conn.query(updateSql, function(err, result){
          if(err){
             console.log(err.message);
             console.log(‘update fail‘);
          }

          conn.release();
          console.log(‘update success‘);
      });// conn.query
     });//pool.getConnection
  } 
};

     代码后期还会改动,这里有几点需要注意的:

     1、http.get有时会一直等待响应,所以一定要判断下,超时则认为出错,要不程序就卡住了。。。= =!

     2、注意callback的使用,要不然很难规范执行顺序的,用过nodejs的都懂得。。。