首页 > 代码库 > 爬虫:爬取海词的翻译内容

爬虫:爬取海词的翻译内容

在爬取海词(dict.cn)的时候遇到了一个问题:翻译结果是异步加载的,请求里需要带上一个 t 值,而这个 t 值是页面上的 js 计算出来后直接拼接进 url 的,我们无法从网页源码中直接得到。

 当在搜索框输入单词的时候:你在干嘛

技术分享

替换下图中的page的值就能达到翻页的目的:

技术分享

那么当前的目的就是要能够找到这段js代码,同时获取对应输入的t的值,来重新组合url

真正的url只需要如下内容:

我提前把关键字和t都处理了,写成了字典的形式,

key:你在干嘛  ff[key]:WuqarCRs

{"你在干嘛": "WuqarCRs"}  # 提前处理成了这种格式,方便提取

url = "http://fuzz.dict.cn/dict/api.php?&action=fuzz&from=jsonp&q=" + key + "&t=" + ff[key] + "&page="

 

那么关键部分来了,我是如何获取T的呢。

大概思路是:在本地搭建一个服务器来运行这段js代码,然后把每个词依次发送给它,再把返回的结果保存起来。

处理过程的代码:

技术分享

 

第一步:先找到那段js代码,里面是如何把输入的文字转换成8位字符串的算法

技术分享

第二步:先安装node.js,然后把这段js代码提取出来,改写成可以在node.js里运行的服务器代码;如果不改写,在浏览器里面直接访问时是无法触发这段js执行的。

下面是node.js的代码,先执行node.js代码

技术分享
'use strict';

/**
 * Local helper service for the dict.cn crawler.
 *
 * POST a word as the raw request body; the response is the text
 * `{ <token>: '' }`, where <token> is the 8-character `t` value that
 * dict.cn's page script derives from the word. An empty body yields `{}`.
 *
 * Differences from the first draft of this script:
 *  - the request body is buffered and hashed once, so a body that arrives
 *    in several chunks (or a multi-byte character split across chunks)
 *    still produces a single correct token;
 *  - the token is no longer passed through `querystring.parse`, which
 *    rewrote every `+` in the token as a space;
 *  - debug writes to stdout and unreachable statements were removed.
 */

const http = require('http');
const util = require('util');

/** TCP port the helper listens on. */
const PORT = 8888;

/** Character codes of "PaSs", appended to the encoded text before hashing. */
const SALT_CODES = [80, 97, 83, 115];

/** Per-step additive constants, in the order the 64 steps apply them. */
const MD5_CONSTANTS = [
  3614090360, 3905402710, 606105819, 3250441966,
  4118548399, 1200080426, 2821735955, 4249261313,
  1770035416, 2336552879, 4294925233, 2304563134,
  1804603682, 4254626195, 2792965006, 1236535329,
  4129170786, 3225465664, 643717713, 3921069994,
  3593408605, 38016083, 3634488961, 3889429448,
  568446438, 3275163606, 4107603335, 1163531501,
  2850285829, 4243563512, 1735328473, 2368359562,
  4294588738, 2272392833, 1839030562, 4259657740,
  2763975236, 1272893353, 4139469664, 3200236656,
  681279174, 3936430074, 3572445317, 76029189,
  3654602809, 3873151461, 530742520, 3299628645,
  4096336452, 1126891415, 2878612391, 4237533241,
  1700485571, 2399980690, 4293915773, 2240044497,
  1873313359, 4264355552, 2734768916, 1309151649,
  4149444226, 3174756917, 718787259, 3951481745,
];

/** Left-rotation amounts; one row per round, cycled every four steps. */
const MD5_SHIFTS = [
  [7, 12, 17, 22],
  [5, 9, 14, 20],
  [4, 11, 16, 23],
  [6, 10, 15, 21],
];

/**
 * Adds two values modulo 2^32 and returns the sum as a signed 32-bit integer.
 * `| 0` maps `undefined` to 0, matching how the original masked its operands.
 */
function add32(x, y) {
  return ((x | 0) + (y | 0)) | 0;
}

/** Rotates a 32-bit value left by `bits` positions. */
function rotateLeft(value, bits) {
  return (value << bits) | (value >>> (32 - bits));
}

/**
 * Turns the text into the sequence of codes that gets hashed:
 * CRLF is normalised to LF, every UTF-16 code unit is expanded to 1-3
 * UTF-8 style bytes, the "PaSs" salt is appended, and finally the
 * characters of `global.dict_pagetoken` when that global is set.
 *
 * Surrogate halves are encoded individually (three bytes each), exactly as
 * the original routine did, so results stay identical to it.
 *
 * @param {string} text
 * @returns {number[]}
 */
function toHashInput(text) {
  const normalized = String(text).replace(/\r\n/g, '\n');
  const codes = [];
  for (let i = 0; i < normalized.length; i++) {
    const unit = normalized.charCodeAt(i);
    if (unit < 128) {
      codes.push(unit);
    } else if (unit < 2048) {
      codes.push((unit >> 6) | 192, (unit & 63) | 128);
    } else {
      codes.push((unit >> 12) | 224, ((unit >> 6) & 63) | 128, (unit & 63) | 128);
    }
  }
  codes.push(...SALT_CODES);
  if (global.dict_pagetoken) {
    const pageToken = String(global.dict_pagetoken);
    for (let i = 0; i < pageToken.length; i++) {
      // Appended as raw character codes, the same way the original
      // concatenated the token onto the already-encoded string.
      codes.push(pageToken.charCodeAt(i));
    }
  }
  return codes;
}

/**
 * Packs the codes into little-endian 32-bit words and applies the padding:
 * a 0x80 marker after the data, zero fill up to a multiple of 16 words, and
 * the bit length in the last two words.
 *
 * @param {number[]} codes
 * @returns {number[]}
 */
function packWords(codes) {
  const length = codes.length;
  const padded = length + 8;
  const wordCount = ((padded - (padded % 64)) / 64 + 1) * 16;
  const words = Array.from({ length: wordCount }, () => 0);
  for (let i = 0; i < length; i++) {
    words[i >> 2] |= codes[i] << ((i % 4) * 8);
  }
  words[length >> 2] |= 128 << ((length % 4) * 8);
  words[wordCount - 2] = length << 3;
  words[wordCount - 1] = length >>> 29;
  return words;
}

/**
 * Runs the four-round, 64-step compression over every 16-word block.
 * This is the table-driven form of the unrolled I/s/w/v call sequence of
 * the original; constants, shifts and word order follow MD5 (RFC 1321).
 *
 * @param {number[]} words
 * @returns {number[]} the four 32-bit state words
 */
function md5State(words) {
  let a = 1732584193;
  let b = 4023233417;
  let c = 2562383102;
  let d = 271733878;
  for (let offset = 0; offset < words.length; offset += 16) {
    const startA = a;
    const startB = b;
    const startC = c;
    const startD = d;
    for (let step = 0; step < 64; step++) {
      const round = step >> 4;
      let mix;
      let index;
      if (round === 0) {
        mix = (b & c) | (~b & d);
        index = step;
      } else if (round === 1) {
        mix = (b & d) | (c & ~d);
        index = (5 * step + 1) % 16;
      } else if (round === 2) {
        mix = b ^ c ^ d;
        index = (3 * step + 5) % 16;
      } else {
        mix = c ^ (b | ~d);
        index = (7 * step) % 16;
      }
      const sum = add32(a, add32(add32(mix, words[offset + index]), MD5_CONSTANTS[step]));
      const next = add32(rotateLeft(sum, MD5_SHIFTS[round][step % 4]), b);
      // Rotate the roles of the four state words for the next step.
      a = d;
      d = c;
      c = b;
      b = next;
    }
    a = add32(a, startA);
    b = add32(b, startB);
    c = add32(c, startC);
    d = add32(d, startD);
  }
  return [a, b, c, d];
}

/** Hex of the two lowest bytes of a state word, lowest byte first. */
function leadingHex(word) {
  const first = (word & 255).toString(16).padStart(2, '0');
  const second = ((word >>> 8) & 255).toString(16).padStart(2, '0');
  return first + second;
}

/** Maps a value 0-63 to its character in the token alphabet. */
function encodeDigit(digit) {
  if (digit === 0) {
    return '+';
  }
  if (digit === 1) {
    return '-';
  }
  if (digit < 12) {
    return String.fromCharCode(digit + 46);
  }
  if (digit < 38) {
    return String.fromCharCode(digit + 54);
  }
  return String.fromCharCode(digit + 59);
}

/**
 * Encodes a 12-bit value as two characters, least-significant digit first.
 * Values that need fewer than two digits are left-padded with '+'.
 */
function encodePair(value) {
  let encoded = '++';
  let rest = value;
  while (rest > 0) {
    const digit = rest % 64;
    encoded += encodeDigit(digit);
    rest = (rest - digit) / 64;
  }
  return encoded.slice(-2);
}

/**
 * Computes the 8-character `t` token for a search term.
 *
 * The first two bytes of each of the first three state words give 12 hex
 * digits; these are read as four 12-bit numbers and each is written as a
 * two-character pair.
 *
 * @param {string} text - search term exactly as typed
 * @returns {string} 8-character token
 */
function dictCrypto(text) {
  const [a, b, c] = md5State(packWords(toHashInput(text)));
  const hex = leadingHex(a) + leadingHex(b) + leadingHex(c);
  let token = '';
  for (let pos = 0; pos < hex.length; pos += 3) {
    token += encodePair(Number.parseInt(hex.slice(pos, pos + 3), 16));
  }
  return token;
}

/**
 * Request handler: buffers the whole body (for example the UTF-8 bytes of
 * the word 你好), then answers with `util.inspect` of `{ [token]: '' }`.
 * That text shape is what the crawler's regular expression extracts.
 */
function handleRequest(req, res) {
  const chunks = [];
  req.on('data', (chunk) => {
    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
  });
  req.on('end', () => {
    const word = Buffer.concat(chunks).toString('utf8');
    // An empty body produced `{}` before (no data event fired); keep that.
    const payload = word === '' ? {} : { [dictCrypto(word)]: '' };
    res.setHeader('Content-Type', 'text/plain; charset=utf-8');
    res.end(util.inspect(payload));
  });
}

const server = http.createServer(handleRequest);
server.listen(PORT);
console.log(`Server running at http://127.0.0.1:${PORT}/`);
View Code

 

第三步:正常的python代码,去访问本地的服务器,直接把转换完的数据存储到本地

#!/usr/bin/env python
# coding: utf-8
"""Collect the dict.cn ``t`` token for every word of a dictionary file.

Each line of ``data_1.dict`` is posted as the raw request body to the local
Node.js helper at http://127.0.0.1:8888/. The helper answers with text of
the form ``{ <token>: '' }``; the token is extracted with a regular
expression and stored as ``{word: token}``. Batches are appended as JSON to
``answer_<n>.txt`` files in the output directory.

Fixes relative to the first draft:
  * the ``data=`` argument of the POST is the word itself (it had been
    overwritten by an unrelated URL);
  * a batch is rotated right after it fills up, so a word without a token
    at the threshold no longer triggers repeated rotations;
  * files are closed by ``with`` blocks, and a missing token is an explicit
    ``None`` check instead of a bare ``except``;
  * the unused ``requests.session()`` objects and the
    ``sys.setdefaultencoding`` hack were removed; the dictionary is decoded
    as UTF-8 on read and the word is encoded as UTF-8 for the request.
"""
import io
import json
import re

import requests

# Directory (with trailing separator) that receives the answer_<n>.txt files.
OUTPUT_DIR = "D:\\106_data\\juhai_data\\"
# Word list, one word per line.
DICT_FILE = "data_1.dict"
# Address of the local token helper.
TOKEN_SERVER_URL = "http://127.0.0.1:8888/"
# Number of stored words after which a new answer file is started.
WORDS_PER_FILE = 100000
# Seconds to wait for the helper before giving up on a request.
REQUEST_TIMEOUT = 10
# Matches the helper's reply, e.g. "{ WuqarCRs: '' }" or "{ '+Babcde': '' }".
TOKEN_PATTERN = re.compile(r"\{ (.*?): '' \}", re.S)


def extract_token(response_text):
    """Return the token contained in the helper's reply, or None.

    The key may be wrapped in single quotes; those are stripped.
    """
    match = TOKEN_PATTERN.search(response_text)
    if match is None:
        return None
    return match.group(1).strip("'")


def request_token(word):
    """Post ``word`` to the helper and return its token, or None if absent."""
    response = requests.post(
        TOKEN_SERVER_URL,
        data=word.encode("utf-8"),
        headers={"Connection": "close"},
        timeout=REQUEST_TIMEOUT,
    )
    print(response.text)
    return extract_token(response.text)


def append_batch(batch_index, tokens):
    """Append ``tokens`` as one JSON object to answer_<batch_index>.txt."""
    target = OUTPUT_DIR + "answer_" + str(batch_index) + ".txt"
    with open(target, "a") as answer_file:
        answer_file.write(json.dumps(tokens))


def main():
    """Walk the dictionary and write the collected tokens in batches."""
    tokens = {}
    batch_index = 1
    with io.open(DICT_FILE, encoding="utf-8") as dictionary:
        for line in dictionary:
            word = line.strip("\n")
            if not word:
                # A blank line has no token; skip it without a request.
                continue
            print(word)
            token = request_token(word)
            if token is None:
                continue
            tokens[word] = token
            print(token)
            if len(tokens) >= WORDS_PER_FILE:
                append_batch(batch_index, tokens)
                batch_index += 1
                tokens = {}
    # Remaining words (possibly none) go to the current answer file.
    append_batch(batch_index, tokens)


if __name__ == "__main__":
    main()

 

爬虫:爬取海词的翻译内容