
[Original] A rather hacky little crawler I wrote recently

Goal: crawl a comic I like from 爱漫画 (imanhua.com).

Analysis phase:

0. Open the 爱漫画 homepage and what greets you is a big blob of JS code... I was instantly lost.

1. After capturing the traffic and going through the HTML source, it turns out that 爱漫画 serves the images from a separate domain, while the current domain generates the image file names dynamically with JS. That's where the trouble is: first, there are quite a few naming patterns for the image files, so they can't be scraped straight out of the JS source; second, the file names show up in two different forms, one a plain dictionary, the other a dictionary string returned after some computation. By "dictionary string" I mean that [a:b] becomes "[a:b]".

version 1:

var cInfo={"bid":862,"burl":"/comic/862/list_84410.html","bname":"\u0053\u006b\u0065\u0074\u0044\u0061\u006e\u0063\u0065","cid":84410,"cname":"\u7b2c\u0032\u0038\u0038\u8bdd","len":22,"files":["JOJO_001.jpg","JOJO_002-003.jpg","JOJO_004.jpg","JOJO_005.png","JOJO_006.png","JOJO_007.png","JOJO_008.png","JOJO_009.png","JOJO_010.png","JOJO_011.png","JOJO_012.png","JOJO_013.png","JOJO_014.png","JOJO_015.png","JOJO_016.png","JOJO_017.png","JOJO_018.png","JOJO_019.png","JOJO_020.png","JOJO_021.png","JOJO_022.png","JOJO_023.png"],"finished":1};

 

version 2:

eval(function(p,a,c,k,e,d){e=function(c){return(c<a?"":e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--)d[e(c)]=k[c]||e(c);k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1;};while(c--)if(k[c])p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c]);return p;}('E h={"i":"\\g\\e\\2\\f\\j\\n\\o\\m\\2","k":"/l/3/4.6","7":3,"5":"\\c\\d\\8\\9\\a","b":["w.0","F.0","D.0","B.0","C.0","G.0","K.0","L.0","J.0","H.0","I.0","A.0","s.0","t.0","r.0","p.0","q.0"],"u":y,"z":x,"v":1};',48,48,'png||u0065|862|list_83707|cname|html|bid|u0038|u0036|u8bdd|files|u7b2c|u0032|u006b|u0074|u0053|cInfo|bname|u0044|burl|comic|u0063|u0061|u006e|016|017|015|013|014|cid|finished|001|17|83707|len|012|004|005|003|var|002|006|010|011|009|007|008'.split('|'),0,{}))

 

All I could do was OTL...

 

Solution:

First, note that the crawler can be built around two modes: one handles version 1 directly, and the other converts version 2 into version 1 and then handles it the same way (a version-1 parsing sketch follows below).
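
For the version 1 case, here is a minimal sketch of the idea; the full script at the end does essentially the same thing, and the name extract_files is just for illustration:

import re

def extract_files(jscode):
    # keep only the part after the "files": key of cInfo
    part = jscode[jscode.find('files":'):]
    # grab every quoted token that ends in .jpg or .png
    return [m[0] for m in re.findall(r'([^"]+(\.jpg|\.png))', part)]

# e.g. extract_files(cinfo_js) -> ['JOJO_001.jpg', 'JOJO_002-003.jpg', ...]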

A quick Google turned up a library called PyV8, which can evaluate JS code dynamically, and after some debugging I located the variable in version 2 that holds the dictionary string. The catch is that it lives inside the packer function as a local variable. Still, once the analysis got this far, the fix was simple (sketched right after the list):

0. Add a global variable hogo

1. Right before the function returns the dictionary string, assign that string to hogo

2. Read the value of hogo
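
A minimal sketch of those three steps, assuming the packed eval(function(p,a,c,k,e,d){...}) block from version 2 is already in the string packed; PyV8.JSContext and .locals are the same calls the full script uses:

import PyV8

def unpack(packed):
    jscode = 'var hogo;' + packed                     # 0. add a global variable hogo
    pos    = jscode.find('return p')
    jscode = jscode[:pos] + 'hogo=p;' + jscode[pos:]  # 1. stash p right before it is returned
    with PyV8.JSContext() as ctx:
        ctx.eval(jscode)
        return ctx.locals.hogo                        # 2. read hogo: a version-1 style string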

 

And of course, remember to change the Referer in the request headers at the end, otherwise the image server will just give you a 404.
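
Something along these lines (the image URL is pieced together from the cInfo fields above purely for illustration):

import urllib2

headers = {
    'Referer': 'http://www.imanhua.com/comic/862/list_84410.html',  # the chapter viewer page
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36',
}
req = urllib2.Request('http://c4.mangafiles.com/Files/Images/862/84410/JOJO_001.jpg',
                      headers=headers)
img = urllib2.urlopen(req).read()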

 

Full code:

 

# -*- coding: UTF-8 -*-
import re
import urllib
import urllib2
import sys
import PyV8
import os

# Crawl every chapter of Sket Dance on imanhua:
#   find a chapter,
#   work out the file names and download all of its images
#   (the concrete file names are obtained by executing the page's JS),
#   then move on to the next chapter.

mydir   = r'd:/sketdance/'
imghost = r'http://c4.mangafiles.com/Files/Images/'
tarhost = r'http://www.imanhua.com'
taraddr = r'http://www.imanhua.com/comic/862/'
headers = {'Referer':'http://www.imanhua.com/comic/862/list_82877.html',
           'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'}
tarRequest = urllib2.Request(taraddr,headers=headers)

def downloadParts(url,title):
    print 'now is downloading ' + title + ':'
    print url
    print title.decode('gb2312').encode('gbk')
    os.mkdir(mydir + title.decode('gb2312').encode('gbk'))
    # build the image path: <comic id>/<chapter id>/
    firststart = url.find('comic/') + len('comic/')
    firstslash = url.find('/',firststart)
    parturl = url[firststart:firstslash + 1]
    sestart = url.find('_',30)
    sefinis = url.find('.',sestart)
    parturl = parturl + url[sestart + 1:sefinis] + '/'
    partsRequest = urllib2.Request(url,headers=headers)
    while 1:    # retry until the chapter page is fetched
        try:
            page = urllib2.urlopen(partsRequest)
        except:
            pass
        else:
            break
    page   = page.read()
    jscode = re.compile(r'<script type="text/javascript">(.+?)</script>')
    jscode = jscode.search(page)
    jscode = jscode.group(1)
    if jscode.find('split') != -1:
        # version 2: packed code, inject the global hogo and run it in PyV8
        jscode = 'var hogo;' + jscode
        start  = jscode.find('return p')
        jscode = jscode[0:start] + 'hogo=p;' + jscode[start:]
        with PyV8.JSContext() as env1:
            env1.eval(jscode)
            vars = env1.locals
            jscode = vars.hogo
    # version 1 (or unpacked version 2): pull the file names out of "files":[...]
    fileinx = jscode.find('files":')
    fileinx = jscode[fileinx:]
    imgre   = re.compile(r'([^"]+(\.jpg|\.png))')
    imglst  = imgre.findall(fileinx)
    imglst  = [key[0] for key in imglst]
    for index,key in enumerate(imglst):
        if key.find('.jpg') != -1:
            flag = '.jpg'
        else:
            flag = '.png'
        print imghost + parturl + key
        # the image host checks the Referer: it must point at the viewer page
        if index == 0:
            headers['Referer'] = url
        else:
            headers['Referer'] = url + '?p=' + str(index)
        imgRequest = urllib2.Request(imghost + parturl + key,headers=headers)
        while 1:    # retry until the image is fetched
            try:
                img = urllib2.urlopen(imgRequest)
            except:
                pass
            else:
                break
        f = open(mydir + title + '/' + str(index) + flag,'wb')
        f.write(img.read())
        f.close()

def findTar(url):
    while 1:    # retry until the chapter list page is fetched
        try:
            page = urllib2.urlopen(tarRequest)
        except:
            pass
        else:
            break
    page  = page.read()
    first = u'SketDance漫画列表'
    first = first.encode('gb2312')
    index = page.find(first)
    start = index
    title = ""
    parts = u'第'
    parts = parts.encode('gb2312')
    while index != -1:
        # each chapter link looks like <a href="..." title="第NNN话">
        index = page.find('title="' + parts,index)
        if index == -1:
            break
        finis = page.find('"',index + 7)
        title = page[index + 7:finis]
        start = page.rfind('href="',start,index)
        finis = page.find('"',start + 6)
        downloadParts(tarhost + page[start + 6:finis],title)
        index += 1
        start = index

findTar(taraddr)

  
