
Scraping mobi-format e-books with Python

I recently set up a WeChat official account that pushes Kindle e-books to subscribers: kindle免费书库.

It does not hold very many e-books yet, so a crawler is needed to gather enough titles.

So I wrote the following crawler to scrape e-books from kindle114.

Things worth noting:

When the request volume gets too high, the site's anti-scraping protection kicks in and it returns a piece of JavaScript instead of the original HTML, so I use PyV8 to execute that JS and recover the real address.
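
For example, a minimal sketch of that step (the JavaScript snippet here is made up for illustration; the real response from kindle114 is obfuscated and is first rewritten by the fuckJS function in the script below):

# -*- coding: utf-8 -*-
import PyV8

# A hypothetical anti-scraping snippet: after the obfuscated assignments are
# rewritten, only the URL string is left as the last expression.
js = "var u = 'forum.php?mod=viewthread&tid=12345'; u"

ctxt = PyV8.JSContext()
ctxt.enter()
true_path = ctxt.eval(js)   # PyV8 returns the value of the last expression
print 'http://www.kindle114.com/' + true_path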

Known problems:

The regular expressions could be written better; after all, this is my first proper crawler :) (a tighter pattern for the attachment links is sketched after this list)

Attachments that require purchase cannot be downloaded.

The crawler is single-threaded, so crawling the whole site is slow. I tried switching to multiple processes, but apparently because the account cannot be logged in concurrently, most of the crawler processes failed to fetch anything @@ (a possible cookie-sharing workaround is sketched after this list as well)
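
On the regex point: the attachment-link pattern in the script below captures everything up to the closing </a>, so decorative markup such as the digest icon can leak into the file name (hence the '&nbsp' cleanup in downloadMobi). A stricter pattern, shown here as a sketch against the same Discuz-style markup (the sample HTML is made up for illustration), keeps the markup out of the captured title:

# -*- coding: utf-8 -*-
import re

# Only capture the href attribute itself and link text that contains no markup,
# so an <img> following the anchor text can no longer leak into the file name.
mobi_link = re.compile(r'<a href="([^"]+)"[^>]*target="_blank">([^<]*?\.mobi)</a>')

sample = ('<a href="forum.php?mod=attachment&amp;aid=123" target="_blank">'
          'some_book.mobi</a>&nbsp;<img src="digest_1.gif" class="vm" alt="" />')
for link, name in mobi_link.findall(sample):
    print link.replace('amp;', ''), name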
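
On the concurrency point: one idea I have not verified against kindle114 is to log in once in the parent process and share the resulting cookies with the workers, so no process ever logs in on its own. A rough sketch of that approach (the login parameters mirror the script below; whether the forum accepts the same session from several processes at once is untested):

# -*- coding: utf-8 -*-
import hashlib
import requests
from multiprocessing import Pool

def login():
    # Log in once in the parent and return the session cookies as a plain dict.
    s = requests.session()
    data = {'formhash': '23cd6c29', 'referer': '',
            'username': 'your_username',
            'password': hashlib.md5('your_password').hexdigest(),
            'questionid': '0', 'answer': ''}
    s.post('http://www.kindle114.com/member.php?mod=logging&action=login'
           '&loginsubmit=yes&loginhash=LYn7n&inajax=1', data)
    return requests.utils.dict_from_cookiejar(s.cookies)

def crawl_page(args):
    # Each worker rebuilds a session from the shared cookies instead of logging in.
    page_num, cookies = args
    s = requests.session()
    s.cookies = requests.utils.cookiejar_from_dict(cookies)
    url = ('http://www.kindle114.com/forum.php?mod=forumdisplay&fid=2'
           '&filter=sortid&sortid=1&searchsort=1&geshi=1&page=%d' % page_num)
    return s.get(url, timeout=10).status_code

if __name__ == '__main__':
    cookies = login()
    pool = Pool(4)
    print pool.map(crawl_page, [(n, cookies) for n in range(1, 10)])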

# -*- coding: utf-8 -*-
import urllib2
import re
import requests
import os
import hashlib

def fuckJS(js):
    import PyV8
    import re
    # strip the surrounding <script> tags
    js = js[31:-9]
    for st in ['window', 'location', "'assign'", "'href'", "'replace'"]:
        equal = re.findall('[_A-Za-z0-9 =]+%s;' % st, js)  # find the variable-assignment statement
        if equal == []:  # it may not exist
            continue
        else:
            equal = equal[0]
        var = equal.split('=')[0].strip()  # extract the variable name
        # remove the assignment itself
        js = js.replace(equal, '')
        # replace the variable with what it really stands for
        js = js.replace(var, st)
        # turn ['xx'] into .xx
        js = js.replace("['%s']" % st.strip("'"), '.%s' % st.strip("'"))
    # drop the window.href=... statement entirely, since PyV8 only returns
    # the value of the last expression
    if re.findall('window\.href=.+', js) != []:
        js = js.replace(re.findall('window\.href=.+', js)[0], '')
    # remove the location.xxx= prefixes so only the URL string remains
    js = js.replace('location.href=', '').replace('location.replace', '').replace('location.assign', '')
    # hand the rest over to PyV8
    ctxt2 = PyV8.JSContext()
    ctxt2.enter()
    #print ctxt2.eval(js)
    trueAddr = ctxt2.eval(js)
    print trueAddr
    return trueAddr

def downloadMobi(name, url):
    # strip characters that are illegal in Windows file names
    unlawName = '<>/\\|:"*?'
    for i in unlawName:
        name = name.replace(i, '')
    # workaround for the loose regex picking up the digest <img> markup @@
    if name.count(' &nbsp;img src=templateyeei_dream1cssyeeidigest_1.gif class=vm alt= title=') > 0:
        name = name.split(' &nbsp')[0] + '.mobi'
    # skip files that have already been downloaded
    if os.path.exists('D:\\Kindle114SpiderDownload\\' + name):
        print 'already have', name
        return
    url = url.split(' ')[0]
    s = requests.session()
    username = 'your_username'
    password = 'your_password'
    passwordMd5 = hashlib.md5(password).hexdigest()
    data = {'formhash': '23cd6c29', 'referer': '', 'username': username,
            'password': passwordMd5, 'questionid': '0', 'answer': ''}
    res = s.post('http://www.kindle114.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LYn7n&inajax=1', data)
    # res = s.get('http://www.kindle114.com/forum.php?mod=attachment&aid=MTQ2NTB8ZjhkNjY3NmF8MTQxNjg5OTYxOXw0NDIxfDczNjI%3D')
    try:
        res = s.get(url, timeout=200)
    except:
        print 'time out for ', name
    # print res.content[:50]
    if res.content.count('<!DOCTYPE html') > 0:
        print '!!!!!!!!! not a mobi, this file needs gold coins !!!!!!!!!'
        return
    try:
        with open('D:\\Kindle114SpiderDownload\\' + name, "wb") as code:
            code.write(res.content)
    except:
        print '!!!!!!!!! ran into an illegal file name !!!!!!!!!', name

def spiderThread(url, threadName):
    req = urllib2.urlopen(url, timeout=10)
    text = req.read()
    if text.count('<!DOCTYPE html') == 0:
        # the anti-scraping JS came back instead of HTML: evaluate it to get the real URL
        js = text
        trueURL = 'http://www.kindle114.com/' + fuckJS(js)
        print 'trueURL', trueURL
        req = urllib2.urlopen(trueURL)
        text = req.read()
    href = '<a href="(.*?)".*?target="_blank">(.*?)</a>'
    href_re = re.compile(href)
    href_info = href_re.findall(text)
    bookSum = 0
    for i in href_info:
        if i[1].count('.mobi') > 0:
            bookSum += 1
    if bookSum == 0:
        print '!!!bookSum = 0!!!!', text[:100]
    if bookSum == 1:
        print 'only one book in this thread'
        bookFileName = threadName + '.mobi'
        for i in href_info:
            if i[1].count('.mobi') > 0:
                link = i[0].replace('amp;', '')
                break
        print link, bookFileName
        downloadMobi(bookFileName, link)
    else:
        print str(bookSum), 'in this thread'
        for i in href_info:
            if i[1].count('.mobi') > 0:
                link = i[0].replace('amp;', '')
                bookFileName = i[1]
                print link, bookFileName
                downloadMobi(bookFileName, link)

for pageNum in range(1, 125):
    url = 'http://www.kindle114.com/forum.php?mod=forumdisplay&fid=2&filter=sortid&sortid=1&searchsort=1&geshi=1&page=' + str(pageNum)
    print '=============url', url, '==============='
    try:
        req = urllib2.urlopen(url, timeout=10)
    except:
        print 'page time out', url
    text = req.read()
    href = '<h4><a href="(.*?)" target="_blank" class="xst">(.*?)<span class="xi1">'
    href_re = re.compile(href)
    href_info = href_re.findall(text)
    for i in href_info:
        print i[0], i[1]
        url = 'http://www.kindle114.com/' + i[0]
        threadName = i[1]
        try:
            spiderThread(url, threadName)
        except Exception, e:
            print '!!!!!!!!! Error with ', threadName, url, '!!!!!!!!!'
            print e

raw_input('finish all!!!')

 
