使用Python爬取mobi格式电纸书

首页 > 代码库 > 使用Python爬取mobi格式电纸书

使用Python爬取mobi格式电纸书

2024-08-05 06:54:29 217人阅读

最近做了个微信推送kindle电子书的公众号：kindle免费书库

不过目前电子书不算非常多，所以需要使用爬虫来获取足够书籍。

于是，写了以下这个爬虫，来爬取kindle114的电子书。

值得注意的地方：

当爬取数过大时，由于对方开启了防抓取，会返回一段javascript而非原始的html，所以我使用PyV8来执行这段js，从而拿到真正的地址。

目前存在的问题：

正则式写得还不够好，毕竟是第一次正式写爬虫：）

无法下载需要购买的附件

爬虫为单线程，爬完整个网站速度慢。我有试过转成多进程，但是貌似由于不能同时登录，大多数爬虫进程都无法正常爬取@@

# -*- coding: utf-8 -*-import urllib2import reimport requestsimport osimport hashlibdef fuckJS(js):    import PyV8    import re    #去掉<script>标签    js=js[31:-9]    for st in [‘window‘,‘location‘,"‘assign‘","‘href‘","‘replace‘"]:        equal=re.findall(‘[_A-Za-z0-9 =]+%s;‘%st,js)#找到变量赋值等式        if equal==[]:#有可能没有            continue        else:            equal=equal[0]        var=equal.split(‘=‘)[0].strip()#找出变量名        #把等式干掉        js=js.replace(equal,‘‘)        #把变量替换成它真正的意思        js=js.replace(var,st)        #把[‘xx‘] 替换成 .xx        js=js.replace("[‘%s‘]"%st.strip("‘"),‘.%s‘%st.strip("‘"))    #将 window.href= http://www.mamicode.com/后的内容踢掉，因为当PyV8只输出最后一个等式的值    if re.findall(‘window\.href=http://www.mamicode.com/.+‘,js)!=[]:        js=js.replace(re.findall(‘window\.href=http://www.mamicode.com/.+‘,js)[0],‘‘)    #删掉location.xxx=    js=js.replace(‘location.href=http://www.mamicode.com/‘,‘‘).replace(‘location.replace‘,‘‘).replace(‘location.assign‘,‘‘)    #交给你了-v-    ctxt2 = PyV8.JSContext()    ctxt2.enter()    #print ctxt2.eval(js)    trueAddr = ctxt2.eval(js)    print trueAddr    return trueAddrdef downloadMobi(name, url):    #去掉windows下不合法的文件名    unlawName = ‘<>/\\|:""*?‘    for i in unlawName:        name = name.replace(i, ‘‘)    #正则表达式写的不够好导致的问题@@    if name.count(‘ &nbsp;img src=http://www.mamicode.com/templateyeei_dream1cssyeeidigest_1.gif class=vm alt= title= ‘) > 0:        name = name.split(‘ &nbsp‘)[0]+‘.mobi‘    #避免重复下载    if os.path.exists(‘D:\Kindle114SpiderDownload\\‘ + name):        print ‘already have‘, name        return    url = url.split(‘ ‘)[0]    s = requests.session()    username = ‘你的用户名‘    password = ‘你的密码‘    passwordMd5 = hashlib.md5(password).hexdigest()    data = {‘formhash‘: ‘23cd6c29‘, ‘referer‘: ‘‘,‘username‘: username, ‘password‘: passwordMd5, ‘questionid‘:‘0‘, ‘answer‘:‘‘}    res=s.post(‘http://www.kindle114.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=LYn7n&inajax=1‘,data)    #res = 
s.get(‘http://www.kindle114.com/forum.php?mod=attachment&aid=MTQ2NTB8ZjhkNjY3NmF8MTQxNjg5OTYxOXw0NDIxfDczNjI%3D‘)    try:        res = s.get(url, timeout = 200)    except:        print ‘time out for ‘, name    #print ‘content[:50]‘    #print res.content[:50]    if res.content.count(‘<!DOCTYPE html‘) > 0:        print ‘!!!!!!!!!!!!!!!!!not a mobi, this file need gold coin!!!!!!!!!!!!!!!‘        return    try:        with open(‘D:\\Kindle114SpiderDownload\\‘ + name, "wb") as code:                code.write(res.content)    except:        print ‘!!!!!!!!!!!!!!!!!!!!!遇到不合法文件名!!!!!!!!!!!!!!!!!!‘, namedef spiderThread(url, threadName):    req = urllib2.urlopen(url, timeout = 10)    text = req.read()    if text.count(‘<!DOCTYPE html‘) == 0:        js = text        trueURL = ‘http://www.kindle114.com/‘ + fuckJS(js)        print ‘trueURL‘, trueURL        req = urllib2.urlopen(trueURL)        text = req.read()        #href = http://www.mamicode.com/‘    href = http://www.mamicode.com/‘<a href="http://www.mamicode.com/(.*?)".*?target="_blank">(.*?)</a>‘    href_re = re.compile(href)    href_info = href_re.findall(text)        bookSum = 0    for i in href_info:        if i[1].count(‘.mobi‘) > 0:            bookSum+=1    if bookSum == 0:        print ‘!!!bookSum = 0!!!!‘, text[:100]    if bookSum == 1:        print ‘only one book in this thread‘        bookFileName = threadName + ‘.mobi‘        for i in href_info:            if i[1].count(‘.mobi‘) > 0:                link = i[0].replace(‘amp;‘,‘‘)                break        print link, bookFileName        downloadMobi(bookFileName, link)    else:        print str(bookSum), ‘in this thread‘        for i in href_info:            if i[1].count(‘.mobi‘) > 0:                link = i[0].replace(‘amp;‘,‘‘)                bookFileName = i[1]                print link, bookFileName                downloadMobi(bookFileName, link)for pageNum in range(1, 125):        url = 
‘http://www.kindle114.com/forum.php?mod=forumdisplay&fid=2&filter=sortid&sortid=1&searchsort=1&geshi=1&page=‘ + str(pageNum)    print ‘=============url‘, url,‘===============‘    try:        req = urllib2.urlopen(url, timeout = 10)    except:        print ‘page time out‘, url    text = req.read()    href = ‘<h4><a href="http://www.mamicode.com/(.*?)" target="_blank" class="xst">(.*?)<span class="xi1">‘    href_re = re.compile(href)    href_info = href_re.findall(text)    for i in href_info:        print i[0], i[1]        url = ‘http://www.kindle114.com/‘+i[0]        threadName = i[1]        try:            spiderThread(url, threadName)        except Exception , e:            print ‘!!!!!!!!!!!!! Error with ‘,threadName, url,‘!!!!!!!!!!!!!!!!‘            print eraw_input(‘finish all!!!‘)

使用Python爬取mobi格式电纸书

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > 使用Python爬取mobi格式电纸书

使用Python爬取mobi格式电纸书

看完仍有疑问？有类似问题直接问程序猿