Python: scraping every novel on a qidian listing page
import re
import time
import urllib.request

from bs4 import BeautifulSoup

url = input("URL of the listing page: ")


def gethtml(url):
    # Fetch a page and return it parsed; html is the page source as a string
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def getbookurl(soup):
    # Collect the first-chapter URL of every book on the listing page.
    # NOTE: the href parts of these regexes were mangled by the page this
    # code was copied from; since each match gets "http:" prepended below,
    # the capture is assumed to be the protocol-relative link ("//...").
    firsturl2 = []
    bookurl = soup.find_all("h4")
    bookurl1 = re.findall(r'<h4><a data-bid=".*?" data-eid=".*?" href="(//.*?)" target="_blank"', str(bookurl))
    for i in range(0, len(bookurl1)):
        bookurl = "http:" + bookurl1[i]
        soup1 = gethtml(bookurl)          # open each book's detail page
        time.sleep(0.2)
        firsturl = soup1.find_all("a", {"class": "red-btn J-getJumpUrl "})
        firsturl1 = re.findall(r'data-firstchapterjumpurl=".*?" href="(//.*?)" id="readBtn">', str(firsturl))
        if firsturl1[0] == '':            # skip books whose first-chapter link is empty
            continue
        firsturl2.append(firsturl1[0])
    return firsturl2


def getcontent(soup, load):
    # Extract one chapter's title and body and append them to the book's file
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    content1 = re.compile(r'<p>([\s\S]*?)</p>')
    content2 = content1.findall(str(content))
    content3 = re.sub(r'</?\w+[^>]*>', '', content2[0])   # strip leftover tags
    content4 = content3.replace('。', '。\n\n')            # paragraph break after each full stop
    contentname = re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
    contentname1 = contentname.findall(str(soup))          # chapter title
    book = "-" * 64 + contentname1[0] + "-" * 64 + "\n\n\n" + content4
    with open(load, 'a', encoding='utf-8') as f:
        f.write(book)


def nextcontent(soup):
    # Return the URL behind the "next chapter" button (id j_chapterNext);
    # its data-eid is qd_R109 on most pages and qd_R118 on the rest
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(//.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    if content1 == []:
        step1 = re.compile(r'<a data-eid="qd_R118" href="(//.*?)" id="j_chapterNext">')
        content2 = step1.findall(str(content))
        return "http:" + content2[0]
    return "http:" + content1[0]


def panduan(soup):
    # An empty result means there is no ordinary "next chapter" link,
    # i.e. the current chapter is the book's last one
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(//.*?)" id="j_chapterNext">')
    content1 = step.findall(str(content))
    return content1

# -------------------------------------------------------------------------

# Walk the listing page once; url is reused below for the chapter pages
soup2 = gethtml(url)
firsturl2 = getbookurl(soup2)

for j in range(0, len(firsturl2)):
    url = "http:" + firsturl2[j]
    soup1 = gethtml(url)
    bookname = re.findall(r'<h1>(.*?)</h1>', str(soup1))
    load = "d:/88/%s.txt" % bookname[0]      # the output folder must already exist
    i = 0
    while True:
        soup = gethtml(url)
        getcontent(soup, load)
        url = nextcontent(soup)
        content1 = panduan(soup)
        i += 1
        print("Chapter %d downloaded" % i)
        if content1 == []:                   # last chapter reached
            break
        time.sleep(0.2)
    print("------------- Book %d downloaded -------------" % (j + 1))
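The regex-over-str(soup) approach above is brittle: each pattern depends on the exact attribute order and on the data-eid values, which is what forces the qd_R109/qd_R118 special-casing. Below is a minimal sketch of the same chapter walk using requests and BeautifulSoup CSS selectors instead. The selectors (h4 a, h3.j_chapterName, div.read-content, a#j_chapterNext) simply mirror the ids and classes used in the code above and are assumptions, not verified against the current qidian.com markup.

import time

import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0"}   # many sites reject the default urllib UA


def fetch(url):
    # Fetch and parse one page, failing loudly on HTTP errors
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    return BeautifulSoup(resp.text, "html.parser")


def book_urls(listing_soup):
    # Book links are protocol-relative hrefs inside the listing's <h4> headings
    for a in listing_soup.select("h4 a[href]"):
        yield "http:" + a["href"]


def chapter(chapter_soup):
    # Return (title, body) for one chapter page; one <p> per paragraph
    title = chapter_soup.select_one("h3.j_chapterName").get_text(strip=True)
    body = chapter_soup.select_one("div.read-content")
    text = "\n\n".join(p.get_text(strip=True) for p in body.find_all("p"))
    return title, text


def next_url(chapter_soup):
    # The "next chapter" button carries id j_chapterNext on every chapter page.
    # On the last chapter it may point back to the book page (the case
    # panduan() detects above), so a real run should also check its data-eid.
    a = chapter_soup.select_one("a#j_chapterNext[href]")
    return "http:" + a["href"] if a else None


if __name__ == "__main__":
    url = input("URL of a first chapter: ")
    with open("book.txt", "a", encoding="utf-8") as f:
        while url:
            soup = fetch(url)
            title, text = chapter(soup)
            f.write("-" * 32 + title + "-" * 32 + "\n\n" + text + "\n\n")
            url = next_url(soup)
            time.sleep(0.2)               # be polite to the server

Selecting on ids and classes survives attribute reordering and new attributes, so this version needs no per-page regex variants; only a markup redesign on Qidian's side would break it.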
Still learning!!!