首页 > 代码库 > python 爬起点目录

python 爬起点目录

 1 #目标:书名,简介,作者,字数 2 #首先确定源代码的列表 3 import urllib.request 4 import re 5 from bs4 import BeautifulSoup 6 import random 7 import time 8  9 load=input("路径:")10 num=input("输入页数:")11 12 13 14 15 def gethtml(url):                     #获取页面源代码html16     page=urllib.request.urlopen(url)17     html=page.read().decode(utf-8)  #html是一个列表18     soup=BeautifulSoup(html,html.parser)19     return soup20 21 def getbook(soup,load):22     for i in range(1,21):23 24         xl=soup.find_all("li",{"data-rid":str(i)})25         sm = re.compile(r<h4><a .*?>(.*?)</a></h4>)    #匹配书名26         sm1=sm.findall(str(xl))27         a=""+sm1[0]+""28 29         ze = re.compile(r<a class="name" .*?>(.*?)</a>)30         ze1 = ze.findall(str(xl))                        #匹配作者名31         b=ze1[0]32 33         jj=re.compile(r<p class="intro">([\s\S]*?)</p>)34         jj1=jj.findall(str(xl))                          #匹配简介35         c=jj1[0]36 37         zs=re.compile(r<span>(.*?)</span>)38         zs1=zs.findall(str(xl))39         d=zs1[1]40         content=[a,b,c,d]41         42         for j in range(0,4):43             44             with open(load, a) as f:45                 if j == 3:46                     f.write(content[3])47                 else:48                     f.write(content[j]+"\n")49                 50         with open(load, a) as f:51             f.write("\n\n----------------------------------------------------------------------\n\n")52 def geturl(num):53     for page in range(1,int(num)+1):54         55         url="http://fin.qidian.com/?size=-1&sign=-1&tag=-1&chanId=-1&subCateId=-1&orderId=&update=-1&page=%d&month=-1&style=1&vip=0" % page56         57         soup=gethtml(url)58         getbook(soup,load)59         time.sleep(2.5)60         61         62 geturl(num)

实现

python 爬起点目录