Crawler: Fetching All Books from Dangdang
#encoding:utf-8
#
#author:wuhao
#
#******
# Crawl all books on dangdang.com; no crawler framework is used.
# main is the entry-point script.
# KindLinks.py and 获取数据信息.py are the two wrapped classes.
# KindLinks has a single method; it returns listUrl -- pairs of
# (name (sub-category name), url (that sub-category's link)) -- and
# LB -- the top-level categories.
# 获取数据信息 has two methods: getPage() returns the number of pages, and
# getInfo() returns each book's fields (title, comment count, author,
# publisher, price, publish date). The title is not parsed any further,
# so it may be somewhat messy.
# Dangdang category page: collect the links for every kind of book
from bs4 import BeautifulSoup


class _FirstPageLinkToGetUrl():
    def __init__(self, opener):
        self.opener = opener
        self.url = "http://category.dangdang.com/?ref=www-0-C"

    def getDifferentSeriesBookUrl(self):
        html = self.opener.open(self.url).read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        LB = []       # top-level category names
        listUrl = []  # one {sub-category name: url} dict per top-level category
        count = []    # number of sub-categories per top-level category
        # outsideDiv -- the outer div of one category block
        # _li        -- one sub-category <li>
        for outsideDiv in soup.find("div", class_="classify_books", id="floor_1").find_all("div", class_="classify_kind"):
            LB.append(outsideDiv.div.a.string)
            temp = 0
            dictUrl = {}
            for _li in outsideDiv.find("ul").find_all("li"):
                if _li.a.string == "更多":  # skip the "more" link
                    continue
                temp += 1
                dictUrl[_li.a.string] = _li.a.get("href")
            count.append(temp)
            listUrl.append(dictUrl)
        return listUrl, LB
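To make the returned shape concrete, here is a minimal usage sketch; the opener setup mirrors the main script below, and the annotated category names are only illustrative, not actual scraped values:

    import http.cookiejar
    import urllib.request

    from KindLinks import _FirstPageLinkToGetUrl

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [("User-Agent", "Mozilla/5.0")]

    listUrl, LB = _FirstPageLinkToGetUrl(opener).getDifferentSeriesBookUrl()
    # LB      -> ["童书", "文学", ...]               one name per top-level category
    # listUrl -> [{"0-2岁": "http://...", ...}, ...] one dict per top-level category
    print(len(LB), "categories,", sum(len(d) for d in listUrl), "sub-categories")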
# Extract the information of the books contained in one listing page
from bs4 import BeautifulSoup
import re


class _GetBookInfo():
    def __init__(self, opener):
        self.opener = opener

    def getPage(self, url):
        # the total page count appears in the page as "<span>/N</span>"
        html = self.opener.open(url).read().decode("gbk")
        with open("test.txt", "w") as f:  # debug dump of the raw page
            f.write(html)
        regex = re.compile(r"<span>/\d+</span>")
        valueNum = re.findall(r"\d+", regex.findall(html)[0])
        return int(valueNum[0])

    def getInfo(self, url):
        # per book: [title, comment count, author, publisher, price, publish date]
        html = self.opener.open(url).read().decode("gbk")
        soup = BeautifulSoup(html, "html.parser")
        ulTag = soup.find("ul", class_="list_aa listimg", id=True)
        liTag = ulTag.find_all("li", id=True)
        data1 = []
        for li in liTag:  # one <li> per book
            data = []
            try:
                data.append(li.find("p", class_="name").string)
                data.append(li.find("p", class_="star").a.string)
                data.append(li.find("p", class_="author").a.string)
                data.append(li.find("p", class_="publishing").a.string)
                data.append(li.find("p", class_="price").span.string)
                data.append(re.findall(r"/ .+ ", str(li.find("p", class_="publishing_time")))[0].replace(" ", "").replace("/", ""))
                data1.append(data)
            except:
                continue
        return data1
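The two methods can also be exercised in isolation; a sketch assuming the module is importable the same way main imports it, with a placeholder sub-category URL standing in for one returned by KindLinks:

    import http.cookiejar
    import urllib.request

    from 当当网图书爬取 import 获取数据信息 as bookInfo  # as imported in main

    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [("User-Agent", "Mozilla/5.0")]

    bio = bookInfo._GetBookInfo(opener)
    url = "http://category.dangdang.com/cp01.41.26.00.00.00.html"  # placeholder sub-category URL
    pages = bio.getPage(url)  # parses the "<span>/N</span>" page counter
    books = bio.getInfo(url)  # [[title, comments, author, publisher, price, date], ...]
    print(pages, "pages;", len(books), "books on this page")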
#encoding:utf-8
# main: drive the two classes above and write the results to Excel
from 当当网图书爬取 import 获取数据信息 as bookInfo
from 当当网图书爬取 import KindLinks as kls
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import http.cookiejar
import re
import xlwt
import xlrd


def getCorrectUrl(url, page):
    # page 0 is the plain sub-category URL; page N becomes ".../pgN-..."
    if page == 0:
        return url
    return url.replace("m/", "m/pg" + str(page) + "-")


# url: Dangdang's page listing all product categories
url = "http://category.dangdang.com/?ref=www-0-C"

# build a cookie-carrying opener
Cookie = http.cookiejar.CookieJar()
CookieHandle = urllib.request.HTTPCookieProcessor(Cookie)
opener = urllib.request.build_opener(CookieHandle)

# masquerade as a browser
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
opener.addheaders = list(header.items())

# open the page once so the opener picks up its cookies
opener.open(url)

# first fetch the category links from KindLinks
_kls = kls._FirstPageLinkToGetUrl(opener)
bdata = _kls.getDifferentSeriesBookUrl()
# (reconstructed from usage below: split the category dicts into parallel
# name/url lists so they can be indexed as bdata_url_name[_gd][_bdata])
bdata_url = bdata[0]   # list of {sub-category name: url} dicts
bdata_gd = bdata[1]    # top-level category names
bdata_url_name = [list(d.keys()) for d in bdata_url]
bdata_url_url = [list(d.values()) for d in bdata_url]

bio = bookInfo._GetBookInfo(opener)
# Excel header row: title, comment count, author, publisher, price, publish date
StyleinfoInExcel = ["书名", "评论数", "作者", "出版社", "价格", "出版日期"]
book = xlwt.Workbook(encoding="utf-8")

count = 0  # running total of books written
for _gd in range(len(bdata_url)):
    for _bdata in range(len(bdata_url_name[_gd])):
        page = bio.getPage(bdata_url_url[_gd][_bdata])  # number of pages in this sub-category
        sheetname = bdata_url_name[_gd][_bdata].replace("/", "-")
        try:
            sheet = book.add_sheet(sheetname=sheetname)
        except:
            continue
        print(sheetname + "正在写入...")  # "... is being written"
        for i in range(len(StyleinfoInExcel)):
            sheet.write(0, i, StyleinfoInExcel[i])
        # read each page and append its rows
        temp = 0
        for CurrentPage in range(1, page + 1):  # CurrentPage is the page actually fetched
            try:
                data = bio.getInfo(getCorrectUrl(bdata_url_url[_gd][_bdata], CurrentPage))
                for i in range(len(data)):
                    temp += 1
                    for j in range(len(data[i])):
                        sheet.write(temp, j, data[i][j])
                    count += 1
            except:
                continue
        print("已写入" + str(count) + "本书")  # "N books written so far"
        print(sheetname + "写入完成...\r\n")  # "sheet finished"
        # after the last sub-category, save one .xls per top-level category
        if _bdata == len(bdata_url_name[_gd]) - 1:
            book.save(bdata_gd[_gd].replace("/", "-") + ".xls")
            book = xlwt.Workbook(encoding="utf-8")
            print("--------已完成" + bdata_gd[_gd])  # "category done"

print("写入完成,共计" + str(count) + "本书")  # "all done, N books in total"
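The pgN- rewriting that getCorrectUrl performs is easy to check offline; a minimal sanity check, where the sub-category URL is a made-up example of Dangdang's usual cpXX...html pattern:

    def getCorrectUrl(url, page):
        # same logic as in main, duplicated here so the check is self-contained
        if page == 0:
            return url
        return url.replace("m/", "m/pg" + str(page) + "-")

    u = "http://category.dangdang.com/cp01.41.26.00.00.00.html"
    assert getCorrectUrl(u, 0) == u  # page 0 keeps the plain URL
    assert getCorrectUrl(u, 3) == "http://category.dangdang.com/pg3-cp01.41.26.00.00.00.html"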