首页 > 代码库 > python爬虫获取豆瓣网前250部电影的详细信息

python爬虫获取豆瓣网前250部电影的详细信息

网址 https://movie.douban.com/top250

一共250部电影,有分页,获取每一部的详细信息

不采用框架,使用 urllib 读取网页,re 进行正则表达式匹配,lxml 进行 xpath 查找

 1 from film import *
 2 from urllib import request
 3 import time,re
 4 url=rhttps://movie.douban.com/top250?start=
 5 for i in range(10):
 6     url=url+str(i*25)
 7     print(url)
 8     
 9     headers = {
10         User-Agent: rMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
11                       rChrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3,
12         Connection: keep-alive
13     }
14     req=request.Request(url,headers=headers)
15     page=request.urlopen(req).read()
16     page=page.decode(utf-8)
17     #fp=open("page.txt",mode="w",encoding="UTF-8")
18     #fp.writelines(page)
19     p=re.compile(r\<em\sclass=\"\"\>\d+\</em\>\s*\<a\shref=http://www.mamicode.com/"https://movie.douban.com/subject/\d+/\"\>)
20     result=p.findall(page)
21     for item in result:
22         #print(item)
23         p=re.compile(r\d+)
24         no=p.findall(item)
25         #print(no[0])
26         p=re.compile(rhttps://movie.douban.com/subject/\d+/)
27         rurl=p.findall(item)
28         #print(rurl[0])
29         filma=film(no[0],rurl[0],‘‘,‘‘,‘‘,‘‘,‘‘,‘‘)
30         filma.getall()
31         filma.detail()
32         time.sleep(3)
33     #print (result)
34     time.sleep(3)
35     #print(i)

 

film.py 如果要做数据的持久化,在这里实现

 1 from urllib import request
 2 from lxml import etree
 3 class film:
 4     def __init__(self,no,url,name,year,score,director,classification,actor):
 5         self.name=name
 6         self.year=year
 7         self.score=score
 8         self.director=director
 9         self.classification=classification
10         self.actor=actor
11         self.url=url
12         self.no=no
13     
14     def detail(self):
15         temp = "No:%s;url:%s;片名:%s;年份:%s;分数:%s;导演:%s;分级:%s;演员:%s;"   %(self.no,self.url,self.name,self.year,self.score,self.director,self.classification,self.actor)  
16         print(temp)
17     def getall(self):
18         headers={
19         User-Agent: rMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) 
20                       rChrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3,
21         Connection: keep-alive
22         }
23         req=request.Request(self.url,headers=headers)
24         page=request.urlopen(req).read()
25         page=page.decode(utf-8)
26         selector=etree.HTML(page)
27         print (page)
28         self.name=selector.xpath(/html/body/div[3]/div[1]/h1/span[1]/text())
29         self.year=selector.xpath(//*[@id="content"]/h1/span[2]/text())
30         self.score=selector.xpath(//*[@id="interest_sectl"]/div[1]/div[2]/strong/text())
31         self.director=selector.xpath(//*[@id="info"]/span[1]/span[2]/a/text())
32         self.classification=selector.xpath(//*[@id="info"]/span[5]/text())
33         self.actor=selector.xpath(//*[@id="info"]/span[3]/span[2]/a/text())
34         
35         

 

python爬虫获取豆瓣网前250部电影的详细信息