首页 > 代码库 > [Python]爬取糗事百科

[Python]爬取糗事百科

# coding=utf-8
import urllib2
import urllib
import re

class QiuShi:
    """Fetch pages of the qiushibaike.com "hot" listing and print their posts.

    The class is written to run on both Python 2 and Python 3: the HTTP
    module is imported under whichever name the interpreter provides, and
    all output goes through single-argument ``print(...)`` calls that are
    given text (unicode) strings.
    """

    # Exclusive upper bound of the page numbers visited by Start().
    PAGE_LIMIT = 5

    # Captures (title attribute, inner text) of each <div class="content">.
    # re.S lets "." match newlines, so one match may span several lines.
    _ITEM_PATTERN = re.compile(
        '<div.*?class="content".*?title="(.*?)">(.*?)</div>', re.S)

    def __init__(self):
        # This method used to be spelled ``_init_``; Python never called it,
        # so instances were created without a ``page`` attribute.
        self.page = 1

    def GetQiuShis(self, page):
        """Download one listing page and return the posts found on it.

        Args:
            page: Page number appended to the listing URL. May be an int or
                a numeric string.

        Returns:
            A list of ``[title, text]`` lists, one per pattern match, with
            newline characters removed from both fields. ``title`` is the
            div's ``title`` attribute (the post time, according to the
            original author's note).

        Raises:
            Whatever ``urlopen`` raises on a failed request, and
            ``UnicodeDecodeError`` if the response body is not valid UTF-8.
        """
        # Imported locally so the method works no matter which of the two
        # module names the running interpreter provides.
        try:
            import urllib2 as url_request  # Python 2
        except ImportError:
            import urllib.request as url_request  # Python 3

        url = "http://www.qiushibaike.com/hot/page/" + str(page)
        # Send a browser-like User-Agent instead of the library default.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'user-Agent': user_agent}
        req = url_request.Request(url, headers=headers)
        response = url_request.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        # The body arrives as bytes; decode to text before pattern matching.
        return self._ParseQiuShis(html.decode("utf-8"))

    def _ParseQiuShis(self, unicodeHtml):
        """Extract ``[title, text]`` lists from already-decoded page HTML."""
        return [
            [title.replace("\n", ""), text.replace("\n", "")]
            for title, text in self._ITEM_PATTERN.findall(unicodeHtml)
        ]

    @staticmethod
    def _ToText(value):
        """Return byte strings decoded as UTF-8; pass other values through."""
        if isinstance(value, bytes):
            # Undecodable bytes are replaced rather than aborting the output.
            return value.decode("utf-8", "replace")
        return value

    def ShowQiuShi(self, contents):
        """Print every post, numbered from 1.

        Args:
            contents: Iterable of ``[title, text]`` pairs, as returned by
                GetQiuShis().

        Each post produces the numbered heading with its title, a blank
        line, the text, and another blank line. Fields are converted to text
        first so that byte strings and unicode strings are never mixed
        inside one formatted line.
        """
        for count, content in enumerate(contents, 1):
            # The trailing "\n" plus print's own newline yields a blank line.
            print(u"第%d条糗事 %s \n" % (count, self._ToText(content[0])))
            print(u"%s \n" % self._ToText(content[1]))

    def Start(self):
        """Fetch and print pages ``self.page`` through ``PAGE_LIMIT - 1``."""
        for page in range(self.page, self.PAGE_LIMIT):
            print(u"第%d页:\n" % page)
            self.ShowQiuShi(self.GetQiuShis(str(page)))
# Script driver: build the scraper and run it right away. There is no
# ``__main__`` guard, so this also executes when the module is imported.
qiuShi = QiuShi()
# Start() downloads and prints listing pages 1 through 4 (needs network access).
qiuShi.Start()
	  
	  
	  


技术分享



[Python]爬取糗事百科