首页 > 代码库 > [Python爬虫] 之四:Selenium 抓取微博数据

[Python爬虫] 之四:Selenium 抓取微博数据

技术分享

抓取代码:

# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import IniFile
class weibo:
    """Scrape topic entries from a Weibo page through Selenium's IE driver.

    The driver path and the optional per-page delay are read from
    ``config.conf`` in the current working directory.
    """

    # Post time as it appears in the element text: either "今天 HH:MM"
    # ("today") or "M月D日 HH:MM" (month/day). Compiled once for all calls.
    _TIME_PATTERN = re.compile(r'今天\s{1}\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s{1}\d{2}:\d{2}')

    def __init__(self):
        """Load the configuration and start the Internet Explorer driver."""
        # The IEDriverServer.exe path comes from the configuration file.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")
        # Delay after each scraped page, in seconds; defaults to 5.
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def printTop(self, topic):
        """Split one topic element's text into its fields and print them.

        The text is expected to have the shape
        ``<topic>@<author> <time>...ñ<likes> <comments> <reposts>``.
        Any part that is absent is printed as an empty value instead of
        raising, so one malformed element cannot abort a whole scrape.

        :param topic: text of a single topic element, as a native ``str``
        :return: None
        """
        items = topic.split('@')
        headline = items[0].replace('\n', '')
        print('话题: %s' % headline)

        # Everything between the first and second '@' describes the post.
        detail = items[1] if len(items) > 1 else ''
        # NOTE(review): 'ñ' is presumably the icon-font glyph rendered in
        # front of the counters - confirm against the live page markup.
        author_time_nums = detail.split('ñ')
        author_time = author_time_nums[0]
        nums = author_time_nums[1] if len(author_time_nums) > 1 else ''

        found = self._TIME_PATTERN.search(author_time)
        posted = found.group(0) if found else ''

        # Pad so that fewer than three counters never raises IndexError.
        likes, comments, reposts = (nums.split(' ') + ['', '', ''])[:3]

        print('话题作者: @%s' % author_time.split(' ')[0])
        print('时间: %s' % posted)
        print('点赞: %s' % likes)
        print('评论: %s' % comments)
        print('转发: %s' % reposts)
        print(' ')

    def CatchData(self, classname, firstUrl):
        """Load a page, print every matching topic, then shut the driver down.

        :param classname: XPath expression selecting the topic elements
        :param firstUrl: URL of the page to load
        :return: None
        :raises selenium.common.exceptions.TimeoutException: when no element
            matches ``classname`` within the 10 second wait
        """
        # Wall-clock timer: time.clock() reports CPU time on Unix and no
        # longer exists on Python 3.8+.
        start = time.time()
        try:
            wait = ui.WebDriverWait(self.driver, 10)
            # Load the first page.
            self.driver.get(firstUrl)
            # Print the page title.
            print(self.driver.title)
            time.sleep(20)
            # until() returns the first truthy result, i.e. the non-empty
            # element list, so no second lookup is needed.
            elements = wait.until(
                lambda driver: driver.find_elements_by_xpath(classname))
            for element in elements:
                print(' ')
                text = element.text
                if not isinstance(text, str):
                    # Python 2: unicode -> UTF-8 byte string, matching the
                    # byte-string literals used by printTop.
                    text = text.encode('utf8')
                self.printTop(text)
        finally:
            # Always end the browser session, even when scraping fails.
            self.driver.quit()
        end = time.time()
        print(' ')
        print("整个过程用时间: %f 秒" % (end - start))

# Test run: scrape Weibo data.
obj = weibo()
firstUrl = "http://weibo.com/?category=0"
# The attribute value must be delimited by plain ASCII apostrophes to be a
# valid XPath expression.
obj.CatchData("//li[@class='pt_li pt_li_1 S_bg2']", firstUrl)

 

[Python爬虫] 之四:Selenium 抓取微博数据