[Python爬虫] 之四：Selenium 抓取微博数据

首页 > 代码库 > [Python爬虫] 之四：Selenium 抓取微博数据

[Python爬虫] 之四：Selenium 抓取微博数据

2024-09-10 05:58:17 216人阅读

技术分享

抓取代码：

# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import IniFile
class weibo:
    '''
    Scrape topic entries from a Weibo page with Selenium's IE driver and
    print them to stdout.

    The string delimiters in this listing are plain ASCII quotes; typographic
    quotes are not valid Python string delimiters.
    '''

    # Timestamp forms recognised in a topic header: "<today> HH:MM" or
    # "<M>month<D>day HH:MM" (written with the Chinese words in the pattern).
    _TIME_PATTERN = re.compile(r'今天\s{1}\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s{1}\d{2}:\d{2}')

    def __init__(self):
        '''
        Load settings from ``config.conf`` in the current working directory
        (through ``IniFile.ConfigFile``) and start an IE WebDriver session.
        '''
        # The path of IEDriverServer.exe comes from the config file.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")
        # Delay after fetching each page, in seconds; defaults to 5.
        # NOTE(review): nothing in this class reads pageDelay yet - confirm
        # whether CatchData's fixed sleep was meant to use it.
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def _parseTopic(self, topic):
        '''
        Split the text of one topic element into its printable fields.

        The text is parsed as ``<title>@<author> <time>`` followed by the
        separator character ``ñ`` and three space-separated counters
        (likes, comments, forwards). Any part that is missing yields an
        empty string instead of raising IndexError.

        :param topic: text of one topic element
        :return: dict with keys topic, author, time, likes, comments, forwards
        '''
        segments = topic.split('@')
        title = segments[0].replace('\n', '')
        tail = segments[1] if len(segments) > 1 else ''

        pieces = tail.split('ñ')
        author_time = pieces[0]
        counters = pieces[1] if len(pieces) > 1 else ''

        stamps = self._TIME_PATTERN.findall(author_time)
        # Pad so that a short counter list still unpacks into three values.
        likes, comments, forwards = (counters.split(' ') + ['', '', ''])[:3]

        return {
            'topic': title,
            'author': author_time.split(' ')[0],
            'time': stamps[0] if stamps else '',
            'likes': likes,
            'comments': comments,
            'forwards': forwards,
        }

    def printTop(self, topic):
        '''
        Print the fields of one topic element, one labelled line per field,
        followed by a blank separator line.

        :param topic: text of one topic element
        :return: None
        '''
        fields = self._parseTopic(topic)
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also parses under Python 3.
        print('话题：     %s' % fields['topic'])
        print('话题作者： @%s' % fields['author'])
        print('时间：     %s' % fields['time'])
        print('点赞：     %s' % fields['likes'])
        print('评论：     %s' % fields['comments'])
        print('转发：     %s' % fields['forwards'])
        print(' ')

    def CatchData(self, classname, firstUrl):
        '''
        Load ``firstUrl``, print the page title and every matched topic
        element, then shut the browser down and print the elapsed time.

        :param classname: XPath expression selecting the topic elements
                          (despite the name, it is passed to
                          find_elements_by_xpath)
        :param firstUrl: URL of the page to load
        :return: None
        '''
        # Wall-clock start; time.clock() measured CPU time on Unix and no
        # longer exists in Python 3.8+.
        start = time.time()
        try:
            wait = ui.WebDriverWait(self.driver, 10)
            # Load the first page.
            self.driver.get(firstUrl)
            # Print the page title.
            print(self.driver.title)
            # Fixed pause before polling for the elements.
            time.sleep(20)
            wait.until(lambda driver: driver.find_elements_by_xpath(classname))
            for element in self.driver.find_elements_by_xpath(classname):
                print(' ')
                self.printTop(element.text.encode('utf8'))
        finally:
            # Always end the session, even when loading or parsing fails;
            # quit() closes every window, so a separate close() is not needed.
            self.driver.quit()
        print(' ')
        print("整个过程用时间: %f 秒" % (time.time() - start))

# Manual test: scrape topic entries from the Weibo front page.
# Guarded so that importing this module does not launch a browser.
if __name__ == "__main__":
    obj = weibo()
    firstUrl = "http://weibo.com/?category=0"
    # The attribute value inside the XPath must be delimited by ASCII quotes;
    # typographic quotes make the expression invalid.
    obj.CatchData("//li[@class='pt_li pt_li_1 S_bg2']", firstUrl)

[Python爬虫] 之四：Selenium 抓取微博数据

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉/举报，工作人员会在5个工作日内联系你；一经查实，本站将立刻删除涉嫌侵权内容。

联系我们

首页 > 代码库 > [Python爬虫] 之四：Selenium 抓取微博数据

[Python爬虫] 之四：Selenium 抓取微博数据

看完仍有疑问？有类似问题直接问程序猿