首页 > 代码库 > 用selenium爬取淘宝美食

用selenium爬取淘宝美食

技术分享
'''利用selenium爬取淘宝美食网页内容'''

import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *

# Headless PhantomJS browser shared by every function in this module;
# SERVICE_ARGS is supplied by the star-import of the config module.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Alternative driver, left disabled:
# driver = webdriver.Chrome()
# Explicit-wait helper bound to the driver with a 10 second timeout; the
# functions below catch TimeoutException around their wait.until(...) calls.
wait = WebDriverWait(driver, 10)

# Author's note: with this call the page content can be scraped; without it
# the waits end in TimeOut errors.
driver.set_window_size(1400,900)

def search():
    """Open Taobao, search for ``KEYWORD`` and scrape the first result page.

    Loads the Taobao home page, types ``KEYWORD`` into the search box,
    clicks the search button, waits for the pager's ``div.total`` element
    and then calls ``get_products()`` for the first page of results.

    Returns:
        str: The text of the pager's ``div.total`` element. ``main()``
        extracts the first run of digits from it as the page count.

    If any wait times out, ``TimeOut`` is printed and the whole search is
    retried by calling ``search()`` again; there is no retry limit.
    """
    print("正在搜索")
    try:
        driver.get("http://www.taobao.com")
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
            )
        )
        search_box.send_keys(KEYWORD)
        submit.click()
        # The pager only exists on the result page, so its presence shows
        # that the search results have loaded.
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
            )
        )
        get_products()
        return total.text
    except TimeoutException:
        print("TimeOut")
        return search()

def next_page(page_number):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's input box, clicks the pager's
    submit button, waits until the highlighted pager item shows that
    number, and then calls ``get_products()``.

    Args:
        page_number: The result page to load (``main()`` passes ints
            starting at 2).

    If any wait times out, ``TimeOut`` is printed and the same page is
    retried by calling ``next_page(page_number)`` again; there is no retry
    limit.
    """
    print("正在翻页", page_number)
    try:
        page_input = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
            )
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (
                    By.CSS_SELECTOR,
                    "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit",
                )
            )
        )
        # Clear first so the new number replaces whatever the box contains.
        page_input.clear()
        page_input.send_keys(page_number)
        submit.click()
        # Wait for the active pager item to show the requested number, so
        # the scrape below does not read the previous page.
        wait.until(
            EC.text_to_be_present_in_element(
                (
                    By.CSS_SELECTOR,
                    "#mainsrp-pager > div > div > div > ul > li.item.active > span",
                ),
                str(page_number),
            )
        )
        get_products()
    except TimeoutException:
        print("TimeOut")
        next_page(page_number)

def get_products():
    """Print one dict per product found on the current result page.

    Waits for at least one ``#mainsrp-itemlist .items .item`` element,
    parses the driver's current page source with pyquery, and prints a
    dict with the keys ``image``, ``price``, ``deal``, ``title``, ``shop``
    and ``location`` for every matching item.

    A ``TimeoutException`` from the wait is not handled here; it
    propagates to the caller.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    doc = pq(driver.page_source)
    for item in doc(item_selector).items():
        product = {
            "image": item.find(".pic .img").attr("src"),
            "price": item.find(".price").text(),
            # Drops the last three characters of the deal-count text --
            # presumably the "人付款" suffix; verify against the live page.
            "deal": item.find(".deal-cnt").text()[:-3],
            "title": item.find(".title").text(),
            "shop": item.find(".shop").text(),
            "location": item.find(".location").text(),
        }
        print(product)


def main():
    """Run the full scrape: the first result page, then every later page.

    Calls ``search()``, takes the first run of digits in its return value
    as the total page count, and calls ``next_page()`` for pages 2 through
    that count. Any exception is printed rather than propagated, and the
    browser is always shut down at the end.
    """
    try:
        total_text = search()
        page_count = int(re.search(r"(\d+)", total_text).group(1))
        for page_number in range(2, page_count + 1):
            next_page(page_number)
    except Exception as e:
        # Top-level boundary: report the failure and fall through to cleanup.
        print(e)
    finally:
        # quit() ends the whole driver session, including the browser
        # process; close() would only close the current window.
        driver.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
View Code

config文件

技术分享
# Command-line switches passed to PhantomJS as service_args: do not load
# images, and enable the disk cache.
SERVICE_ARGS = ["--load-images=false", "--disk-cache=true"]
# Search term typed into the Taobao search box.
KEYWORD = "美食"
View Code

 

用selenium爬取淘宝美食