# 赶集网二手数据.py — Ganji.com (Beijing) second-hand goods crawler.
# Stage 1: collect all second-hand channel URLs.
# Stage 2: collect every item URL from each channel's listing pages.
# Stage 3: parse each item page and store the details in MongoDB.

#获取所有二手频道链接
import requests
from bs4 import BeautifulSoup

# Entry page that lists every second-hand goods channel on bj.ganji.com.
star_url = 'http://bj.ganji.com/wu/'
# Site root, used to turn relative channel hrefs into absolute URLs.
url_host = 'http://bj.ganji.com'
# Accumulates the absolute URL of every channel found by get_index_url().
page_url = []
def get_index_url(url):
    """Fetch *url* and collect the second-hand channel links into page_url.

    Returns the shared page_url list on success (HTTP 200); returns None
    when the request fails, leaving page_url untouched.
    """
    wb_data = requests.get(url)
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Channel anchors live in the category list: <dl class="fenlei"><dt><a href=...>
        links = soup.select('dl.fenlei > dt > a')
        for link in links:
            # hrefs are site-relative, so prefix the host to get absolute URLs.
            page_url.append(url_host + link.get('href'))
        return page_url
    # Non-200 responses are silently ignored (best-effort scrape).
    return None

# Populate page_url at import time so other modules can `import page_url`.
get_index_url(star_url)



#获取所有频道里面的子链接
from bs4 import BeautifulSoup
from multiprocessing import Pool
from channel_exciting import page_url
import requests
import pymongo

# MongoDB connection: database 'ganji' holds one collection of item URLs
# still to be crawled and one of fully parsed item details.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
url_list = ganji['url_list']
item_info = ganji['item_info']


# Minimal browser-like request headers so the site does not reject the crawler.
# Fix: the header name must be 'User-Agent' — 'user_agent' with an underscore
# is not a valid HTTP header name and would be ignored by the server.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
    'Connection': 'keep-alive'
}

def get_links_from(channel, page):
    """Scrape one paginated listing page of *channel* and store the item links.

    Listing pages are paginated as <channel>o<page>, e.g.
    http://bj.ganji.com/jiaju/o3 — each item link found there is stored
    (minus any query string) in the url_list Mongo collection.
    """
    list_url = '{}o{}'.format(channel, page)
    wb_data = requests.get(list_url, headers=headers)
    # Fix: check the status BEFORE parsing, not inside the link loop —
    # there is no point building a soup from an error page.
    if wb_data.status_code != 200:
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # NOTE(review): the original selector was garbled ('td a,t'); 'td a.t'
    # matches the item-title anchors on Ganji listing pages — confirm.
    for link in soup.select('td a.t'):
        # Drop tracking query strings so each item URL is stored canonically.
        item_link = link.get('href').split('?')[0]
        url_list.insert_one({'url': item_link})

def get_all_links(channel):
    """Walk listing pages 1 through 99 of *channel*, harvesting item links."""
    for page_no in range(1, 100):
        get_links_from(channel, page_no)

if __name__ == '__main__':
    # Fan the channel URLs out across a process pool; each worker crawls
    # every listing page of one channel. (Fix: the original shadowed the
    # builtin `list` and copied page_url with a manual loop.)
    channels = list(page_url)
    pool = Pool()
    pool.map(get_all_links, channels)


#获取所有子链接里面的数据
from multiprocessing import Pool
from page_parsing import url_list
from bs4 import BeautifulSoup
import requests
import pymongo
import time

# Second connection to the same MongoDB database; this stage only writes
# to the parsed-item collection.
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
item_info = ganji['item_info']

# Minimal browser-like request headers so the site does not reject the crawler.
# Fix: the header name must be 'User-Agent' — 'user_agent' with an underscore
# is not a valid HTTP header name and would be ignored by the server.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
    'Connection': 'keep-alive'
}


def get_items_info(url):
    """Fetch one item detail page and store its title/price/area in Mongo.

    Pages missing the expected price/area markup raise IndexError and are
    skipped silently (delisted or non-standard items). Sleeps 2s after each
    successful insert to avoid being rate-limited.
    """
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    try:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now > i')[0].text,
            # NOTE(review): 'palce_li' looks like a typo for 'place_li', but
            # it may match the site's actual class name — confirm on a live page.
            'area': soup.select('div.palce_li > span > i')[0].text,
            'url': url
        }
    except IndexError:
        # Expected markup missing — skip this item entirely.
        pass
    else:
        item_info.insert_one(data)
        print(data)
        # Throttle the crawl so the site does not ban the worker.
        time.sleep(2)


if __name__ == '__main__':
    # Pull every stored item URL from Mongo and parse the detail pages in
    # parallel. (Fix: the original shadowed the builtin `list` and built
    # the URL list with a manual append loop.)
    item_urls = [record['url'] for record in url_list.find()]
    pool = Pool()
    pool.map(get_items_info, item_urls)

 

赶集网二手数据.py