
Writing a Python Crawler to Scrape 58同城 Second-Hand Trading Data

I crawled roughly 140,000 listings, stored them in MongoDB, and visualized the statistics with the Charts library; a sample result is shown at the end.

Module 1: Build the list of category URLs

 

from bs4 import BeautifulSoup
import requests, pymongo

main_url = 'http://bj.58.com/sale.shtml'
client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
# NOTE: the original selector was lost to page damage; this is a plausible
# reconstruction of the sub-menu links on bj.58.com/sale.shtml
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

link_list = []
count = 0
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    #print(link)
    if link == 'http://bj.58.com/shoujihao/':    # skip phone-number listings
        pass
    elif link == 'http://bj.58.com/tongxunyw/':  # skip telecom services
        pass
    elif link == 'http://bj.58.com/tiaozao/':    # this link repeats; keep it only once
        count += 1
        if count == 1:
            data = {'link': link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    tab_link_list.insert(i)
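To verify what Module 1 stored, the link_list collection can simply be read back. A minimal check, assuming the same local MongoDB instance as above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
link_list = client['58tc']['link_list']
print(link_list.find({}).count(), 'category links saved')
for doc in link_list.find({}, {'link': 1, '_id': 0}):
    print(doc['link'])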
Module 2: Scrape the detail page of every listing

 

 

from bs4 import BeautifulSoup
import requests, re, pymongo, sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
# detail_link = tc_58['detail_link']
tab_link_list = tc_58['link_list']
# tc_58_data = client['58tcData']

def getDetailUrl(page_url, tab):
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # collect the detail-page URLs on one listing page
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')
    for url in detail_url:
        url_list.append(url.get('href').split('?')[0])
    # insert them into MongoDB, one collection per category
    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab + '_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link': i})
    return count

original_price_patt = re.compile('原价:(.+)')

def getInfo(detail_url):
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt, original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # seller rating: count the full-star and half-star icons
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        # NOTE: the original dict literal was mangled by extraction;
        # the field names below are reconstructed from the variables above
        data = {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url
        }
        return data
    except:
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None

# observed rule: each category shows at most 70 listing pages
def insertDetailLin(sub_menu_list):
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
    # to test a single category instead:
    # for i in [{'link': 'http://bj.58.com/shouji/'}]:
        i = i['link']
        sub_menu_name = re.findall(patt, i)[0]
        print(sub_menu_name + ': ', end='')
        url_list = []
        for j in range(1, 71):
            link = i + 'pn' + str(j)
            url_list.append(link)
        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list

#insertDetailLin(tab_link_list)

allMenCollectionName = tc_58.collection_names()
#allMenCollectionName.remove('detail_link')
allMenCollectionName.remove('link_list')

def insertData(tab_name):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]
    fenLei = tc_58_data[fenLei + '_data']
    tab_name = tc_58[tab_name]
    for i in tab_name.find({}, {'link': 1, '_id': 0}):
        data = getInfo(i['link'])
        # NOTE: reconstructed; the scraped record is presumably saved here
        if data:
            fenLei.insert(data)

def getContinuingly(fenlei):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenlei_data = tc_58_data[fenlei + '_data']
    fenlei_list = tc_58[fenlei + '_list']
    db_urls = [item['link'] for item in fenlei_data.find()]
    index_url = [item['link'] for item in fenlei_list.find()]
    # URLs still to crawl = indexed URLs minus already-scraped URLs
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x
    return list(rest_of_urls)

def startgetContinuingly(fenlei):
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenLei = tc_58_data[fenlei + '_data']
    rest_of_urls = getContinuingly(fenlei)
    #print(rest_of_urls)
    for i in rest_of_urls:
        data = getInfo(i)
        # NOTE: reconstructed; save the record the same way insertData does
        if data:
            fenLei.insert(data)
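Pool is imported but never used in the surviving text; presumably it drove insertData over all the *_list collections. A minimal sketch of that driver, assuming the URL-harvesting stage (insertDetailLin) has already been run so that allMenCollectionName holds the per-category collections:

if __name__ == '__main__':
    pool = Pool()
    # scrape every category's detail pages in parallel, one task per collection
    pool.map(insertData, allMenCollectionName)
    pool.close()
    pool.join()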

Module 3: Analysis

 

from collections import Counter
import pymongo, charts

def getTotalCount(database, host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    #print(tab_list)
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count

#getTotalCount('58tcDataNew')    #14700

def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    # keep only Beijing listings and strip the leading '北京-' prefix
    # (fixes the original, which hardcoded 'yueqi_data' instead of using classify)
    location_list = [i['location'][3:] for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count

# recursively merge the per-category area counters into one Counter
def myCounter(L, database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'
        dic_0[loc] = 0
    if not L:
        return Counter(dic_0)
    else:
        return Counter(L[0]) + myCounter(L[1:])

def getAllCount(database='58tcDataNew', host=None, port=None):
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic
    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)
    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count

dic_all_count = getAllCount()
# print(dic_all_count['bijiben_area_count'])
# print(dic_all_count['total_area_count'])

# build one chart series per district; the series/options dicts below are
# reconstructed (the originals were mangled), following the usual Charts format
tmp_list = []
for i in dic_all_count['total_area_count']:
    data = {
        'name': i,
        'data': [dic_all_count['total_area_count'][i]],
        'type': 'column'
    }
    tmp_list.append(data)

options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京二手市场分区域成交量'},  # placeholder title
    'legend': {'enabled': True}
}
charts.plot(tmp_list, show='inline', options=options)
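Besides the chart, the same aggregate can be inspected directly in the console. For example, the ten districts with the most listings, reusing dic_all_count from above:

top10 = Counter(dic_all_count['total_area_count']).most_common(10)
for area, cnt in top10:
    print(area, cnt)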
[Chart: number of listings by Beijing district]
