首页 > 代码库 > 用Python写爬虫爬取58同城二手交易数据
用Python写爬虫爬取58同城二手交易数据
爬了14W数据,存入Mongodb,用Charts库展示统计结果,下面展示一个统计结果的示意图
模块1 获取分类url列表
# -*- coding: utf-8 -*-
# Module 1: collect the category (sub-menu) URLs from the 58.com Beijing
# second-hand-goods index page and store them in MongoDB.
#
# NOTE(review): recovered from a badly garbled web scrape -- several
# assignments had been replaced by "mamicode.com" link artifacts and smart
# quotes are not valid Python quoting.  Control flow is faithful to what
# survived; lines marked TODO could not be recovered verbatim.
from bs4 import BeautifulSoup
import requests
import pymongo

main_url = 'http://bj.58.com/sale.shtml'

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
# Collection holding one {'link': url} document per category page.
tab_link_list = tc_58['link_list']

web_data = requests.get(main_url)
soup = BeautifulSoup(web_data.text, 'lxml')
# TODO(review): the original CSS selector line was destroyed in the scrape;
# 'ul.ym-submnu > li > b > a' is the conventional selector for this page's
# sub-menu -- confirm against the live page before relying on it.
sub_menu_link = soup.select('ul.ym-submnu > li > b > a')

count = 0
link_list = []
for link in sub_menu_link:
    link = 'http://bj.58.com' + link.get('href')
    if link == 'http://bj.58.com/shoujihao/':
        # Phone-number listings: not ordinary goods pages, skip.
        pass
    elif link == 'http://bj.58.com/tongxunyw/':
        # Telecom-services category: skip.
        pass
    elif link == 'http://bj.58.com/tiaozao/':
        # '/tiaozao/' appears several times in the menu; keep only the first.
        count += 1
        if count == 1:
            data = {'link': link}
            link_list.append(data)
    else:
        data = {'link': link}
        link_list.append(data)

for i in link_list:
    # NOTE(review): Collection.insert() is the legacy PyMongo API
    # (insert_one() in PyMongo >= 3); kept as the original wrote it.
    tab_link_list.insert(i)

# --- Module 2 below: fetch per-item detail info
#     (original section header: "模块2 获取每个商品详情信息") ---
# -*- coding: utf-8 -*-
# Module 2: for every category collected by module 1, walk its listing
# pages, collect per-item detail-page URLs, then scrape each detail page
# and store the parsed record in MongoDB.
#
# NOTE(review): recovered from a garbled scrape; dict literals that had
# been replaced by "mamicode.com" artifacts are reconstructed and marked
# with TODOs below.
from bs4 import BeautifulSoup
import requests
import re
import pymongo
import sys
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
tc_58 = client['58tc']
tab_link_list = tc_58['link_list']


def getDetailUrl(page_url, tab):
    """Scrape one listing page and insert every item's detail URL into the
    MongoDB collection ``<tab>_list``.

    page_url -- a listing page such as 'http://bj.58.com/shouji/pn3'
    tab      -- the category slug, e.g. 'shouji'
    Returns the number of URLs inserted.
    """
    url_list = []
    web_data = requests.get(page_url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Items posted by ordinary users carry an onclick tracking attribute.
    detail_url = soup.select('div.infocon > table > tbody > tr > td.t > a[onclick]')
    for url in detail_url:
        # Drop the tracking query string, keep the canonical URL.
        url_list.append(url.get('href').split('?')[0])

    count = 0
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tab_list = tc_58[tab + '_list']
    for i in url_list:
        count += 1
        tab_list.insert({'link': i})
    return count


# Matches the "原价:xxx" (original price) text on a detail page.
original_price_patt = re.compile('原价:(.+)')


def getInfo(detail_url):
    """Scrape a single item detail page.

    Returns a dict with the parsed fields, or None when any selector fails
    (the page layout varies; the original author deliberately swallowed all
    errors here and logged them to stdout).
    """
    try:
        web_data = requests.get(detail_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.title.text.strip()
        view_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')[0].text
        want_count = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.want_person')[0].text
        current_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
        current_price = current_price[0].text if current_price else None
        original_price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > b')
        original_price = original_price[0].text if original_price else None
        original_price = re.findall(original_price_patt, original_price) if original_price else None
        location = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        tag = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.biaoqian_li')
        tag = list(tag[0].stripped_strings) if tag else None
        seller_name = soup.select('body > div.content > div > div.box_right > div.personal.jieshao_div > div.personal_jieshao > p.personal_name')[0].text
        # Seller rating: count full / half star icons directly instead of
        # parsing the raw HTML line by line (the original's commented-out
        # approach did the latter).
        full_count = len(soup.find_all('span', class_='icon_png '))
        half_count = len(soup.find_all('span', class_='icon_png smallScore'))
        level_count = {'full': full_count, 'half': half_count}
        desc = soup.select('body > div.content > div > div.box_left > div:nth-of-type(3) > div > div > p')
        desc = desc[0].text if desc else None
        # TODO(review): the original dict literal was destroyed in the
        # scrape ("data = ...{>detail_url}"); reconstructed from the local
        # variables computed above -- key names are a best guess.
        data = {
            'title': title,
            'view_count': view_count,
            'want_count': want_count,
            'current_price': current_price,
            'original_price': original_price,
            'location': location,
            'tag': tag,
            'seller_name': seller_name,
            'level_count': level_count,
            'desc': desc,
            'link': detail_url,
        }
        return data
    except:
        # Deliberate best-effort scrape: log and move on.
        print(sys.exc_info()[0], sys.exc_info()[1])
        return None


def insertDetailLin(sub_menu_list):
    """For every category link in ``sub_menu_list``, scrape listing pages
    pn1..pn70 (58.com caps listings at 70 pages) and insert the detail URLs.

    Returns the names of the per-category collections that received data.
    """
    patt = re.compile('.+?com/([a-z]+)/')
    tab_list = []
    for i in sub_menu_list.find({}, {'link': 1, '_id': 0}):
        i = i['link']
        sub_menu_name = re.findall(patt, i)[0]
        print(sub_menu_name + ': ', end='')
        url_list = []
        for j in range(1, 71):
            link = i + 'pn' + str(j)
            url_list.append(link)
        cnt = 0
        for k in url_list:
            cnt = cnt + getDetailUrl(k, sub_menu_name)
        print(str(cnt) + ' lines inserted')
        if cnt != 0:
            tab_list.append(sub_menu_name + '_list')
    return tab_list


# All per-category "<slug>_list" collections; drop the index collection
# itself so only data collections remain.
allMenCollectionName = tc_58.collection_names()
allMenCollectionName.remove('link_list')


def insertData(tab_name):
    """Scrape every detail URL stored in collection ``tab_name`` and insert
    the parsed records into '58tcDataNew'.<slug>_data.

    tab_name -- a collection name ending in '_list' (5 chars stripped below).
    """
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenLei = tab_name[:-5]  # strip the '_list' suffix to get the slug
    fenLei = tc_58_data[fenLei + '_data']
    tab_name = tc_58[tab_name]
    for i in tab_name.find({}, {'link': 1, '_id': 0}):
        data = getInfo(i['link'])
        # TODO(review): the insert call was lost in the scrape; inserting
        # the scraped record (skipping failures) matches getContinuingly's
        # assumption that this collection holds {'link': ...} docs.
        if data:
            fenLei.insert(data)


def getContinuingly(fenlei):
    """Return the detail URLs of category ``fenlei`` that are listed in
    '<fenlei>_list' but have not yet been scraped into '<fenlei>_data'
    (supports resuming an interrupted crawl)."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58 = client['58tc']
    tc_58_data = client['58tcDataNew']
    fenlei_data = tc_58_data[fenlei + '_data']
    fenlei_list = tc_58[fenlei + '_list']
    db_urls = [item['link'] for item in fenlei_data.find()]
    index_url = [item['link'] for item in fenlei_list.find()]
    x = set(db_urls)
    y = set(index_url)
    rest_of_urls = y - x
    return list(rest_of_urls)


def startgetContinuingly(fenlei):
    """Resume the crawl for category ``fenlei``: scrape only the URLs that
    getContinuingly() reports as missing."""
    client = pymongo.MongoClient('localhost', 27017)
    tc_58_data = client['58tcDataNew']
    fenlei_data = tc_58_data[fenlei + '_data']
    rest_of_urls = getContinuingly(fenlei)
    for i in rest_of_urls:
        data = getInfo(i)
        # TODO(review): insert call lost in the scrape -- reconstructed to
        # mirror insertData().
        if data:
            fenlei_data.insert(data)

# --- Module 3 below: analysis
#     (original section header: "模块3 分析") ---
# -*- coding: utf-8 -*-
# Module 3: aggregate the scraped records in MongoDB ('58tcDataNew') by
# Beijing district and plot the per-area counts with the Charts library.
from collections import Counter
import pymongo
import charts


def getTotalCount(database, host=None, port=None):
    """Print and return the total number of documents across every
    collection of ``database`` (sanity check; the author measured 14700)."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    count = 0
    for i in tab_list:
        count = count + db[i].find({}).count()
    print(count)
    return count


def getAreaByClassify(classify, database='58tcDataNew', host=None, port=None):
    """Return {district: item_count} for one category.

    classify -- category slug, e.g. 'yueqi'; the '<classify>_data'
    collection is queried.

    BUGFIX(review): the original computed ``classify + '_data'`` but then
    queried the hard-coded collection 'yueqi_data' (an earlier commented-out
    revision hard-coded 'bijiben_data'), so every category returned the
    yueqi distribution.  Query db[classify] instead.
    """
    client = pymongo.MongoClient(host, port)
    db = client[database]
    classify = classify + '_data'
    # Locations look like '北京 - 朝阳'; keep only Beijing rows and strip
    # the '北京 - ' prefix (first 3 characters) to get the district name.
    location_list = [i['location'][3:]
                     for i in db[classify].find(filter={}, projection={'location': 1, '_id': 0})
                     if i['location'] != '' and i['location'][:2] == '北京' and i['location'][3:] != '']
    loc_name = list(set(location_list))
    dic_count = {}
    for i in loc_name:
        dic_count[i] = location_list.count(i)
    return dic_count


def myCounter(L, database='58tcDataNew', host=None, port=None):
    """Recursively sum a list of per-category {district: count} dicts into
    one Counter.  The base case returns a zero-valued Counter keyed like the
    per-category results so empty input still has the right shape."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_0 = {}
    for i in tab_list:
        loc = i[:-5] + '_area_count'  # strip '_data', tag as an area count
        dic_0[loc] = 0
    if not L:
        return Counter(dic_0)
    else:
        return Counter(L[0]) + myCounter(L[1:])


def getAllCount(database='58tcDataNew', host=None, port=None):
    """Compute the per-category district distributions plus a
    'total_area_count' entry summing all categories."""
    client = pymongo.MongoClient(host, port)
    db = client[database]
    tab_list = db.collection_names()
    dic_all_count = {}
    for i in tab_list:
        dic = getAreaByClassify(i[:-5])
        loc = i[:-5] + '_area_count'
        dic_all_count[loc] = dic
    dic_val = [dic_all_count[x] for x in dic_all_count]
    my = myCounter(dic_val)
    dic_all_count['total_area_count'] = dict(my)
    return dic_all_count


dic_all_count = getAllCount()

# Build one column series per district for the Charts plot.
tmp_list = []
for i in dic_all_count['total_area_count']:
    # TODO(review): the series dict was garbled in the scraped source;
    # this is the standard Charts column-series shape -- confirm.
    data = {
        'name': i,
        'data': [dic_all_count['total_area_count'][i]],
        'type': 'column',
    }
    tmp_list.append(data)

# TODO(review): the original options dict was destroyed in the scrape (only
# a trailing "True" survived); reconstructed minimally -- adjust titles and
# the surviving boolean flag as needed.
options = {
    'chart': {'zoomType': 'xy'},
    'title': {'text': '北京二手交易地区分布'},
    'legend': {'enabled': True},
}
charts.plot(tmp_list, show='inline', options=options)
用Python写爬虫爬取58同城二手交易数据
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。