首页 > 代码库 > 使用python抓取并分析北京链家地产二手房信息

使用python抓取并分析北京链家地产二手房信息

 

 

 

  1 import requests
  2 import time
  3 from bs4 import BeautifulSoup
  4 
  5 #设置列表页URL的固定部分
  6 url=http://bj.lianjia.com/ershoufang/
  7 #设置页面页的可变部分
  8 page=(pg)
  9 
 10 #设置请求头部信息
 11 headers = {User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11,
 12 Accept:text/html;q=0.9,*/*;q=0.8,
 13 Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3,
 14 Accept-Encoding:gzip,
 15 Connection:close,
 16 Referer:http://www.baidu.com/link?url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&eqid=c3435a7d00006bd600000003582bfd1f
 17 }
 18 
 19 #循环抓取列表页信息
 20 for i in range(1,10):
 21     if i == 1:
 22         i=str(i)
 23         a=(url+page+i+/)
 24         r=requests.get(url=a,headers=headers)
 25         html=r.content
 26     else:
 27         i=str(i)
 28         a=(url+page+i+/)
 29         r=requests.get(url=a,headers=headers)
 30         html2=r.content
 31         html = html + html2
 32 #每次间隔0.5秒
 33         time.sleep(0.5)
 34 
 35 #解析抓取的页面内容
 36 lj=BeautifulSoup(html,html.parser)
 37 
 38 #提取房源总价
 39 price=lj.find_all(div,attrs={class:priceInfo})
 40 tp=[]
 41 for a in price:
 42     totalPrice=a.span.string
 43     tp.append(totalPrice)
 44 
 45 #提取房源信息
 46     houseInfo=lj.find_all(div,attrs={class:houseInfo})
 47     hi=[]
 48 for b in houseInfo:
 49     house=b.get_text()
 50     hi.append(house)
 51 
 52 #提取房源关注度
 53     followInfo=lj.find_all(div,attrs={class:followInfo})
 54     fi=[]
 55 for c in followInfo:
 56     follow=c.get_text()
 57     fi.append(follow)
 58 
 59 #导入pandas库
 60 import pandas as pd
 61 #创建数据表
 62 house=pd.DataFrame({totalprice:tp,houseinfo:hi,followinfo:fi})
 63 #查看数据表的内容
 64 house.head()
 65 
 66 #对房源信息进行分列
 67 houseinfo_split = pd.DataFrame((x.split(|) for x in house.houseinfo),index=house.index,columns=[xiaoqu,huxing,mianji,chaoxiang,zhuangxiu,dianti])
 68 
 69 #查看分列结果
 70 houseinfo_split.head()
 71 
 72 #将分列结果拼接回原数据表
 73 house=pd.merge(house,houseinfo_split,right_index=True, left_index=True)
 74 #完成拼接后的数据表中既包含了原有字段,也包含了分列后的新增字段。
 75 #查看拼接后的数据表
 76 house.head()
 77 
 78 #对房源关注度进行分列
 79 followinfo_split = pd.DataFrame((x.split(/) for x in house.followinfo),index=house.index,columns=[guanzhu,daikan,fabu])
 80 #将分列后的关注度信息拼接回原数据表
 81 house=pd.merge(house,followinfo_split,right_index=True, left_index=True)
 82 
 83 #按房源户型类别进行汇总
 84 huxing=house.groupby(huxing)[huxing].agg(len)
 85 #查看户型汇总结果
 86 huxing
 87 
 88 #导入图表库
 89 import matplotlib.pyplot as plt
 90 #导入数值计算库
 91 import numpy as np
 92 
 93 #用len函数计算出huxing的长度
 94 l = len(huxing)
 95 # 定义一个hx空数组
 96 hx=[]
 97 for i in range(1,len(huxing)+1):
 98 
 99     hx.append(i)
100 
101 #绘制房源户型分布条形图
102 plt.rc(font, family=STXihei, size=11)
103 a=np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
104 plt.barh(hx,huxing,color=#052B6C,alpha=0.8,align=center,edgecolor=white)
105 plt.ylabel(户型)
106 plt.xlabel(数量)
107 plt.xlim(0,1300)
108 plt.ylim(0,20)
109 plt.title(房源户型分布情况)
110 plt.legend([数量], loc=upper right)
111 plt.grid(color=#95a5a6,linestyle=--, linewidth=1,axis=y,alpha=0.4)
112 plt.yticks(a,(1室0厅,1室1厅,1室2厅,2室0厅,2室1厅,2室2厅,3室0厅,3室1厅,3室2厅,3室3厅,4室1厅,4室2厅,4室3厅,5室2厅,5室3厅,6室1厅,6室2厅,7室2厅,7室3厅))
113 plt.show()
114 
# Split the area field into its leading number and the remaining unit text.
# NOTE(review): a plain whitespace split only works when the number and unit
# are separated by a space; this pattern accepts the number with or without
# whitespace before the unit - confirm against the live page text.
# Rows that do not start with a number get NaN in both columns.
mianji_num_split = house['mianji'].str.extract(
    r'^\s*(?P<mianji_num>\d+(?:\.\d+)?)\s*(?P<mi>.*?)\s*$'
)
# Join the split area columns back onto the main table by row index.
house = pd.merge(house, mianji_num_split, right_index=True, left_index=True)

# Convert the area number to float (the pattern already excludes whitespace).
house['mianji_num'] = house['mianji_num'].astype(float)

# Range of listing areas (displays only in an interactive session).
house['mianji_num'].min(), house['mianji_num'].max()
# Output recorded with the original article: (18.850000000000001, 332.63)
129 
130 
# Bucket listing areas into fixed-width bins.
bins = [0, 50, 100, 150, 200, 250, 300, 350]
area_labels = ['小于50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350']
# include_lowest=True keeps a value equal to the first edge in the first bin.
# Values above the last edge (350) fall outside every bin and become NaN.
house['group_mianji'] = pd.cut(house['mianji_num'], bins, labels=area_labels, include_lowest=True)

# Count listings per area bucket. Reindexing against the full label list
# keeps empty buckets as 0, so there is always one count per label below.
group_mianji = house['group_mianji'].value_counts().reindex(area_labels, fill_value=0)

# Draw the horizontal bar chart of listings per area bucket.
plt.rc('font', family='STXihei', size=15)
a = np.array([1, 2, 3, 4, 5, 6, 7])
plt.barh([1, 2, 3, 4, 5, 6, 7], group_mianji, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.ylabel('面积分组')
plt.xlabel('数量')
plt.title('房源面积分布')
plt.legend(['数量'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, area_labels)
plt.show()
150 
# Split the follower field into its leading number and the remaining text.
# NOTE(review): a plain whitespace split only works when the number is
# followed by a space; this pattern accepts the number with or without
# whitespace after it - confirm against the live page text.
# Rows that do not start with a number get NaN in both columns.
guanzhu_num_split = house['guanzhu'].str.extract(
    r'^\s*(?P<guanzhu_num>\d+(?:\.\d+)?)\s*(?P<ren>.*?)\s*$'
)
# Join the split follower columns back onto the main table by row index.
house = pd.merge(house, guanzhu_num_split, right_index=True, left_index=True)
# Convert follower count and total price to float. No separate strip step is
# needed: the pattern excludes surrounding whitespace from the number.
house[['guanzhu_num', 'totalprice']] = house[['guanzhu_num', 'totalprice']].astype(float)

# Range of follower counts (displays only in an interactive session).
house['guanzhu_num'].min(), house['guanzhu_num'].max()
# Output recorded with the original article: (0.0, 725.0)
163 
# Bucket follower counts into fixed-width bins.
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800]
follow_labels = ['小于100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800']
# include_lowest=True is required: the intervals are right-closed, so without
# it a listing with exactly 0 followers falls outside every bin and is lost.
# Values above the last edge (800) still fall outside every bin.
house['group_guanzhu'] = pd.cut(house['guanzhu_num'], bins, labels=follow_labels, include_lowest=True)
# Count listings per bucket. Reindexing against the full label list keeps
# empty buckets as 0, so there is always one count per label below.
group_guanzhu = house['group_guanzhu'].value_counts().reindex(follow_labels, fill_value=0)

# Draw the horizontal bar chart of listings per follower bucket.
plt.rc('font', family='STXihei', size=15)
a = np.array([1, 2, 3, 4, 5, 6, 7, 8])
plt.barh([1, 2, 3, 4, 5, 6, 7, 8], group_guanzhu, color='#052B6C', alpha=0.8, align='center', edgecolor='white')
plt.ylabel('关注度分组')
plt.xlabel('数量')
plt.xlim(0, 3000)
plt.title('房源关注度分布')
plt.legend(['数量'], loc='upper right')
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.4)
plt.yticks(a, follow_labels)
plt.show()
182 
# Import KMeans from scikit-learn for the cluster analysis.
from sklearn.cluster import KMeans

# Cluster on three fields: total price, area and follower count.
# Rows with a missing value in any of the three are left out of the fit.
cluster_features = house[['totalprice', 'mianji_num', 'guanzhu_num']].dropna()
house_type = np.array(cluster_features)
# Use 3 centroids.
clf = KMeans(n_clusters=3)
# Fit the model.
clf = clf.fit(house_type)

# Cluster centre coordinates (displays only in an interactive session).
clf.cluster_centers_
# Output recorded with the original article:
# array([[ 772.97477064, 112.02389908, 58.96330275],
#        [ 434.51073861, 84.92950236, 61.20115244],
#        [ 1473.26719577, 170.65402116, 43.32275132]])

# Attach each row's cluster label to the main table, aligned by row index.
# Rows that were excluded from the fit get NaN.
house['label'] = pd.Series(clf.labels_, index=cluster_features.index)

 

使用python抓取并分析北京链家地产二手房信息