文本分类
2024-09-14 07:26:21 217人阅读
#-*- coding:utf-8 -*-
import re
import requests
class Spider:
#页面初始化
def __init__(self):
self.url = ‘http://gz.meituan.com/category/meishi?mtt=1.index%2Ffloornew.nc.1.irj38puy‘
def get_class_index(self):
    """Fetch the category index from ``self.url`` and save it to disk.

    Downloads the page, isolates the markup that follows the "分类:"
    (category) label, and extracts every ``(link, name)`` pair from the
    ``<li>`` entries inside it. Entries are numbered from 1 and written as
    ``number,link,name`` lines to ``mt_class_index.txt`` (UTF-8) in the
    current working directory.

    Returns:
        A list of ``(number, link, name)`` tuples, where ``number`` is the
        1-based position rendered as a string. The list is empty, and no
        file is written, when the category block is not found in the page.
    """
    import io  # local import: only needed for the encoding-aware open()

    response = requests.get(self.url)

    # Match against the decoded text, not UTF-8 bytes: a text pattern cannot
    # be applied to a bytes object on Python 3. The u'' literals keep the
    # non-ASCII pattern usable on Python 2 as well; neither pattern contains
    # a backslash, so a raw string is not required.
    block_pattern = re.compile(
        u'<div class="label has-icon">分类:</div>(.*?)</div>', re.S)
    blocks = block_pattern.findall(response.text)
    if not blocks:
        # Page layout changed or the request was served a different page.
        return []

    # NOTE(review): the "http://www.mamicode.com/" prefix looks unrelated to
    # the site being fetched -- confirm this pattern against the live markup.
    entry_pattern = re.compile(
        u'<li.*?href="http://www.mamicode.com/(.*?)">(.*?)</a></li>', re.S)
    entries = entry_pattern.findall(blocks[0])

    result = [(str(number), link, name)
              for number, (link, name) in enumerate(entries, 1)]

    # The context manager closes the file even if a write fails.
    with io.open('mt_class_index.txt', 'w', encoding='utf-8') as index_file:
        for number, link, name in result:
            index_file.write(u'{0},{1},{2}\n'.format(number, link, name))

    return result
def getEverryClass(self,link):
r = requests.get(link)
pattern1 = re.compile(r‘<div class="paginator-wrapper">(.*?)</div>‘,re.S)
items = re.findall(pattern1,r.text.encode(‘UTF-8‘))
pattern2 = re.compile(r‘<li.*?href="http://www.mamicode.com/(.*?)".*?</li>‘,re.S)
pattern3 = re.compile(r‘<i class="icon icon-shangjia">.*?<a class="link f3 J-mtad-link".*?target="_blank">(.*?)</a>‘