
scrapy notes

#coding=utf-8
import requests
from lxml import etree
import sys
import re
import csv

# Python 2 only: force the default encoding to utf-8 so Chinese text can be
# written to file without encoding it explicitly everywhere.
reload(sys)
sys.setdefaultencoding('utf-8')

# Corporate proxy with credentials embedded in the URL.
proxies = {'http': 'http://lwx351192:lyp-2016@proxyhk.huawei.com:8080',
           'https': 'https://lwx351192:lyp-2016@proxyhk.huawei.com:8080'}

url = 'http://tieba.baidu.com/p/3138733512?see_lz=1&pn=1'
# response = requests.get(url, proxies=proxies).content
# html = etree.HTML(response)
# page_num = html.xpath('//*[@id="thread_theme_5"]/div[1]/ul/li[2]/span[2]/text()')[0]
# print page_num
# title = html.xpath('//*[@id="j_core_title_wrap"]/h3/text()')[0]
# content = html.xpath('//*[@id="j_p_postlist"]/div/div[2]/div[1]/cc/div/text()')
'''
test:
f = open('baitu_tieba.txt', 'a')
for i in range(1, 6):
    url = 'http://tieba.baidu.com/p/3138733512?see_lz=1&pn=1'
    response = requests.get(url, proxies=proxies).content
    html = etree.HTML(response)
    # page_num = html.xpath('//*[@id="thread_theme_5"]/div[1]/ul/li[1]/span/text()')[0]
    # title = html.xpath('//*[@id="j_core_title_wrap"]/h3/text()')[0]
    content = html.xpath('//*[@id="j_p_postlist"]/div/div[2]/div[1]/cc/div/text()')

    f.writelines(content)
f.close()
'''
class bd_tb:
    def __init__(self, base_url):
        self.base_url = base_url

    # Fetch one page of the thread (landlord-only view), returning the raw HTML.
    def get_page(self, page_num):
        try:
            url = self.base_url + '&pn=' + str(page_num)
            response = requests.get(url, proxies=proxies).content
            return response
        except Exception, e:
            print e
            return None

    # Extract the thread title from the page.
    def get_title(self, page):
        # pattern = re.compile('<h3 class="core_title_tx.*?>(.*?)</h3>', re.S)
        # result = re.search(pattern, page)
        # print result.group(1)
        html = etree.HTML(page)
        title = html.xpath('//*[@id="j_core_title_wrap"]/h3/text()')[0]
        print title
        return title

    # Extract the total number of pages in the thread.
    def get_pagenum(self, page):
        # pattern = re.compile('<li class="l_reply_num".*?</span>.*?<span.*?>(.*?)</span>', re.S)
        # result = re.search(pattern, page)
        # print result.group(1)
        # return result.group(1).strip()
        html = etree.HTML(page)
        pagenum = html.xpath('//*[@id="thread_theme_5"]/div[1]/ul/li[2]/span[2]/text()')[0].strip()
        return pagenum

    # Extract the post bodies (the thread content).
    def get_content(self, page):
        # pattern = re.compile('<div class="p_content.*?<cc.*?<div id="post_content_.*?>(.*?)</div>')
        # pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
        # items = re.findall(pattern, page)
        # for item in items:
        #     print item
        html = etree.HTML(page)
        content = html.xpath('//*[@id="j_p_postlist"]/div/div[2]/div[1]/cc/div/text()')
        contents = []
        for item in content:
            contents.append(item.strip())
        return contents
    # Open the output file, named after the thread title.
    def set_file(self, title):
        if title is not None:
            self.file = open(title + '.txt', 'a')
        else:
            pass

    # Append every post to the output file.
    def write_data(self, contents):
        # self.file.writelines(contents)
        for item in contents:
            self.file.writelines(item)
    # Crawl page 1 for the title and page count, then write every page to file.
    def start(self):
        index_page = self.get_page(1)
        page_num = self.get_pagenum(index_page)
        title = self.get_title(index_page)
        self.set_file(title)
        try:
            print 'The thread has ' + str(page_num) + ' pages'
            for i in range(1, int(page_num) + 1):
                print 'Writing data of page ' + str(i)
                page = self.get_page(i)
                contents = self.get_content(page)
                self.write_data(contents)
        finally:
            self.file.close()
            print 'Done'

base_url = 'http://tieba.baidu.com/p/3138733512?see_lz=1'
bd = bd_tb(base_url)
bd.start()
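
For reference, since these notes are filed under scrapy but the script above sticks to requests + lxml on Python 2 (hence the reload(sys)/setdefaultencoding hack), here is a minimal sketch of the same landlord-only thread crawl written as a Scrapy spider for Python 3. The spider name and the yielded item fields are illustrative assumptions rather than anything from the original script; the XPath expressions are the same ones used above.

import scrapy


class TiebaLzSpider(scrapy.Spider):
    # Hypothetical spider name; not taken from the original script.
    name = 'tieba_lz'
    base_url = 'http://tieba.baidu.com/p/3138733512?see_lz=1'
    start_urls = [base_url + '&pn=1']

    def parse(self, response):
        # Same XPaths as the requests/lxml version above.
        title = response.xpath('//*[@id="j_core_title_wrap"]/h3/text()').get()
        posts = response.xpath('//*[@id="j_p_postlist"]/div/div[2]/div[1]/cc/div/text()').getall()
        for post in posts:
            text = post.strip()
            if text:
                yield {'title': title, 'content': text}

        # On page 1, read the page count and schedule the remaining pages.
        page_num = response.xpath('//*[@id="thread_theme_5"]/div[1]/ul/li[2]/span[2]/text()').get()
        if page_num and response.url.endswith('&pn=1'):
            for i in range(2, int(page_num.strip()) + 1):
                yield scrapy.Request(self.base_url + '&pn=' + str(i), callback=self.parse)

Saved as tieba_spider.py, the sketch could be run with scrapy runspider tieba_spider.py -o posts.json. The proxy used in the original script can be supplied through the http_proxy / https_proxy environment variables, or per request via request.meta['proxy']; both are handled by Scrapy's built-in HttpProxyMiddleware.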
