首页 > 代码库 > 爬虫网站
爬虫网站
# -*- coding: utf-8 -*-
import urllib2
import re
def getlist():
    """Fetch the book's index page and extract its chapter links.

    Downloads ``http://www.quanshu.net/book/0/269/`` and scans the raw
    (undecoded) response body for ``<li><a href="..." title="...">`` items.

    Returns:
        A list of 2-tuples, one per match in page order: the part of the
        ``href`` that follows the fixed prefix in the pattern, and the link
        text. The list is empty when nothing matches.
    """
    response = urllib2.urlopen("http://www.quanshu.net/book/0/269/")
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails part-way through.
        response.close()
    # NOTE(review): the "http://www.mamicode.com/" prefix inside href looks
    # like an artifact of the site this snippet was copied from; the index
    # page probably uses relative hrefs, in which case this never matches.
    # Confirm against the live page before changing the pattern.
    reg = re.compile(r'<li><a href="http://www.mamicode.com/(.*?)" title=".*?">(.*?)</a></li>')
    return re.findall(reg, html)
def getcontent(url):
    """Download one chapter page and return its text as UTF-8 bytes.

    Args:
        url: String appended to the base URL
            ``http://www.quanshu.net/book/0/269/`` (it must be concatenated
            outside the quoted literal, not written inside it).

    Returns:
        The first span found between ``</script>`` followed by two spaces
        and the next ``<script type="text/javascript">`` tag, encoded as
        UTF-8.

    Raises:
        IndexError: If the page contains no such span.
    """
    response = urllib2.urlopen("http://www.quanshu.net/book/0/269/" + url)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails part-way through.
        response.close()
    # decode('gbk') turns the GBK bytes into unicode; encode('utf-8') turns
    # that unicode back into UTF-8 bytes.
    html = html.decode('gbk').encode('utf-8')
    # re.S makes "." match newlines too, so the capture can span lines.
    # NOTE(review): the two spaces after </script> may originally have been
    # HTML entities lost when the snippet was copied; confirm on a real page.
    reg = re.compile(r'</script>  (.*?)<script type="text/javascript">', re.S)
    content = re.findall(reg, html)[0]
    return content
# Download every chapter returned by getlist() and save each one as
# "<link text>.txt" in the current working directory.
for href, title in getlist():
    content = getcontent(href)
    # Replace the page's paragraph separator with a CRLF line break.
    content = content.replace('<br /><br /> ', '\r\n')
    try:
        # 'w' = writable, 'b' = binary: the already-encoded bytes are written
        # unchanged.
        with open(title + '.txt', 'wb') as f:
            f.write(content)
    except (IOError, OSError) as e:
        # Best effort: a chapter that cannot be written is skipped, but the
        # failure is reported instead of being silently discarded.
        print("skipped %s: %s" % (title, e))
爬虫网站