首页 > 代码库 > 爬虫网站

爬虫网站

# -*- coding: utf-8 -*-
import re
import sys
import urllib2
def getlist():
    """Fetch the book's index page and return the chapter links found on it.

    Downloads ``http://www.quanshu.net/book/0/269/`` and scans the raw,
    undecoded response body for ``<li><a href=... title=...>...</a></li>``
    entries.

    Returns:
        A list of ``(href_suffix, link_text)`` tuples, one per matching
        anchor, in the order they appear in the page. ``href_suffix`` is the
        part of the ``href`` attribute that follows the
        ``http://www.mamicode.com/`` prefix. The list is empty when nothing
        matches.
    """
    response = urllib2.urlopen("http://www.quanshu.net/book/0/269/")
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # NOTE(review): the pattern only matches hrefs that start with
    # http://www.mamicode.com/, although the page is fetched from
    # quanshu.net -- confirm this prefix is really present in the index page.
    reg = re.compile(r'<li><a href="http://www.mamicode.com/(.*?)" title=".*?">(.*?)</a></li>')
    return reg.findall(html)
def getcontent(url):
    """Download one chapter page and return its body text encoded as UTF-8.

    Args:
        url: Chapter path as a string; it is appended to
            ``http://www.quanshu.net/book/0/269/`` to form the page address.

    Returns:
        The first span of the page located between
        ``</script>&nbsp;&nbsp;&nbsp;&nbsp`` and the next
        ``<script type="text/javascript">``, transcoded from GBK to UTF-8.
        HTML markup inside that span is returned untouched.

    Raises:
        IndexError: If the page contains no such span.
        UnicodeDecodeError: If the page body is not valid GBK.
    """
    response = urllib2.urlopen("http://www.quanshu.net/book/0/269/" + url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # decode('gbk') turns the GBK bytes into unicode text; encode('utf-8')
    # turns that text back into bytes, now UTF-8 encoded.
    html = html.decode('gbk').encode('utf-8')
    # re.S lets '.' match newlines so the capture can span several lines.
    # NOTE(review): the last "&nbsp" in the pattern has no ';', so when the
    # page writes "&nbsp;" the captured text starts with ';' -- confirm intent.
    reg = re.compile(r'</script>&nbsp;&nbsp;&nbsp;&nbsp(.*?)<script type="text/javascript">', re.S)
    match = reg.search(html)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with
        # enough context to tell which chapter failed.
        raise IndexError('chapter body not found in page %r' % (url,))
    return match.group(1)
# Download every chapter listed on the index page and save each one as
# "<link text>.txt" in the current working directory.
for href, title in getlist():
    content = getcontent(href)
    # Replace the page's paragraph separators with CRLF line breaks.
    content = content.replace('<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;', '\r\n')

    try:
        # 'w' = write, 'b' = binary: the UTF-8 bytes are written verbatim,
        # with no newline translation.
        with open(title + '.txt', 'wb') as f:
            f.write(content)
    except Exception as err:
        # Best effort: a chapter that cannot be saved must not abort the
        # whole run, but report it instead of failing silently.
        sys.stderr.write('skipping %r: %s\n' % (title, err))

爬虫网站