首页 > 代码库 > 抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面HTML代码变化)

抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面 HTML 代码变化)

#_*_coding:utf-8-*-import urllib2import tracebackimport codecsfrom BeautifulSoup import BeautifulSoupdef openSoup(url,code):    page = urllib2.urlopen(url)    soup = BeautifulSoup(page,fromEncoding=code)#,fromEncoding="gb2312"    #soup = BeautifulSoup(page,code)    return soupdef getContentFromDiv(contents):    s = ""    for content in contents:        try:            s += content        except:            pass        s = s.lstrip().rstrip()    if len(s) < 50:        return ""    else:        return "    "+s+"\r\n"+"\r\n"def readHtml(soup,fp,authname):    pageContent = ""    item = soup.find(name=div, attrs={class:bbs-content clearfix})    if item != None:        pageContent += getContentFromDiv(item.contents)    items = soup.findAll(name=div, attrs={class:atl-item})    for item in items:        userItem = item.find(name=a, attrs={class:js-vip-check})        if userItem == None or userItem.contents[0] != authname:            continue        contentItem = item.find(name=div, attrs={class:bbs-content})        pageContent += getContentFromDiv(contentItem.contents)        fp.write(pageContent)   def getNextPage(soup,pno):    nextlink = soup.find(name="a",attrs={"class":"js-keyboard-next"})    if nextlink != None:        return "http://bbs.tianya.cn"+nextlink["href"]    else:        return OVERdef getHtml(url,filename,authname):    p = 1    fp = codecs.open(filename,w,utf-8)    while True:        soup = openSoup(url,utf-8)        readHtml(soup,fp,authname)        url = getNextPage(soup,p+1)        if url == OVER :            break        print PAGE +str(p)+ OK        p = p + 1           print It\‘s Over    fp.close()if __name__ == __main__:    getHtml(http://bbs.tianya.cn/post-no05-143258-1.shtml,krzc.txt,u关河五十州)    #getHtml(‘http://bbs.tianya.cn/post-no05-143258-1036.shtml‘,‘krzc.txt‘,u‘关河五十州‘)