首页 > 代码库 > Python学习(2)

Python学习(2)


爬取网页的部分链接

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
# href values of every link the crawler has already discovered; getlink()
# consults and extends this set so each page is visited at most once.
pages = set()
def getlink(pageurl):
    """Crawl www.ftchinese.com depth-first, printing every new ``/m/`` link.

    Starting from ``pageurl`` (a path appended to the site's base URL), the
    page is downloaded and every ``<a>`` whose ``href`` begins with ``/m/``
    is examined.  Each href not yet in the module-level ``pages`` set is
    printed, recorded in ``pages`` and then crawled in turn before the
    remaining links of the current page are processed.

    Args:
        pageurl: Path relative to ``http://www.ftchinese.com``; the empty
            string requests the site root.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a page
            cannot be fetched.
    """
    # Compile once instead of on every fetched page.
    link_pattern = re.compile(r"^(/m/)")

    def fetch_links(path):
        """Return the matching href values found on the page at ``path``."""
        # The context manager closes the HTTP response once parsing is done.
        with urlopen('http://www.ftchinese.com' + path) as html:
            bs_data = BeautifulSoup(html, 'lxml')
        return [
            link.attrs['href']
            for link in bs_data.find_all('a', href=link_pattern)
            if 'href' in link.attrs
        ]

    # An explicit stack of link iterators replaces recursion so that a long
    # chain of pages cannot exceed the interpreter's recursion limit.  The
    # visit order is the same as the recursive depth-first traversal.
    pending = [iter(fetch_links(pageurl))]
    while pending:
        for newpage in pending[-1]:
            if newpage in pages:
                continue
            # We have found a new page: report it, remember it, and descend
            # into it before continuing with the current page's links.
            print(newpage)
            pages.add(newpage)
            pending.append(iter(fetch_links(newpage)))
            break
        else:
            # Every link on the current page has been handled.
            pending.pop()
# Start the crawl at the site root: the empty path is appended to the base URL.
getlink("")



Python学习(2)