首页 > 代码库 > 大学排名爬取

大学排名爬取

逻辑思路是什么?

  1. 获取页面

  2. 处理页面,提取信息

  3. 格式输出

先走面向过程编程:

  1. 要定义3个函数,对应以上三个过程

  2. 在 if __name__ == '__main__': 入口中传入参数,并依次执行以上三个过程

 

#!/usr/bin/python3
import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or an error HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: an empty string means "nothing to parse".
        return ""
    # Decode with the encoding detected from the body instead of the
    # one taken from the response headers.
    r.encoding = r.apparent_encoding
    return r.text


def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append its rows to *ulist*.

    For every ``<tr>`` tag directly under the first ``<tbody>``, a list
    ``[cell 0, cell 1, cell 3]`` (the ``.string`` of each ``<td>``) is
    appended to *ulist*. The list is modified in place; nothing is
    returned.

    Args:
        ulist: List that receives one three-item list per table row.
        html: Page markup; may be empty when the download failed.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page: leave ulist untouched instead of
        # failing on None.children.
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # Not a full data row; index 3 would be out of range.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])


def printUnivList(ulist, num):
    """Print a header line followed by the first *num* rows of *ulist*.

    Args:
        ulist: Sequence of rows; items 0, 1 and 2 of each row are
            printed as rank, school name and total score.
        num: Maximum number of rows to print. When *ulist* holds fewer
            rows, only the available ones are printed.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # U+3000 (full-width space) is the fill character for the name
    # column.
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    # Slicing (instead of indexing range(num)) tolerates short lists;
    # max() keeps a negative num printing nothing, like range() would.
    for u in ulist[:max(num, 0)]:
        # A cell may be None (e.g. a tag without a single string child);
        # None does not accept the "^10" format spec, so show it blank.
        rank, name, score = ("" if v is None else str(v) for v in u[:3])
        print(tplt.format(rank, name, score, fill))


# Script entry point: fetch the 2016 ranking page, parse it, print rows.
if __name__ == '__main__':
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)  # print the top 20 universities

如何走向面向对象?

  1. 输入: url 和需要获取的信息条数

  2. 输出: 格式化信息

  3. 获取页面和处理页面应作为私有方法,不对外暴露

#!/usr/bin/python3
import requests
import bs4
from bs4 import BeautifulSoup


class SchoolMessage(object):
    """Scrape a university ranking page and print its first rows.

    Args:
        url: Address of the ranking page.
        number: Maximum number of ranking rows to print.
    """

    def __init__(self, url, number):
        self.url = url
        self.number = number

    def __get_html(self):
        """Return the page text, or None when the request fails."""
        try:
            r = requests.get(self.url, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            return None
        # Decode with the encoding detected from the body instead of
        # the one taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text

    def __get_message(self):
        """Return the parsed rows, or None when nothing could be parsed.

        Each row is ``[cell 0, cell 1, cell 3]`` taken from the ``<td>``
        tags of a ``<tr>`` directly under the first ``<tbody>``.
        """
        html = self.__get_html()
        if html is None:
            return None
        soup = BeautifulSoup(html, 'html.parser')
        tbody = soup.find('tbody')
        if tbody is None:
            # Page fetched but no ranking table found: report failure.
            return None
        info = []
        for row in tbody.children:
            # .children also yields whitespace text nodes between rows.
            if not isinstance(row, bs4.element.Tag):
                continue
            tds = row('td')
            if len(tds) < 4:
                continue
            # NOTE(review): cell 3 is read for the "总分" column to agree
            # with fillUnivList in the procedural script; this class
            # originally read cell 2 - confirm against the page layout.
            info.append([tds[0].string, tds[1].string, tds[3].string])
        return info

    def get_message(self):
        """Print a header and up to ``self.number`` ranking rows.

        Prints '爬取失败' instead when the page could not be fetched or
        parsed.
        """
        info = self.__get_message()
        if info is None:
            print('爬取失败')
            return
        temp = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        # U+3000 (full-width space) is the fill for the name column.
        fill = chr(12288)
        print(temp.format("排名", "学校名称", "总分", fill))
        # Slice rather than index so a short table does not raise.
        for u in info[:max(self.number, 0)]:
            # None cells do not accept the "^10" spec; show them blank.
            rank, name, score = ("" if v is None else str(v) for v in u[:3])
            print(temp.format(rank, name, score, fill))

# Script entry point: print the top 10 rows of the 2016 ranking page.
if __name__ == '__main__':
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    school_1 = SchoolMessage(url, 10)
    school_1.get_message()

 

大学排名爬取