首页 > 代码库 > Python爬取全国历史天气数据

Python爬取全国历史天气数据

1、通过爬取历史首页,来获取城市地址和历史时间,构建链接;

技术分享
"""
Fetch the names and links of every city in the country.
"""

import requests
from lxml import etree
import random
import pymongo
from time_list import get_time

# Connect to the MongoDB server on localhost:27017 and select the database
# and collection this script writes to.
client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']

# Candidate browser User-Agent strings. One is picked at random when the
# script starts and is then reused for every request that passes `headers`.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {
    'User-Agent': random.choice(headers_data),
}

def get_cityname(url):
    """Scrape city names and their links from the page at ``url``.

    Args:
        url: Address of the page whose ``#tool_site`` list holds the
            city anchors.

    Returns:
        A list of dicts, one per city, each with the string keys
        ``'city_name'`` and ``'city_link'``.
    """
    city_response = requests.get(url, headers=headers)
    # Use the charset detected from the body so the names decode properly.
    city_response.encoding = city_response.apparent_encoding

    # Parse the document once and run both queries against the same tree.
    page = etree.HTML(city_response.text)
    city_names = page.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
    city_links = page.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')

    city_name_list = []
    for city_name, city_link in zip(city_names, city_links):
        # One-character anchor texts are skipped (presumably the A-Z index
        # letters rather than real cities -- confirm against the page).
        if len(city_name) == 1:
            continue
        city_name_list.append({
            'city_name': str(city_name),
            'city_link': str(city_link),
        })

    # Report completion before returning; a statement placed after the
    # return would never run.
    print('获取城市名称和链接结束...')
    return city_name_list

# Build the per-month history URL for every city and store each one in MongoDB.
url = 'http://lishi.tianqi.com/'

# get_time() takes no arguments and always requests the same page, so fetch
# the month links once instead of once per city.
time_links = get_time()

for link in get_cityname(url):
    city_url = link['city_link']
    for time_link in time_links:
        # File name of the month link without its extension,
        # e.g. '.../201701.html' -> '201701'.
        month = time_link.split('/')[-1].split('.')[0]
        # Substitute that month identifier for 'index' in the city link.
        time_url = city_url.replace('index', month)
        data = {
            'time_url': time_url,
            'city': link['city_name'],
        }
        print(data)
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document call.
        time_url_table.insert_one(data)
print('导入数据库存完成')
View Code
技术分享
import requests
from lxml import etree

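"""
Comparing a city's link with its historical-month links shows that the only
difference is that "index" in the city link is replaced by the corresponding
month, so building a month URL only requires swapping "index" for that month.
"""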

def get_time():
    """Return the ``href`` values listed on one fixed city history page.

    The page for ``acheng`` is requested as a sample; callers only use the
    file-name part of each link to obtain the month identifier.

    Returns:
        The list of ``href`` strings found in the ``#tool_site`` list.
    """
    url = 'http://lishi.tianqi.com/acheng/index.html'
    response = requests.get(url)
    time_lists = etree.HTML(response.text).xpath(
        '//*[@id="tool_site"]/div[2]/ul/li/a/@href'
    )
    return time_lists
View Code

2、从数据库中读取数据,爬取每个城市的历史天气数据;

技术分享
import requests
from lxml import etree
import random
import pymongo

# Connect to the MongoDB server on localhost:27017. `time_url_table` is read
# for the URLs to crawl and `tianqi_data_table` receives the scraped rows.
client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']
tianqi_data_table = tianqi_data['tianqi_data_table']

# Candidate browser User-Agent strings. One is picked at random when the
# script starts and is then reused for every request that passes `headers`.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {
    'User-Agent': random.choice(headers_data),
}

def get_tianqi_data():
    """Download every URL stored in ``time_url_table`` and save its rows.

    For each stored document the page at ``time_url`` is fetched, six
    parallel columns are extracted from the ``#tool_site`` list, and one
    document per zipped row is inserted into ``tianqi_data_table``.

    Returns:
        None. Results are written to MongoDB and echoed with ``print``.
    """
    for link in time_url_table.find():
        url = link['time_url']
        print(url)
        response = requests.get(url, headers=headers)

        # Parse the page once and reuse the tree for all six column queries.
        page = etree.HTML(response.text)
        dates = page.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
        # The [1:-1] slices drop the first and last text node of each column
        # (presumably a header cell and a trailing entry -- confirm on the page).
        max_temps = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[2]/text()')[1:-1]
        low_temps = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[3]/text()')[1:-1]
        weathers = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[4]/text()')[1:-1]
        fengxiangs = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[5]/text()')[1:-1]
        fenglis = page.xpath('//*[@id="tool_site"]/div[2]/ul/li[6]/text()')[1:-1]

        # NOTE(review): the stored rows carry no city field even though
        # link['city'] is available here -- confirm whether that is intended.
        rows = zip(dates, max_temps, low_temps, weathers, fengxiangs, fenglis)
        for date, max_temp, low_temp, weather, fengxiang, fengli in rows:
            data = {
                '日期': date,
                '最高温度': max_temp,
                '最低温度': low_temp,
                '天气': weather,
                '风向': fengxiang,
                '风力': fengli,
            }
            # Collection.insert() was removed in PyMongo 4; insert_one() is
            # the supported single-document call.
            tianqi_data_table.insert_one(data)
            print(data)
    print('爬取数据成功')
View Code

 

Python爬取全国历史天气数据