首页 > 代码库 > python简单爬数据(这两个成功了)

python简单爬数据(这两个成功了)

这两个做得比较早,也比较幸运,成功做出来了,把代码扔在这里。

老师叫我从两个网站上弄点数据。数据不多,但是要按月份、小时、经度、纬度分别抓取,虽然不用排列组合,还是挺麻烦的。

人生苦短,我用Python

大半年前只看过语法,没有实践过;数据的网页也比较简单,正好拿来练练手。


代码里面已经包含了目标网址,就不具体介绍了,保存下来的是网页,还需要一个脚本来处理一下,这个比较简单也就不放了。

1

#!/usr/bin/env python3

import requests
import time
import sys


# -------- Configuration: choose which parameter to sweep.
# One of: 'hour', 'month', 'latitude', 'longitude'
sel = 'longitude'
# --------

# Form page of the IRI-2012 model; kept for reference only, never requested.
web_url = r'https://omniweb.gsfc.nasa.gov/vitmo/iri2012_vitmo.html'
# CGI endpoint the script POSTs the form data to.
request_url = r'https://omniweb.gsfc.nasa.gov/cgi/vitmo/vitmo_model.cgi'

# Network tuning: without a timeout a stalled connection would block forever,
# and without a delay a failing server would be hit in a tight loop.
REQUEST_TIMEOUT = 60     # seconds to wait for a single response
RETRY_DELAY = 2          # seconds to wait after the first failure
MAX_RETRY_DELAY = 60     # upper bound for the doubling retry delay

# Request headers copied from a browser session:
# POST /cgi/vitmo/vitmo_model.cgi HTTP/1.1
# 'Content-Length' is intentionally omitted: requests computes it from the
# encoded body, and the hand-copied value no longer matches once a swept
# field changes.  'br' is not advertised in Accept-Encoding because requests
# can only decode Brotli when an optional Brotli package is installed.
headers = {
    'Host': 'omniweb.gsfc.nasa.gov',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://omniweb.gsfc.nasa.gov/vitmo/iri2012_vitmo.html',
    'Cookie': '_ga=GA1.4.167527256.1494290145; _gid=GA1.4.2137494148.1494290145; _gat_GSA_ENOR0=1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Form fields sent with every request.  All values are strings so they are
# transmitted exactly as written (e.g. '01', '0.').  Fields marked "#" are
# the ones the author tweaks; they are overridden again further down.
payload = {
    'model': 'iri_2012',
    'year': '2016',         #
    'month': '12',          #
    'day': '01',            #
    'time_flag': '1',
    'hour': '8',            #
    'geo_flag': '0.',
    'latitude': '50.',      #
    'longitude': '40.',     #
    'height': '100.',       #
    'profile': '1',
    'start': '100.',        # profile start
    'stop': '1000.',        # profile stop
    'step': '50.',          # profile step size
    'sun_n': '',
    'ion_n': '',
    'radio_f': '',
    'radio_f81': '',
    'htec_max': '',
    'ne_top': '0.',
    'imap': '0.',
    'ffof2': '0.',
    'ib0': '2.',
    'probab': '0.',
    'fauroralb': '1.',
    'ffoE': '1.',
    'dreg': '0.',
    'tset': '0.',
    'icomp': '0.',
    'nmf2': '0.',
    'hmf2': '0.',
    'user_nme': '0.',
    'user_hme': '0.',
    'format': '0',
    # Output variables, labelled by the original author as Ne, Tn, Ti, Te:
    # electron density, neutral temperature, ion temperature,
    # electron temperature.
    'vars': ['17', '19', '20', '21'],
    'linestyle': 'solid',
    'charsize': '',
    'symbol': '2',
    'symsize': '',
    'yscale': 'Linear',
    'xscale': 'Linear',
    'imagex': '640',
    'imagey': '480',
}

# Per-run overrides of the defaults above; the swept field (sel) is replaced
# once more for every request.
payload['year'] = '2016'
payload['month'] = '12'
payload['day'] = '01'
payload['hour'] = '8'
payload['longitude'] = '120'
payload['latitude'] = '60'
payload['start'] = '60'
payload['stop'] = '1000'
payload['step'] = '1'

# Values visited for each sweepable parameter.
hours = range(1, 25)
months = range(1, 13)
latitudes = range(-90, 100, 10)
longitudes = range(0, 360, 10)
dic = {
    'hour': hours,
    'month': months,
    'latitude': latitudes,
    'longitude': longitudes,
}


def output_path(name):
    """Return the raw-output file path for sweep *name*, next to the script.

    The file is placed in ``sys.path[0]``.  ``os.path.join`` replaces the
    original hard-coded backslash so the path is also valid off Windows.
    """
    import os  # local import: keeps this block independent of the file header

    return os.path.join(sys.path[0], 'data_iri2012_raw_' + name + '.txt')


def section_header(item):
    """Return the marker line written before the response for *item*."""
    return '\n#===================== ' + str(item) + ' =====================\n'


def section_footer(item):
    """Return the marker line written after the response for *item*."""
    return '\n--------------------- ' + str(item) + ' ---------------------\n'


def fetch_page(form, item, count=0):
    """POST *form* to ``request_url``, retrying until a response arrives.

    Args:
        form: mapping of form fields to send.
        item: current sweep value, used only for the console banner.
        count: number of attempts made so far across the whole run.

    Returns:
        A ``(text, count)`` tuple: the response body and the updated attempt
        counter.

    Like the original script this never gives up on a data point, but it now
    waits between attempts (doubling up to ``MAX_RETRY_DELAY``) and bounds
    each attempt with ``REQUEST_TIMEOUT``.  Only network-level failures are
    retried; the HTTP status is not inspected, so whatever body the server
    returns is passed back unchanged.
    """
    delay = RETRY_DELAY
    while True:
        print('\n===================== ' + str(item) + ' =====================\n')
        count += 1
        print('count : ' + str(count))
        try:
            response = requests.post(request_url, data=form, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            time.sleep(delay)
            delay = min(delay * 2, MAX_RETRY_DELAY)
            continue
        return response.text, count


def main():
    """Sweep the parameter named by ``sel`` and save every raw response."""
    # Looked up before the file is created so an invalid sel (KeyError) does
    # not leave an empty output file behind.
    items = dic[sel]
    filepath = output_path(sel)
    print(filepath)

    form = dict(payload)  # work on a copy; the module-level defaults stay intact
    count = 0
    # The context manager closes the file even if the run is interrupted.
    with open(filepath, 'w', encoding='utf-8') as fid:
        for item in items:
            form[sel] = str(item)
            fid.write(section_header(item))
            text, count = fetch_page(form, item, count)
            fid.write(text)
            fid.write(section_footer(item))


if __name__ == '__main__':
    main()

2

#!/usr/bin/env python3

import requests
import time
import sys


# -------- Configuration: choose which parameter to sweep.
# One of: 'hour', 'month', 'latitude', 'longitude'
sel = 'longitude'
# --------

# Form page of the NRLMSISE-00 model; kept for reference only, never requested.
web_url = r'https://ccmc.gsfc.nasa.gov/modelweb/models/nrlmsise00.php'
# CGI endpoint the script POSTs the form data to.
request_url = r'https://ccmc.gsfc.nasa.gov/cgi-bin/modelweb/models/vitmo_model.cgi'

# Network tuning: without a timeout a stalled connection would block forever,
# and without a delay a failing server would be hit in a tight loop.
REQUEST_TIMEOUT = 60     # seconds to wait for a single response
RETRY_DELAY = 2          # seconds to wait after the first failure
MAX_RETRY_DELAY = 60     # upper bound for the doubling retry delay

# Request headers copied from a browser session:
# POST /cgi-bin/modelweb/models/vitmo_model.cgi HTTP/1.1
# 'Content-Length' is intentionally omitted: requests computes it from the
# encoded body, and the hand-copied value no longer matches once a swept
# field changes.  'br' is not advertised in Accept-Encoding because requests
# can only decode Brotli when an optional Brotli package is installed.
headers = {
    'Host': 'ccmc.gsfc.nasa.gov',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Referer': 'https://ccmc.gsfc.nasa.gov/modelweb/models/nrlmsise00.php',
    'Cookie': '__utma=35212851.490003371.1494462808.1494462808.1494462808.1; __utmb=35212851.12.10.1494462808; __utmc=35212851; __utmz=35212851.1494462808.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
}

# Form fields sent with every request.  All values are strings so they are
# transmitted exactly as written (e.g. '01', '08'); several of them are
# overridden again further down.
payload = {
    'model': 'nrlmsise',
    'year': '2016',
    'month': '12',
    'day': '01',
    'time_flag': '1',
    'hour': '8',
    'geo_flag': '0.',
    'latitude': '60',
    'longitude': '120',
    'height': '100.',
    'profile': '1',
    'start': '60.',
    'stop': '1000.',
    'step': '10.',
    'f10_7': '',
    'f10_7_3': '',
    'ap': '',
    'format': '0',
    # Output variables, labelled by the original author as O, N2, O2:
    # atomic oxygen, molecular nitrogen, molecular oxygen.
    'vars': ['08', '09', '10'],
    'linestyle': 'solid',
    'charsize': '1.0',
    'symbol': '2',
    'symsize': '1.0',
    'yscale': 'Lin',
    'xscale': 'Lin',
    'imagex': '640',
    'imagey': '480',
}

# Per-run overrides of the defaults above; the swept field (sel) is replaced
# once more for every request.
payload['year'] = '2016'
payload['month'] = '12'
payload['day'] = '01'
payload['hour'] = '8'
payload['longitude'] = '120'
payload['latitude'] = '60'
payload['start'] = '60'
payload['stop'] = '1000'
payload['step'] = '1'

# Values visited for each sweepable parameter.
hours = range(1, 25)
months = range(1, 13)
latitudes = range(-90, 100, 10)
longitudes = range(0, 360, 10)
dic = {
    'hour': hours,
    'month': months,
    'latitude': latitudes,
    'longitude': longitudes,
}


def output_path(name):
    """Return the raw-output file path for sweep *name*, next to the script.

    The file is placed in ``sys.path[0]``.  ``os.path.join`` replaces the
    original hard-coded backslash so the path is also valid off Windows.
    The file-name prefix is kept exactly as the original script spelled it.
    """
    import os  # local import: keeps this block independent of the file header

    return os.path.join(sys.path[0], 'data_nrmlsise_raw_' + name + '.txt')


def section_header(item):
    """Return the marker line written before the response for *item*."""
    return '\n#===================== ' + str(item) + ' =====================\n'


def section_footer(item):
    """Return the marker line written after the response for *item*."""
    return '\n--------------------- ' + str(item) + ' ---------------------\n'


def fetch_page(form, item, count=0):
    """POST *form* to ``request_url``, retrying until a response arrives.

    Args:
        form: mapping of form fields to send.
        item: current sweep value, used only for the console banner.
        count: number of attempts made so far across the whole run.

    Returns:
        A ``(text, count)`` tuple: the response body and the updated attempt
        counter.

    Like the original script this never gives up on a data point, but it now
    waits between attempts (doubling up to ``MAX_RETRY_DELAY``) and bounds
    each attempt with ``REQUEST_TIMEOUT``.  Only network-level failures are
    retried; the HTTP status is not inspected, so whatever body the server
    returns is passed back unchanged.
    """
    delay = RETRY_DELAY
    while True:
        print('\n===================== ' + str(item) + ' =====================\n')
        count += 1
        print('count : ' + str(count))
        try:
            response = requests.post(request_url, data=form, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            time.sleep(delay)
            delay = min(delay * 2, MAX_RETRY_DELAY)
            continue
        return response.text, count


def main():
    """Sweep the parameter named by ``sel`` and save every raw response."""
    # Looked up before the file is created so an invalid sel (KeyError) does
    # not leave an empty output file behind.
    items = dic[sel]
    filepath = output_path(sel)
    print(filepath)

    form = dict(payload)  # work on a copy; the module-level defaults stay intact
    count = 0
    # The context manager closes the file even if the run is interrupted.
    with open(filepath, 'w', encoding='utf-8') as fid:
        for item in items:
            form[sel] = str(item)
            fid.write(section_header(item))
            text, count = fetch_page(form, item, count)
            fid.write(text)
            fid.write(section_footer(item))


if __name__ == '__main__':
    main()

 

python简单爬数据(这两个成功了)