首页 > 代码库 > 人民法院重大事件抓取

人民法院重大事件抓取

时间:2017-8-3 23:30

Url:http://www.court.gov.cn

运行环境:Python 3.4 + MySQL(pymysql)+ Windows 7

import urllib.request
import re
import pymysql
from time import sleep
# Scrape the paginated listing at http://www.court.gov.cn/fabu-gengduo-15.html
# and store each entry's title, URL and date in the MySQL table
# PeopleCourt.lawcase.
#
# NOTE(review): the string literals and regular expressions below were
# reconstructed from a copy of this script whose quotes and regex backslashes
# had been stripped and whose href patterns had been rewritten to point at a
# different host. Confirm both patterns against the live page markup.

# MySQL server error code for "Table already exists".
_ER_TABLE_EXISTS = 1050

# Create the database on first run. IF NOT EXISTS turns reruns into a no-op,
# so real connection failures surface instead of being swallowed.
con = pymysql.connect(host='127.0.0.1', user='root', passwd='root')
try:
    con.query('create database if not exists PeopleCourt')
finally:
    con.close()
con = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='PeopleCourt')

try:
    con.query('create TABLE lawcase(title char(100),url char(100),time char(50))')
except pymysql.MySQLError as err:
    # Only an already-existing table is expected here; anything else is real.
    if err.args[0] != _ER_TABLE_EXISTS:
        raise
    print('Table existed')

site_root = 'http://www.court.gov.cn'
url_row = site_root + '/fabu-gengduo-15.html?page=1'
header = {'User-Agent': 'Mozilla/5.0'}


def _fetch(url):
    """Return the decoded body of *url*, requested with the shared headers."""
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req) as res:
        return res.read().decode()


# The "last" pager link carries the total number of pages.
# NOTE(review): assumes the pager uses a site-relative href -- verify.
reg_page = re.compile(
    r'<li class="last"><a href="/fabu-gengduo-15\.html\?page=(.*?)">'
).findall(_fetch(url_row))
if not reg_page:
    raise RuntimeError('could not find the last-page link in ' + url_row)
print('page:' + str(reg_page[0]))

# Compiled once, outside the page loop. Groups: title, href, date.
# NOTE(review): assumes item hrefs are site-relative (start with "/") -- verify.
reg_item = re.compile(
    r'<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>.*?<i class="date">(.*?)</i>',
    re.S,
)
# Placeholders let the driver escape the scraped text, so quotes in a title
# can neither break the statement nor inject SQL.
insert_sql = 'insert INTO lawcase(title,url,time) VALUES (%s,%s,%s)'

try:
    for page in range(1, int(reg_page[0]) + 1):
        print('Grab page:' + str(page))
        data = _fetch(site_root + '/fabu-gengduo-15.html?page=' + str(page))
        rows = [
            (item[0].replace('\n', ''), site_root + item[1], item[2])
            for item in reg_item.findall(data)
        ]
        if rows:
            with con.cursor() as cur:
                cur.executemany(insert_sql, rows)
            # pymysql does not autocommit by default; persist each page.
            con.commit()
        # Pause between pages to stay polite to the server.
        sleep(2)
finally:
    con.close()
print('Ok')

数据库截图:

技术分享

 

人民法院重大事件抓取