首页 > 代码库 > <双十一特辑> 模拟登录学校教务处爬取全校女生资料和头像

<双十一特辑> 模拟登录学校教务处爬取全校女生资料和头像

  1 #-*- coding=utf-8 -*- 
  2 import requests
  3 import re
  4 import json
  5 import time
  6 from PIL import Image
  7 import cStringIO
  8 import cookielib  
  9 import urllib
 10 import os
 11 import xlrd
 12 
 13 from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
 14 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 15 requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
 16 
 17 data=http://www.mamicode.com/xlrd.open_workbook(1.xlsx)
 18 table=data.sheet_by_name(uSheet1)
 19 
 20 message_url=https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT
 21 login_url=https://matrix.dean.swust.edu.cn/cas/login
 22 topic_url=‘‘
 23 flag=0
 24 
 25 student = {}
 26 student = {
 27     学号:‘‘,
 28     姓名:‘‘,
 29     性别:‘‘,
 30     生日:‘‘,
 31     pic:‘‘,
 32     民族:‘‘,
 33     行政班:‘‘,
 34     专业:‘‘,
 35     }
 36 
 37 headers={
 38 User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36
 39 }
 40 
 41 session=requests.Session()
 42 session.headers=headers    
 43 session.cookies = cookielib.LWPCookieJar(filename=cookies) 
 44 # try:  
 45 #     session.cookies.load(ignore_discard=True)  
 46 # except:  
 47 #     print u"未登陆过,需先登录"  
 48 
 49 
 50 def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"):  
 51     ‘‘‘‘‘_lt 是一个动态变化的参数‘‘‘  
 52     global session
 53     index_url =  url
 54     index_page = session.get(index_url,verify=False)  
 55     html = index_page.content  
 56     pattern = rname="lt" type="hidden" value="http://www.mamicode.com/(.*?)"  
 57     lt = re.findall(pattern, html)
 58     return lt[0]
 59 
 60 def login(username,password):
 61     global session
 62     global topic_url
 63     global flag
 64     data=http://www.mamicode.com/{
 65     lt:get_lt(),
 66     username:username,
 67     password:password,
 68     service:https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT,
 69     }
 70     loginurl=login_url
 71     try:
 72         login_page=session.post(loginurl,data=http://www.mamicode.com/data)
 73         login_code=login_page.content
 74         pattern=r<a class="btn btn-primary" href="http://www.mamicode.com/(.*?)"
 75         real_url=re.findall(pattern, login_code)
 76         topic_url=real_url[0]
 77         flag=1
 78     except:
 79         print error01
 80     session.cookies.save()
 81     
 82 
 83 def isLogin():  
 84     global session
 85     url = "https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT"  
 86     login_code = session.get(url, allow_redirects=False).status_code  
 87     if int(x=login_code) == 200:  
 88         return True  
 89     else:  
 90         return False 
 91 
 92 def get_message():
 93     global session
 94     global topic_url
 95     global message_url
 96     global student
 97 
 98     html=session.get(topic_url)
 99     html=session.get(message_url).text
100 
101     pattern_ming=r<td>(.*?)</td>
102     pattern_id=r<span class="number">(.*?)</span>
103     pattern_pic=r<td style="padding:0;" width="135" height="180" valign="middle" align="center" rowspan="6"><img width="135" height="180" align="middle" src="http://www.mamicode.com/(.*?)" /></td>
104     message_name=re.findall(pattern_ming, html)
105     message_pic=re.findall(pattern_pic, html)
106     try:
107         student[学号]=re.findall(r<span class="number">(\d*?)</span>, message_name[2])[0]
108     except:
109         pass
110 
111     student[姓名]=message_name[4]
112     student[性别]=message_name[6]
113     student[专业]=message_name[37]
114     #student[‘生日‘]=re.findall(r‘<span class="number">(.*?)</span>‘, message_name[8])[0]
115     #student[‘民族‘]=message_name[10]
116     student[行政班]=message_name[27]
117     student[pic]=https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/+student[学号]+.jpg
118 
119 
120 def download():
121     global student
122     global session
123     basepath=os.path.abspath(.)
124     savepath=os.path.join(basepath,student[专业])
125     if not os.path.exists(savepath):
126         os.mkdir(savepath)
127     try:
128         picpath=os.path.join(savepath,student[姓名]+student[学号]+.jpg)
129         r=session.get(student[pic])
130         with open(picpath, "wb") as pic:
131             pic.write(r.content)
132         print u>>>>>>>>>成功抓取>>>>>>>>>>>>>>>+student[姓名]
133     except Exception, e:
134         pass
135     
136 
137 if __name__ == __main__:
138     count=table.nrows
139     i=2
140     while(count>0):
141         if(table.col_values(3)[i]==u):
142             try:
143                 login(str(int(table.col_values(1)[i])), str(table.col_values(13)[i])[11:17])
144             except:
145                 pass
146         if(flag==1):
147             get_message()
148             download()
149             flag=0
150         count=count-1
151         i=i+1
152         session.cookies.clear()
总结:
python处理excel>>  http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
session释放>>    
http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-pytho
注明:
  1.xlsx为提供学生资料的excel
  异常处理之间的妥协关系需要事先计划好

<双十一特辑> 模拟登录学校教务处爬取全校女生资料和头像