
A Simple Little Web Crawler in Python

#coding: utf-8

import requests, urllib2, urlparse, time, sys
from bs4 import BeautifulSoup

# Crawl Baidu search results.
#   wd = query keyword, pn = page number, rn = results per page (50 max)
# Usage:   getbaidu.py <keyword> <pages to crawl> <results per page>
# Example: getbaidu.py example 10 50
# Baidu quirk: rn only accepts round values such as 10, 20, 30.


type = sys.getfilesystemencoding()  # not used below

def geturl(wd, pn, rn):
    # geturl(keyword, pages to crawl, results per page)

    # Build the query URL.
    target = "http://www.baidu.com/s"
    i = pn                # remember the current page number
    pn = pn * rn - 1      # offset of the last result on the current page
    url = target + "?wd=" + wd + "&pn=" + str(pn) + "&rn=" + str(rn)

    # Fetch the page body and parse it.
    body = getbody(url)
    soup = BeautifulSoup(body, "html.parser")

    # Extract the result links.
    tags = soup.find_all("a", attrs={"class": "c-showurl"})
    links = []
    for tag in tags:
        href = tag.attrs["href"]
        # Baidu result links are redirects; a HEAD request exposes the real URL.
        # print urlparse.urlparse(requests.head(href).headers["Location"]).netloc
        href = requests.head(href).headers["Location"]
        links.append(href)
        print href

    # Stop on the last page, otherwise move on to the next one.
    if int(i) == 1:
        print "\r--------     Web spider is end     -------\r"
    else:
        print "\n#############     Next page, sleep 1 second    ##################            " + str(i) + "   \n"
        time.sleep(1)
        geturl(wd, int(i) - 1, rn)


def getbody(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return response.read()


if __name__ == "__main__":
    key = str(sys.argv[1])
    page_num = int(sys.argv[2])
    show_num = int(sys.argv[3])
    geturl(key, page_num, show_num)
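
The script above is Python 2 (urllib2 and print statements). As a point of reference, here is a minimal Python 3 sketch of the same idea, assuming requests and beautifulsoup4 are installed; Baidu's markup and the c-showurl class may well have changed since this was written, so treat the selector as an assumption.

# Minimal Python 3 sketch of the same crawler (assumptions: requests and
# beautifulsoup4 are installed, and Baidu still marks result links with
# the c-showurl class).
import sys
import time

import requests
from bs4 import BeautifulSoup


def get_urls(keyword, pages, per_page):
    """Walk the Baidu result pages for `keyword` and print the real URLs."""
    for page in range(pages):
        params = {"wd": keyword, "pn": page * per_page, "rn": per_page}
        body = requests.get("http://www.baidu.com/s", params=params).text
        soup = BeautifulSoup(body, "html.parser")

        for tag in soup.find_all("a", attrs={"class": "c-showurl"}):
            # Baidu result links are redirects; HEAD reveals the target.
            resp = requests.head(tag.attrs["href"], allow_redirects=False)
            print(resp.headers.get("Location", tag.attrs["href"]))

        time.sleep(1)  # be polite between pages


if __name__ == "__main__":
    get_urls(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))

It takes the same three command-line arguments as the original script.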

Paired with the sqlmap API, this should be great for batch SQL injection testing~
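
For anyone who wants to try that, here is a rough sketch of the glue code, not a finished tool. It assumes a local sqlmap API server started with python sqlmapapi.py -s (default 127.0.0.1:8775), and that links is the list of URLs collected by the crawler above; it uses sqlmap's REST endpoints /task/new, /scan/<taskid>/start, /scan/<taskid>/status and /scan/<taskid>/data.

# Rough sketch: push each crawled URL through a local sqlmap API server.
# Assumes `python sqlmapapi.py -s` is running on the default 127.0.0.1:8775
# and that `links` comes from the crawler above.
import time
import requests

API = "http://127.0.0.1:8775"

def scan(links):
    for link in links:
        # Create a new task, then start a scan of this URL.
        taskid = requests.get(API + "/task/new").json()["taskid"]
        requests.post(API + "/scan/" + taskid + "/start", json={"url": link})

        # Poll until sqlmap reports the scan has terminated.
        while requests.get(API + "/scan/" + taskid + "/status").json()["status"] != "terminated":
            time.sleep(5)

        # A non-empty "data" list means sqlmap found an injection point.
        data = requests.get(API + "/scan/" + taskid + "/data").json()["data"]
        print(link, "vulnerable" if data else "clean")

Polling the status endpoint like this is the simplest thing that works; a longer batch run would also want per-task timeouts and cleanup via /task/<taskid>/delete.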
