首页 > 代码库 > 一个爬取百度图库程序

一个爬取百度图库程序

学习 Python 有一段时间了,这几天想写一个爬取百度图片的小爬虫。
代码
from selenium import webdriver
import urllib,re
import time
import urllib2
import sys
import os
import socket
import threading
# Give every socket created from here on a 15-second default timeout, so
# blocking network calls raise a timeout error instead of waiting forever.
socket.setdefaulttimeout(15.0)
def mkdir(name):
    """Create the directory *name* unless something already exists there.

    Args:
        name: Path of the directory that will hold the downloaded files.

    Raises:
        OSError: If the directory cannot be created for any reason other
            than it having been created concurrently.
    """
    if os.path.exists(name):
        return
    try:
        os.mkdir(name)
    except OSError:
        # Someone else may have created the directory between the existence
        # check and the mkdir call; that outcome is what we wanted anyway.
        if not os.path.isdir(name):
            raise
def get_html(name, papg):
    """Fetch one Baidu image-search result page through selenium + PhantomJS.

    Args:
        name: Search keyword; it is URL-quoted before being placed in the
            ``word`` query parameter.
        papg: Value placed in the ``pn`` query parameter (the result offset
            used by the caller for paging).

    Returns:
        The rendered page source, or None if anything went wrong while
        quoting the keyword, starting the browser or loading the page.
    """
    try:
        name = urllib.quote(name)
        driver = webdriver.PhantomJS()
        try:
            driver.get(
                'https://image.baidu.com/search/index?tn=baiduimage&word={}&pn={}'.format(name, papg)
            )
            return driver.page_source
        finally:
            # Always shut the headless browser down, even when loading the
            # page failed, so no PhantomJS process is left behind.
            driver.quit()
    except Exception:
        # Best effort: the caller treats None as "no page available".
        return None
def req(html):
    """Extract the original-image URLs from a Baidu result page.

    The URLs are taken from the ``data-objurl="..."`` attributes; both
    ``http://`` and ``https://`` values are accepted.

    Args:
        html: Page source as text, or None when the page could not be
            fetched.

    Returns:
        A list of URL strings (possibly empty), or None when *html* is None
        or is not text.
    """
    if html is None:
        return None
    pattern = r'data-objurl="(https?://.*?)"'
    try:
        return re.findall(pattern, html)
    except TypeError:
        # Non-text input: keep the historical "None means failure" contract.
        return None
# Serialises "pick the next file name + create the file" across the worker
# threads; the name is derived from the directory's current entry count, so
# two unsynchronised workers could otherwise choose the same name.
_SAVE_LOCK = threading.Lock()


def Loadown(req):
    """Download every URL in *req* into the directory named by global ``name``.

    Each image is requested once; responses whose status is not 200 are
    skipped. Files are named with the number of entries the directory held
    when the file was created. A failure on one URL is reported on stderr
    and does not stop the remaining downloads.

    NOTE: requests use urllib2's default headers. The original author had
    experimented with sending a Firefox User-Agent header but left it
    disabled.

    Args:
        req: Iterable of image URLs.
    """
    for url in req:
        try:
            response = urllib2.urlopen(url)
            payload = None
            try:
                if response.getcode() == 200:
                    payload = response.read()
            finally:
                # Release the connection whether or not the read succeeded.
                response.close()
            if payload is None:
                continue

            # Single-argument print(...) behaves like the print statement.
            print(url)
            with _SAVE_LOCK:
                target = name + '/%s' % len(os.listdir(name))
                with open(target, 'wb') as out:
                    out.write(payload)
        except Exception as exc:
            # Best effort: one bad URL must not abort the whole batch.
            # %r keeps the message ASCII-safe for any URL or error text.
            sys.stderr.write('skipped %r: %r\n' % (url, exc))
def three(req, max_threads=20):
    """Start a download worker for *req*, then throttle the caller.

    A new thread running ``Loadown(req)`` is started immediately. The call
    then blocks while more than *max_threads* threads are alive (the count
    comes from ``threading.active_count()``, which includes the main
    thread).

    Args:
        req: List of image URLs handed to ``Loadown``.
        max_threads: Maximum number of live threads tolerated before the
            caller is allowed to continue. Defaults to 20.
    """
    threading.Thread(target=Loadown, args=(req,)).start()
    while threading.active_count() > max_threads:
        # Sleep instead of spinning so waiting does not burn a CPU core.
        time.sleep(0.1)

def ForImg(papg):
    """Fetch one result page for the global keyword and queue its downloads.

    Reads the module-level ``name`` (the search keyword set by the script
    entry point), fetches the page at offset *papg*, extracts the image URLs
    and hands them to ``three``. Nothing is queued when the page could not
    be fetched or contains no image URLs.

    Args:
        papg: Result offset passed through to ``get_html``.
    """
    html = get_html(name, papg)
    if html is None:
        return

    res = req(html)
    if not res:
        # None (extraction failed) or an empty list: no worker is needed.
        return

    three(res)

#data-thumburl="https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2766886107,1571085905&fm=23&gp=0.jpg"
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page count, create the
    # output directory named after the keyword, then process one result
    # page per iteration. `name` is deliberately module-level because
    # ForImg and Loadown read it as a global.
    print('----------------------------------------------------------------------------')

    # stdin.encoding is None when input is piped, so fall back to UTF-8.
    name = raw_input('请输入搜索的目标:').decode(sys.stdin.encoding or 'utf-8')
    name = name.encode('utf-8')
    mkdir(name)

    # Keep asking until the answer is a non-negative integer; the original
    # printed the hint and then crashed in range() with a string argument.
    s = raw_input('请输入需要几页数据')
    while not s.isdigit():
        print('请输入数字')
        s = raw_input('请输入需要几页数据')
    s = int(s)

    # The offset passed to ForImg advances by 20 for each requested page.
    for papg in range(0, s * 20, 20):
        ForImg(papg)

一个爬取百度图库程序