一个爬取百度图库程序

首页 > 代码库 > 一个爬取百度图库程序

2024-09-16 23:59:59 208人阅读

学习 Python 有一段时间了，这几天想写一个爬取百度图片的小爬虫。
代码
from selenium import webdriver
import urllib,re
import time
import urllib2
import sys
import os
import socket
import threading
socket.setdefaulttimeout(15.0)
def mkdir(name):
    """Make sure the directory that will hold the downloaded files exists.

    Missing parent directories are created as well.  A path that already
    exists is left untouched, including the case where another thread or
    process creates it between our attempt and the check.

    Args:
        name: Path of the directory to create.

    Raises:
        OSError: If the directory could not be created and does not exist.
    """
    try:
        os.makedirs(name)
    except OSError:
        # "Already exists" is the expected failure; anything else (permission
        # denied, empty path, ...) leaves the path missing and is re-raised.
        if not os.path.exists(name):
            raise
def get_html(name, papg):
    """Load a Baidu image-search result page through Selenium + PhantomJS.

    Args:
        name: Search keyword; it is URL-quoted before being placed in the
            ``word`` query parameter.  The script entry point in this file
            passes a UTF-8 encoded byte string.
        papg: Value for the ``pn`` query parameter (result offset).

    Returns:
        The page source reported by the driver, or None if any step failed.
    """
    try:
        url = 'https://image.baidu.com/search/index?tn=baiduimage&word={}&pn={}'.format(
            urllib.quote(name), papg)
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
            return driver.page_source
        finally:
            # Shut the browser down even when loading fails; otherwise every
            # failed page leaves a driver instance behind.
            driver.quit()
    except Exception:
        # Best effort: callers treat None as "no page available".
        return None
def req(html):
    """Extract image URLs from the ``data-objurl`` attributes of a page.

    Both ``http://`` and ``https://`` URLs are accepted.

    Args:
        html: Page source to scan.

    Returns:
        A list with the URL found in each ``data-objurl="..."`` attribute, in
        document order (empty when there are none), or None when ``html`` is
        not something a regular expression can scan (for example None).
    """
    pattern = r'data-objurl="(https?://.*?)"'
    try:
        return re.findall(pattern, html)
    except TypeError:
        # html is None or another non-string value: report "no result".
        return None
# Serialises "choose the next file name and create the file" across the
# download threads, so two threads cannot pick the same index and overwrite
# each other's image.
_SAVE_LOCK = threading.Lock()


def Loadown(req):
    """Download every URL in ``req`` into the directory named by global ``name``.

    Each URL is fetched once; a response whose status code is 200 is printed
    and saved under the next free numeric file name (the current number of
    entries in the directory).  A URL that fails for any reason is skipped so
    that one bad link does not abort the rest of the batch.

    Args:
        req: Iterable of image URLs.
    """
    for url in req:
        try:
            response = urllib2.urlopen(url)
            try:
                if response.getcode() == 200:
                    data = response.read()
                else:
                    data = None
            finally:
                response.close()
            if data is not None:
                print(url)
                with _SAVE_LOCK:
                    target = name + '/%s' % len(os.listdir(name))
                    with open(target, 'wb') as out:
                        out.write(data)
        except Exception:
            # Deliberate best effort: skip this URL and carry on.
            continue
def three(req, max_threads=20, poll_interval=0.1):
    """Start a download thread for ``req`` and throttle the caller.

    A new thread running ``Loadown(req)`` is started, then the call blocks
    until the number of live threads is no greater than ``max_threads``.

    Args:
        req: List of image URLs handed to ``Loadown``.
        max_threads: Highest live-thread count at which the call returns.
        poll_interval: Seconds to sleep between checks of the thread count.
    """
    threading.Thread(target=Loadown, args=(req,)).start()
    while threading.active_count() > max_threads:
        # Sleep instead of spinning so the wait does not burn a CPU core.
        time.sleep(poll_interval)

def ForImg(papg):
    """Fetch one result page for global ``name`` and queue its images.

    Args:
        papg: Result offset of the page to fetch.
    """
    html = get_html(name, papg)
    if html is None:
        # The page could not be loaded; nothing to extract.
        return
    res = req(html)
    if res:
        # Only start a download thread when there is at least one URL.
        three(res)
#data-thumburl="https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2766886107,1571085905&fm=23&gp=0.jpg"
if __name__ == '__main__':
    print('----------------------------------------------------------------------------')
    # Decode the keyword from the console encoding and re-encode it as UTF-8.
    # sys.stdin.encoding is None when input is piped, so fall back to UTF-8.
    name = raw_input('请输入搜索的目标:').decode(sys.stdin.encoding or 'utf-8')
    name = name.encode('utf-8')
    mkdir(name)
    s = raw_input('请输入需要几页数据')
    if not s.isdigit():
        # Stop here: carrying on would pass a string to range() and crash.
        print('请输入数字')
        sys.exit(1)
    # Each page advances the result offset by 20.
    for page in range(int(s)):
        ForImg(page * 20)

一个爬取百度图库程序

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系我们

首页 > 代码库 > 一个爬取百度图库程序

一个爬取百度图库程序

看完仍有疑问？有类似问题直接问程序猿