具体代码如下:

import os

import sys
import time
import urllib.request
from threading import Thread
from urllib.parse import urldefrag, urljoin, urlparse

from bs4 import BeautifulSoup

# Where downloaded images and the list of failed image URLs are stored.
IMAGE_DIR = '/home/maple/Desktop/images/'
BAD_LIST_FILE = '/home/maple/Desktop/bad'

# Crawl state written on Ctrl-C and read back on the next start.
PENDING_FILE = 'tmpUrls'    # pages still waiting to be analysed
VISITED_FILE = 'tmpUrld'    # pages fully analysed

PAGE_RETRIES = 3            # extra attempts after the first failed page fetch
IMAGE_RETRIES = 3           # total attempts per image (blocking mode)
MIN_WIDTH = 250             # declared <img width> below this is "small"
MIN_HEIGHT = 350            # declared <img height> below this is "small"
MIN_IMAGE_BYTES = 20 * 1000  # skip icons / link buttons
MAX_THREADS = 10            # threaded mode: wait once this many are running
MAX_NAME_LENGTH = 100       # keep alt-derived file names within OS limits


def _record_failure(url, filepath, bad_list=BAD_LIST_FILE):
    """Delete a partial download and append *url* to the bad-link list."""
    try:
        os.remove(filepath)
    except OSError:
        # Nothing had been written yet; the URL must still be recorded.
        pass
    try:
        with open(bad_list, 'a') as bad_file:
            bad_file.write(url + '\n')
    except OSError as ex:
        print('× ' + str(ex))


class Download(Thread):
    """Download method 2: one worker thread per image.

    The file is streamed in 10 KB chunks. On any failure (including a
    timeout) the partial file is deleted and the URL is appended to the
    bad-link list.
    """

    def __init__(self, url, filepath, bad_list=BAD_LIST_FILE):
        Thread.__init__(self)
        self.url = url
        self.filepath = filepath
        self.bad_list = bad_list

    def run(self):
        try:
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            with opener.open(self.url, timeout=30) as handle, \
                    open(self.filepath, 'wb') as out:
                data = handle.read(10 * 1024)
                while data:
                    out.write(data)
                    data = handle.read(10 * 1024)
        except Exception as ex:
            # Best effort: a worker must never take the crawler down.
            print(self.url + '\n' + '× ' + str(ex))
            _record_failure(self.url, self.filepath, self.bad_list)


def _load_url_list(path, default):
    """Return the non-blank lines of *path*, or a copy of *default* if absent."""
    if not os.path.exists(path):
        return list(default)
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def _save_url_list(path, urls):
    """Write *urls* to *path*, one per line."""
    with open(path, 'w') as handle:
        handle.writelines(url + '\n' for url in urls)


def _int_attr(attrs, name):
    """Return attrs[name] as an int; 0 when missing or not a plain number."""
    try:
        return int(attrs.get(name, 0))
    except (TypeError, ValueError):
        # e.g. width="100px" or width="50%": treat as "size unknown".
        return 0


def _big_enough(width, height):
    """Apply the declared-size filter (0 means the attribute was not given)."""
    return (width == 0 or width >= MIN_WIDTH) or \
        (height == 0 or height >= MIN_HEIGHT)


def _safe_name(text):
    """Turn an alt text into something usable as a file name (may be '')."""
    cleaned = text.strip()
    for separator in ('/', '\\'):
        cleaned = cleaned.replace(separator, '_')
    return cleaned[:MAX_NAME_LENGTH]


def _unique_path(filepath, taken=()):
    """Return *filepath*, or 'name(1).ext', 'name(2).ext', ... if it is in use.

    *taken* holds paths already handed out but possibly not yet on disk.
    """
    def in_use(candidate):
        return candidate in taken or os.path.exists(candidate)

    if not in_use(filepath):
        return filepath
    stem, ext = os.path.splitext(filepath)
    num = 1
    candidate = '{}({}){}'.format(stem, num, ext)
    while in_use(candidate):
        num += 1
        candidate = '{}({}){}'.format(stem, num, ext)
    return candidate


def _fetch_soup(url):
    """Fetch and parse *url*; return None once every attempt has failed."""
    for _ in range(PAGE_RETRIES + 1):
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                html = response.read()
            # Hand BeautifulSoup the raw bytes so it can detect the page
            # encoding itself instead of assuming UTF-8.
            return BeautifulSoup(html, 'html.parser')
        except Exception as ex:
            print(ex)
    return None


def _download_image(src, filepath, width, height, bad_list=BAD_LIST_FILE):
    """Download method 1: blocking, whole-file download with retries.

    The timeout grows with every attempt. Only images whose Content-Length
    exceeds MIN_IMAGE_BYTES are saved. Returns True when a file was written.
    """
    for attempt in range(IMAGE_RETRIES):
        try:
            timeout = 5 + attempt * 10
            with urllib.request.urlopen(src, timeout=timeout) as handle:
                head = handle.info()
                length = 0
                if 'Content-Length' in head:
                    length = int(head['Content-Length'])
                print(src + ' ==== SIZE:{}*{} -- {}KB'.format(
                    width, height, length / 1000))
                if length <= MIN_IMAGE_BYTES:
                    # Too small: a definitive answer, so do not retry.
                    return False
                with open(filepath, 'wb') as out:
                    out.write(handle.read())
                print()
                return True
        except Exception as ex:
            # Best effort: one bad image must not stop the crawl.
            if attempt == IMAGE_RETRIES - 1:
                print('× ' + str(ex))
                _record_failure(src, filepath, bad_list)
    return False


def _same_site_links(soup, page_url):
    """Yield absolute, fragment-free links on the page that stay on its host."""
    host = urlparse(page_url)[1]
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not href:
            continue
        url = urldefrag(urljoin(page_url, href))[0]
        # mailto:/javascript: links have an empty host and are dropped here.
        if urlparse(url)[1] == host:
            yield url


def _wait_for(tasks):
    """Block until every thread in *tasks* has finished."""
    while any(task.is_alive() for task in tasks):
        time.sleep(0.5)


def maple(root, threaded=False, image_dir=IMAGE_DIR, bad_list=BAD_LIST_FILE):
    """Crawl the site starting at *root* and download its larger images.

    Pages are taken from a stack. Every <img> on a page is downloaded if it
    passes the size filters, and every unseen link on the same host is pushed
    for later analysis. Pressing Ctrl-C saves the pending and visited page
    lists to PENDING_FILE / VISITED_FILE; if those files exist at start-up,
    the crawl resumes from them and *root* is ignored.

    Args:
        root: URL of the first page to analyse.
        threaded: use one ``Download`` thread per image instead of the
            blocking downloader. Threaded downloads skip the
            Content-Length filter, as ``Download`` saves whatever it gets.
        image_dir: directory that receives the images (created if missing).
        bad_list: file that collects URLs of images that failed to download.
    """
    urls = _load_url_list(PENDING_FILE, [root])    # pages still to analyse
    urld = _load_url_list(VISITED_FILE, [])        # pages fully analysed
    known = set(urls) | set(urld)                  # O(1) "seen before?" test
    tasks = []                                     # running download threads
    reserved = set()                               # paths given to threads
    current = None                                 # page being processed
    os.makedirs(image_dir, exist_ok=True)

    try:
        while urls:
            current = urls.pop()
            print('=================== Current Page: ' + current +
                  ' =======================')
            soup = _fetch_soup(current)
            if soup is None:
                # Unreachable page: not marked as visited, but it stays in
                # `known` so it is not queued again during this run.
                current = None
                continue

            count = 1   # fallback file name for images without alt text
            for tag in soup.find_all('img'):
                attrs = tag.attrs
                if 'src' not in attrs:
                    continue
                src = urljoin(current, attrs['src'])
                width = _int_attr(attrs, 'width')
                height = _int_attr(attrs, 'height')
                if not _big_enough(width, height):
                    continue

                # This site puts the picture title in `alt`; other sites may
                # not, so fall back to a running number.
                name = _safe_name(attrs.get('alt', ''))
                if not name:
                    name = str(count)
                    count += 1
                ext = os.path.splitext(urlparse(src).path)[1]
                filepath = _unique_path(
                    os.path.join(image_dir, name + ext), reserved)

                if threaded:
                    # The file does not exist yet, so remember the path to
                    # keep a later image from being given the same one.
                    reserved.add(filepath)
                    task = Download(src, filepath, bad_list)
                    task.daemon = True
                    task.start()
                    tasks.append(task)
                else:
                    _download_image(src, filepath, width, height, bad_list)

            if len(tasks) >= MAX_THREADS:
                _wait_for(tasks)
                tasks = []
                reserved.clear()

            for url in _same_site_links(soup, current):
                if url not in known:
                    known.add(url)
                    urls.append(url)

            urld.append(current)
            current = None

        _wait_for(tasks)
    except KeyboardInterrupt:
        # Ctrl-C: persist progress so the next run can resume. The page
        # that was being processed goes back onto the pending stack.
        if current is not None:
            urls.append(current)
        _save_url_list(PENDING_FILE, urls)
        _save_url_list(VISITED_FILE, urld)


if __name__ == '__main__':
    print("""+++++++++++++++++++++++ version: python3.4+++++++++++++++++=++++ """)
    # Sample site; it has a great many sub-pages and images, so a full run
    # takes a long time.
    maple('http://www.msnzx.com/')


这段代码某些细节部分是专门针对 http://www.msnzx.com/ 这个站点的,下载其他站点数据仅需要微调一下就行了。其中分析网页直接使用了强大的第三方模块BeautifulSoup4,方便快捷。下载图片部分的实现方式实在太多,上述代码中包含了2种下载方式:

1、直接使用urllib.request读写流一次性下载,下载任意文件时程序都是阻塞的。这种方式适合下载size较小的图片。图片要么完全下载,要么完全不下载(得到的本地文件size = 0),网络条件不佳的时候可以捕获超时异常记录未成功下载的图片url。


