首页 > 代码库 > python多线程下载文件

python多线程下载文件

从文件中读取图片的url和文件名,并把每个url指向的文件下载到本地。文件中每一行包含一个url和一个文件名,二者用制表符(\t)隔开。

1、使用requests请求url并下载文件

def download(img_url, img_name):
    """Stream ``img_url`` to the file ``img_name`` inside ``out_dir``.

    The response body is read in 1024-byte chunks so the whole file is
    never held in memory at once.

    Args:
        img_url: URL of the file to fetch.
        img_name: File name to create under ``out_dir``.
    """
    with closing(requests.get(img_url, stream=True)) as r:
        # The mode must be the string 'wb': the payload is binary.
        with open(os.path.join(out_dir, img_name), 'wb') as f:
            for data in r.iter_content(1024):
                f.write(data)

 

2、从文件中读取url,考虑文件较大,使用生成器的方式读取。

def get_imgurl_generate(path='./example.txt'):
    """Lazily yield ``[url, file_name]`` pairs read from ``path``.

    Each line of the file holds a URL and a file name separated by a tab.
    The file is read line by line, so it is never loaded fully into memory.

    Args:
        path: Location of the tab-separated list; defaults to the
            original hard-coded ``./example.txt``.

    Yields:
        The list produced by splitting the stripped line on the tab
        character.
    """
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            # Split here: the consumer unpacks each item into (url, name).
            yield line.split('\t')

 

3、使用多线程进行下载

# Guards the shared generator: generators must not be advanced by two
# threads at the same time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull (url, name) pairs from ``imgs`` and download them.

    Args:
        imgs: Iterator shared by all worker threads; each item unpacks
            into ``(img_url, img_name)``.

    The function returns once the iterator is exhausted.
    """
    while True:
        try:
            # Only the next() call is serialized; the download itself
            # runs outside the lock so threads work in parallel.
            with lock:
                img_url, img_name = next(imgs)
        except StopIteration:
            break
        download(img_url, img_name)

# One generator instance is shared by every worker thread.
img_gen = get_imgurl_generate()

# thread_num must be defined before this runs (the complete script
# further down sets it to 20).
for i in range(0, thread_num):
    t = threading.Thread(target=loop, args=(img_gen,))
    t.start()

 

完整代码,加入异常处理

技术分享
 1 # -*- coding: utf-8 -*-
 2 import os
 3 from contextlib import closing
 4 import threading
 5 import requests
 6 import time
 7 
 8 
 9 headers = {
10 User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36
11 }
12 
13 #输出文件夹
14 out_dir = ./output
15 #线程数
16 thread_num = 20
17 #http请求超时设置
18 timeout = 5
19 
20 if not os.path.exists(out_dir):
21     os.mkdir(out_dir)
22 
23 
24 
def download(img_url, img_name):
    """Download ``img_url`` into ``out_dir`` as ``img_name``.

    Nothing is written, and a tab-separated diagnostic is printed, when:
      * the HTTP status is outside 200-299 (``returnCode...``);
      * the ``content-length`` header is missing or zero (``size0``);
      * writing the body fails (``savefail``).
    A target file that already exists is skipped silently.

    Args:
        img_url: URL of the file to fetch.
        img_name: File name to create under ``out_dir``.
    """
    target = os.path.join(out_dir, img_name)
    # Already fetched by an earlier run: skip it.
    if os.path.isfile(target):
        return
    with closing(requests.get(img_url, stream=True, headers=headers,
                              timeout=timeout)) as r:
        rc = r.status_code
        if rc < 200 or rc > 299:
            print('returnCode%s\t%s' % (rc, img_url))
            return
        content_length = int(r.headers.get('content-length', 0))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        try:
            with open(target, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except (IOError, OSError, requests.RequestException):
            print('savefail\t%s' % img_url)
            # Remove the partial file; otherwise the isfile() check above
            # would treat it as a finished download on the next run.
            try:
                os.remove(target)
            except OSError:
                pass  # the file was never created, or cannot be removed
43 
def get_imgurl_generate(path='./final.scp'):
    """Lazily yield ``[url, file_name]`` pairs read from ``path``.

    Each line must hold exactly two tab-separated fields. Lines that are
    blank or malformed are reported on stdout and skipped. A progress
    line is printed every 500 input lines.

    Args:
        path: Location of the tab-separated list; defaults to the
            original hard-coded ``./final.scp``.

    Yields:
        A two-element list ``[url, file_name]`` for every valid line.
    """
    with open(path, 'r') as f:
        for index, line in enumerate(f, 1):
            if index % 500 == 0:
                print('execute %s line at %s' % (index, time.time()))
            # Strip first: a raw line still carries its newline, so an
            # emptiness check before strip() could never be true.
            line = line.strip()
            if not line:
                print('urline %s is empty "\t"' % index)
                continue
            imgs = line.split('\t')
            if len(imgs) != 2:
                print('urline %s splite error' % index)
                continue
            if not imgs[0] or not imgs[1]:
                print('urline %s img is empty' % index)
                continue
            # No try/except around the yield: a bare except here would
            # swallow GeneratorExit and make close() raise RuntimeError.
            yield imgs
67 
# Guards the shared generator: generators must not be advanced by two
# threads at the same time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull (url, name) pairs from ``imgs`` and download them.

    A failed download is reported as ``exceptfail`` and does not stop the
    worker. The function returns once the iterator is exhausted.

    Args:
        imgs: Iterator shared by all worker threads; each item unpacks
            into ``(img_url, img_name)``.
    """
    print('thread %s is running...' % threading.current_thread().name)

    while True:
        try:
            # Only the next() call is serialized; the download itself
            # runs outside the lock so threads work in parallel.
            with lock:
                img_url, img_name = next(imgs)
        except StopIteration:
            break
        try:
            download(img_url, img_name)
        except Exception:
            # Keep the worker alive, but let KeyboardInterrupt and
            # SystemExit propagate (a bare except would swallow them).
            print('exceptfail\t%s' % img_url)
    print('thread %s is end...' % threading.current_thread().name)
# One generator instance is shared by every worker thread.
img_gen = get_imgurl_generate()

threads = []
for i in range(0, thread_num):
    t = threading.Thread(target=loop, name='LoopThread %s' % i, args=(img_gen,))
    t.start()
    threads.append(t)

# Wait for every worker so the script ends only after all downloads finish.
for t in threads:
    t.join()
View Code

 

python多线程下载文件