Crawler Download Scripts
The download script

The Downloader class checks a disk cache before touching the network, throttles consecutive requests to the same domain, optionally routes each request through a randomly chosen proxy, and retries 5XX server errors up to num_retries times (Python 2 / urllib2).
#!/usr/bin/python
# _*_ coding: utf-8 _*_
import urlparse
import urllib2
import random
import time
from datetime import datetime
import socket
import disk_cache

DEFAULT_AGENT = 'WSWP'    # default user-agent string
DEFAULT_DELAY = 5         # download delay (seconds) to limit the crawl rate
DEFAULT_RETRIES = 1       # number of retries when a server error occurs
DEFAULT_TIMEOUT = 60      # socket timeout (seconds)
CACHE = disk_cache.DiskCache()


class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                 proxies=None, num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT, opener=None, cache=CACHE):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # URL is not available in the cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error, so ignore the cached result and re-download
                    result = None
        if result is None:
            # result was not loaded from the cache, so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save the result to the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy,
                                         num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}


class Throttle:
    """Delay consecutive downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # timestamp of the last access per domain

    def wait(self, url):
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


if __name__ == '__main__':
    p = Downloader()
    x = p('http://www.meituan.com')
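A minimal usage sketch of the caching behaviour, assuming the script above is saved as downloader.py next to disk_cache.py (both file names are assumptions, and the URL must be reachable): the first call goes over the network and stores the result on disk, so a second call for the same URL is answered from the cache without printing 'Downloading:'.

    # sketch.py -- hedged example, not part of the original script;
    # assumes downloader.py and disk_cache.py sit in the same directory
    from downloader import Downloader

    d = Downloader(delay=5, num_retries=1)   # uses the module-level DiskCache by default
    url = 'http://example.webscraping.com'   # any reachable URL works here

    html = d(url)   # cache miss: prints 'Downloading: ...' and writes to ./cache
    html = d(url)   # cache hit: read back from disk, no network request
    print len(html)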
The disk_cache script

DiskCache exposes a dictionary interface: each result is pickled together with a UTC timestamp, optionally zlib-compressed, and written to a file path derived from the URL. Entries expire after 30 days by default.
import os
import re
import urlparse
import shutil
import zlib
from datetime import datetime, timedelta
try:
    import cPickle as pickle
except ImportError:
    import pickle


class DiskCache:
    """
    Dictionary interface that stores cached
    values in the file system rather than in memory.
    The file path is formed from the components of the URL.

    >>> cache = DiskCache()
    >>> url = 'http://example.webscraping.com'
    >>> result = {'html': '...'}
    >>> cache[url] = result
    >>> cache[url]['html'] == result['html']
    True
    >>> cache = DiskCache(expires=timedelta())
    >>> cache[url] = result
    >>> cache[url]
    Traceback (most recent call last):
     ...
    KeyError: 'http://example.webscraping.com has expired'
    >>> cache.clear()
    """

    def __init__(self, cache_dir='cache', expires=timedelta(days=30), compress=True):
        """
        cache_dir: the root level folder for the cache
        expires: timedelta of amount of time before a cache entry is considered expired
        compress: whether to compress data in the cache
        """
        self.cache_dir = cache_dir
        self.expires = expires
        self.compress = compress

    def __getitem__(self, url):
        """Load data from disk for this URL
        """
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                data = fp.read()
            if self.compress:
                data = zlib.decompress(data)
            result, timestamp = pickle.loads(data)
            if self.has_expired(timestamp):
                raise KeyError(url + ' has expired')
            return result
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for this URL
        """
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        data = pickle.dumps((result, datetime.utcnow()))
        if self.compress:
            data = zlib.compress(data)
        with open(path, 'wb') as fp:
            fp.write(data)

    def __delitem__(self, url):
        """Remove the value at this key and any empty parent sub-directories
        """
        path = self.url_to_path(url)
        try:
            os.remove(path)
            os.removedirs(os.path.dirname(path))
        except OSError:
            pass

    def url_to_path(self, url):
        """Create file system path for this URL
        """
        components = urlparse.urlsplit(url)
        # when the path is empty, set it to /index.html
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict the maximum number of characters per path segment
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def has_expired(self, timestamp):
        """Return whether this timestamp has expired
        """
        return datetime.utcnow() > timestamp + self.expires

    def clear(self):
        """Remove all the cached values
        """
        if os.path.exists(self.cache_dir):
            shutil.rmtree(self.cache_dir)


if __name__ == '__main__':
    cache = DiskCache()
    print cache
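For illustration, a short sketch (file name assumed as above) of how url_to_path maps URLs onto cache files; the expected outputs are shown for the default cache_dir of 'cache' on a POSIX system.

    from disk_cache import DiskCache

    cache = DiskCache()
    # an empty or trailing-slash path is rewritten to index.html
    print cache.url_to_path('http://example.webscraping.com/')
    # cache/example.webscraping.com/index.html

    # the query string is appended directly and characters outside
    # [/0-9a-zA-Z-.,;_ ] are replaced with underscores
    print cache.url_to_path('http://example.webscraping.com/view?id=1')
    # cache/example.webscraping.com/viewid_1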