首页 > 代码库 > 爬虫实例(二):多线程,多进程对网页的爬取

爬虫实例(二):多线程,多进程对网页的爬取

采用多线程、多进程对韩寒的新浪博客文章进行爬取,这个示例不需要模拟登录:

 1 #--coding:utf-8--- 2 #!/usr/bin/env python 3 import urllib 4 import os 5 import re 6 import time 7 from threading import Thread 8 from multiprocessing import Process 9 10 def downloadURL(urls,dirpath):11     ##在之前中分装一个list12     for url in urls:13         if(len(url)>0):14             content=urllib.urlopen(url).read()15             ##采用os模块中IO接口写写html文档16             if not os.path.exists(dirpath):17                 os.makedirs(dirpath)18             open(dirpath+r/+url[-26:],w).write(content)19             20 21 def parseTarget(url):22     root_url=url23     urls=[]24     ##这里得到的东西是每一篇文章的链接25     content=urllib.urlopen(root_url).read()26     27     28     pattern=r<a title="(.*?)" href="http://www.mamicode.com/(.*?)">29     30     hrefs=re.findall(pattern,content)31     32     for href in hrefs:33         #print href34         urls.append(href[1])35 36     return urls37 38 def thread_or_process_job(n,thread_or_process,url_lists,job):39     local_time=time.time()40     ##args为前面函数的参数41     Thread_or_Process=[thread_or_process(target=job,args=(url_lists[i],str(n)+thread_or_process.__name__)) for i in xrange(n)]42         43 44     for t in Thread_or_Process:45         t.start()46         47     for t in Thread_or_Process:48         t.join()49         50     print n,thread_or_process.__name__," run job need ",time.time()-local_time51 52 if __name__==__main__:53     t=time.time()54     urls=[]55     for i in xrange(5):56         urls.extend(parseTarget(http://blog.sina.com.cn/s/articlelist_1191258123_0_+str(i+1)+.html))57     url_len=len(urls)58     print urls_len:,len(urls)59     60     for n in[2,4,6,8]:61         url_list=[]62         url_split_len=url_len//n63         ##将总的url进行分为多少段同时进行处理64         for i in xrange(n):65             if i==n-1:66                 url_list.append(urls[i*url_split_len:url_len])67             else:68                 url_list.append(urls[i*url_split_len:(i+1)*url_split_len])69             70             
thread_or_process_job(n,Thread,url_list,downloadURL)71             thread_or_process_job(n,Process,url_list,downloadURL)72     73     print "All done in ",time.time()-t74     

 

爬虫实例(二):多线程,多进程对网页的爬取