
A web crawler example from Core Python Programming

#!/usr/bin/env python

import cStringIO                     # used (via StringIO) as a dummy output target for the HTML formatter
import formatter                     # we use various classes in these modules for parsing HTML
from htmllib import HTMLParser
import httplib                       # we only need an exception from this module
import os                            # this provides various file system functions
import sys                           # we are just using argv for command-line arguments
import urllib                        # we only need the urlretrieve() function for downloading Web pages
import urlparse                      # we use the urlparse() and urljoin() functions for URL manipulation

class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)                  # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]    # 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)              # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:             # ''
            filepath = os.path.join(filepath, default)       # 'www.baidu.com\\index.html'
        linkdir = os.path.dirname(filepath)                  # 'www.baidu.com'
        if not os.path.isdir(linkdir):                       # False
            if os.path.exists(linkdir):                      # False
                os.unlink(linkdir)
            os.makedirs(linkdir)                             # make a directory named after the link directory on disk
        return url, filepath

    def download(self):
        'Download URL to the specifically named file'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in the downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist

class Crawler(object):
    count = 0                                            # the number of objects downloaded from the internet

    def __init__(self, url):
        self.q = [url]                                   # a queue of links to download
        self.seen = set()                                # a set of all the links we have seen (downloaded) already
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])        # 'baidu.com'

    def get_page(self, url, media=False):
        'Download page & parse links, adding them to the queue if necessary'
        r = Retriever(url)
        fname = r.download()[0]                          # 'www.baidu.com\\index.html'
        if fname[0] == '*':                              # 'w'; an error message from download() starts with '*'
            print fname, '... skipping parse'
            return
        Crawler.count += 1                               # 1
        print '\n(', Crawler.count, ')'                  # ( 1 )
        print 'URL:', url                                # URL: http://www.baidu.com
        print 'FILE:', fname                             # FILE: www.baidu.com\\index.html
        self.seen.add(url)                               # set(['http://www.baidu.com'])
        ftype = os.path.splitext(fname)[1]               # '.html'
        if ftype not in ('.htm', '.html'):               # False
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):               # False
                print '... discarded, mailto link'
                continue
            if not media:                                # False
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):           # False
                link = urlparse.urljoin(url, link)
            print '*', link,
            if link not in self.seen:                    # True
                if self.dom not in link:                 # False
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
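To try it, save the listing (for example as crawl.py) and pass a start URL on the command line, e.g. python crawl.py http://www.baidu.com, or enter one at the prompt; the crawler downloads each page under a directory named after the host and follows links within the same domain.

Note that the listing is Python 2 only: cStringIO, htmllib, httplib, urllib.urlretrieve() and urlparse were removed or reorganized in Python 3. If you want to reproduce just the link-extraction step of parse_links() on Python 3, below is a minimal sketch using html.parser; the names AnchorParser, parse_links and base_url are illustrative, not part of the original code.

from html.parser import HTMLParser
from urllib.parse import urljoin

class AnchorParser(HTMLParser):
    """Collect the href of every <a> tag, similar to htmllib's anchorlist."""
    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

def parse_links(filename, base_url):
    # Read the downloaded HTML file and return absolute link URLs.
    with open(filename, encoding='utf-8', errors='replace') as f:
        data = f.read()
    parser = AnchorParser()
    parser.feed(data)
    parser.close()
    return [urljoin(base_url, link) for link in parser.anchorlist]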