首页 > 代码库 > 爬虫程序
爬虫程序
下面是一个简单的爬虫程序。
?
#!/usr/bin/env python3
"""A simple single-domain web crawler.

Reconstructed from a web-scraped Python 2 listing (typographic quotes,
spaces inside operators, and a literal ``<br>`` were baked in by the HTML
extraction).  Ported to Python 3: ``html.parser``/``urllib`` replace the
removed ``HTMLParser``/``urllib``/``urlparse``/``string``-function modules,
``input`` replaces ``raw_input``, and the ``reload(sys)`` /
``sys.setdefaultencoding('utf8')`` hack is dropped — Python 3 strings are
Unicode already.

Behaviour is unchanged: starting from one URL, download each page to a
local path mirroring the URL, harvest its ``<a href>`` links, and keep
crawling links that stay inside the starting domain.
"""

from os import makedirs, sep, unlink
from os.path import dirname, exists, isdir, splitext
from sys import argv

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve


class RetrieveURL(HTMLParser):
    """HTML parser that records every ``href`` found in ``<a>`` tags."""

    def __init__(self):
        super().__init__()
        # Collected hyperlink targets, in document order.
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # html.parser lower-cases tag and attribute names, so checking the
        # lowercase forms covers the original's 'a'/'A' and 'href'/'HREF'
        # cases.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.anchorlist.append(value)


class Retriever(object):
    """Download one web page to a local file mirroring the URL's path."""

    def __init__(self, url):
        self.url = url
        # NOTE: filename() creates the target directory as a side effect.
        self.file = self.filename(url)

    @staticmethod
    def _url_to_path(url, deffile='index.htm'):
        """Pure mapping from *url* to a local relative path (no filesystem
        side effects, so it is independently testable)."""
        parsedurl = urlparse(url, 'http:', 0)
        # netloc + path, e.g. 'www.example.com/a/b.html'
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            # No file extension: the URL names a directory, so append the
            # default page name.
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        return path

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path, creating its directory if needed.

        Returns the relative path the page will be saved under.
        """
        path = self._url_to_path(url, deffile)
        ldir = dirname(path)            # local directory for the file
        if sep != '/':                  # OS-independent path separator
            ldir = ldir.replace('/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir):
                # A plain file is squatting on the directory name.
                unlink(ldir)
            print('ldir is ', ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns ``urlretrieve``'s result tuple on success, or a 1-tuple
        whose string starts with '*' on failure (checked by the caller).
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except OSError:  # IOError is an alias of OSError in Python 3
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded file and return its list of anchor hrefs."""
        self.parser = RetrieveURL()
        # Context manager fixes the original's leaked file handle.
        with open(self.file) as fobj:
            self.parser.feed(fobj.read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):
    """Manage the entire crawling process: queue, seen-list, domain filter."""

    count = 0  # static downloaded-page counter (class-wide)

    def __init__(self, url):
        self.q = [url]                 # URLs waiting to be fetched
        self.seen = []                 # URLs already processed
        self.dom = urlparse(url)[1]    # netloc: restrict crawl to this domain

    def getPage(self, url):
        """Download *url*, then queue its new, in-domain links."""
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':           # error marker from download(): skip
            print(retval, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', retval[0])
        self.seen.append(url)

        links = r.parseAndGetLinks()   # get and process links
        for eachLink in links:
            # Resolve relative links against the current page.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)
            print('* ', eachLink, end='')
            if 'mailto:' in eachLink.lower():
                print('... discarded, mailto link')
                continue
            if eachLink not in self.seen:
                if self.dom not in eachLink:
                    print('... discarded, not in domain')
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print('... new, added to Q')
                    else:
                        print('... discarded, already in Q')
            else:
                print('... discarded, already processed')

    def go(self):
        """Process links in the queue until it is empty (depth-first)."""
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    """Entry point: crawl starting from ``argv[1]`` or a prompted URL."""
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。