
Web Crawler

Below is a simple web crawler program, written in Python 2.

#!/usr/bin/env python
 
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser
'''The next three lines set the default encoding to utf8. Without this, Python 2
falls back to ASCII when decoding and raises an error as soon as it hits
non-ASCII (Unicode) content. We import sys and then reload it because
sys.setdefaultencoding is normally deleted during interpreter startup, so a
reload is needed to get it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')
 
 
class RetrieveURL(HTMLParser):      # subclass HTMLParser to collect anchors
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchorlist = []        # __init__ is overridden only to give each instance an anchorlist
    def handle_starttag(self, tag, attrs):  # when an <A> tag is seen, record its href in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])
 
class Retriever(object):# download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
     
    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse the URL into its components
        path = parsedurl[1] + parsedurl[2]      # netloc + path
        ext = splitext(path)
        if ext[1] == '':        # no file extension (e.g. https://www.baidu.com/file1 or a trailing slash): use the default name
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)    # local directory
        if sep != '/':          # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):     # create archive dir if necessary
            if exists(ldir): unlink(ldir)
            print 'ldir is', ldir
            makedirs(ldir)
        return path
         
     
    def download(self):     # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval
 
    '''def parseAndGetLinks(self):  # original version: parse HTML with htmllib, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''
    def parseAndGetLinks(self):     # parse HTML with RetrieveURL and return the collected links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
 
 
class Crawler(object):# manage entire crawling process
    count = 0   # static downloaded page counter
    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]     # network location of the start URL, used for the same-domain check
 
    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()

        if retval[0][0] == '*':     # error situation, do not parse
            print retval, '... skipping parse'
            return

        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)   # resolve relative links against the current page
            print '* ', eachLink,
            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'
                         
    def go(self):# process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)
 
def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
     
    if not url: return
    robot = Crawler(url)
    robot.go()
 
if __name__ == '__main__':
    main()
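
A minimal usage sketch follows, assuming the listing above is saved as crawl.py (the module name is an assumption, not part of the original post) and run under Python 2. You can start the crawler from the command line or drive it programmatically:

# Hypothetical usage, assuming the code above is saved as crawl.py (Python 2).
# From the command line:
#     python crawl.py http://www.example.com/
# Or programmatically:
from crawl import Crawler                    # 'crawl' is an assumed module name

robot = Crawler('http://www.example.com/')   # example start URL
robot.go()  # downloads each same-domain page into a local directory tree mirroring the URL paths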