首页 > 代码库 > 爬虫程序
爬虫程序
下面是一个简单的爬虫程序。
?
#!/usr/bin/env python3
"""A simple single-domain web crawler.

Reconstructed from a web-scraped Python 2 listing (typographic quotes,
spaces inside operators, and a literal ``<br>`` were baked in by the HTML
extraction).  Ported to Python 3: ``html.parser``/``urllib`` replace the
removed ``HTMLParser``/``urllib``/``urlparse``/``string``-function modules,
``input`` replaces ``raw_input``, and the ``reload(sys)`` /
``sys.setdefaultencoding('utf8')`` hack is dropped — Python 3 strings are
Unicode already.

Behaviour is unchanged: starting from one URL, download each page to a
local path mirroring the URL, harvest its ``<a href>`` links, and keep
crawling links that stay inside the starting domain.
"""

from os import makedirs, sep, unlink
from os.path import dirname, exists, isdir, splitext
from sys import argv

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve


class RetrieveURL(HTMLParser):
    """HTML parser that records every ``href`` found in ``<a>`` tags."""

    def __init__(self):
        super().__init__()
        # Collected hyperlink targets, in document order.
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # html.parser lower-cases tag and attribute names, so checking the
        # lowercase forms covers the original's 'a'/'A' and 'href'/'HREF'
        # cases.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.anchorlist.append(value)


class Retriever(object):
    """Download one web page to a local file mirroring the URL's path."""

    def __init__(self, url):
        self.url = url
        # NOTE: filename() creates the target directory as a side effect.
        self.file = self.filename(url)

    @staticmethod
    def _url_to_path(url, deffile='index.htm'):
        """Pure mapping from *url* to a local relative path (no filesystem
        side effects, so it is independently testable)."""
        parsedurl = urlparse(url, 'http:', 0)
        # netloc + path, e.g. 'www.example.com/a/b.html'
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            # No file extension: the URL names a directory, so append the
            # default page name.
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        return path

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path, creating its directory if needed.

        Returns the relative path the page will be saved under.
        """
        path = self._url_to_path(url, deffile)
        ldir = dirname(path)            # local directory for the file
        if sep != '/':                  # OS-independent path separator
            ldir = ldir.replace('/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir):
                # A plain file is squatting on the directory name.
                unlink(ldir)
            print('ldir is ', ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns ``urlretrieve``'s result tuple on success, or a 1-tuple
        whose string starts with '*' on failure (checked by the caller).
        """
        try:
            retval = urlretrieve(self.url, self.file)
        except OSError:  # IOError is an alias of OSError in Python 3
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded file and return its list of anchor hrefs."""
        self.parser = RetrieveURL()
        # Context manager fixes the original's leaked file handle.
        with open(self.file) as fobj:
            self.parser.feed(fobj.read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):
    """Manage the entire crawling process: queue, seen-list, domain filter."""

    count = 0  # static downloaded-page counter (class-wide)

    def __init__(self, url):
        self.q = [url]                 # URLs waiting to be fetched
        self.seen = []                 # URLs already processed
        self.dom = urlparse(url)[1]    # netloc: restrict crawl to this domain

    def getPage(self, url):
        """Download *url*, then queue its new, in-domain links."""
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':           # error marker from download(): skip
            print(retval, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', retval[0])
        self.seen.append(url)

        links = r.parseAndGetLinks()   # get and process links
        for eachLink in links:
            # Resolve relative links against the current page.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)
            print('* ', eachLink, end='')
            if 'mailto:' in eachLink.lower():
                print('... discarded, mailto link')
                continue
            if eachLink not in self.seen:
                if self.dom not in eachLink:
                    print('... discarded, not in domain')
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print('... new, added to Q')
                    else:
                        print('... discarded, already in Q')
            else:
                print('... discarded, already processed')

    def go(self):
        """Process links in the queue until it is empty (depth-first)."""
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    """Entry point: crawl starting from ``argv[1]`` or a prompted URL."""
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。