首页 > 代码库 > scrapy

scrapy

scrapy 中 Selector 的 extract() 返回的是 list;xpath()/css() 返回的也是由 Selector 组成的 list(SelectorList)。

 

创建项目:scrapy startproject myproject

下载内容并送到标准输出:

scrapy fetch --nolog http://www.example.com/some/page.html

scrapy fetch --nolog --headers http://www.example.com/

用浏览器打开指定的URL

scrapy view <url>

启动shell

scrapy shell [url]

使用spider分析给定的url

scrapy parse http://www.example.com/ -c parse_item

在运行crawl时添加-a可以传递spider参数

scrapy crawl myspider -a category=electronics

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, category=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://www.example.com/categories/%s' % category]

  

rules = (
    # 提取匹配 'category.php' (但不匹配 'subsection.php') 的链接并跟进链接(没有callback意味着follow默认为True)
    Rule(LinkExtractor(allow=(r'category\.php', ), deny=(r'subsection\.php', ))),

    # 提取匹配 'item.php' 的链接并使用spider的parse_item方法进行分析
    Rule(LinkExtractor(allow=(r'item\.php', )), callback='parse_item'),
)

  

>>> response.xpath('//title/text()')
[<Selector (text) xpath=//title/text()>]
>>> response.css('title::text')
[<Selector (text) xpath=//title/text()>]

  

>>> links = response.xpath('//a[contains(@href, "image")]')
>>> links.extract()
[u'<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
 u'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
 u'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
 u'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
 u'<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']
>>> for index, link in enumerate(links):
...     args = (index, link.xpath('@href').extract(), link.xpath('img/@src').extract())
...     print 'Link number %d points to url %s and image %s' % args
...
Link number 0 points to url [u'image1.html'] and image [u'image1_thumb.jpg']
Link number 1 points to url [u'image2.html'] and image [u'image2_thumb.jpg']
Link number 2 points to url [u'image3.html'] and image [u'image3_thumb.jpg']
Link number 3 points to url [u'image4.html'] and image [u'image4_thumb.jpg']
Link number 4 points to url [u'image5.html'] and image [u'image5_thumb.jpg']

  

>>> response.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
[u'My image 1', u'My image 2', u'My image 3', u'My image 4', u'My image 5']

divs = response.xpath(‘//div‘)

提取出divs后应该用 for p in divs.xpath('.//p') 而不是 for p in divs.xpath('//p')。如果XPath以 / 开头,那么该XPath将对整个文档使用绝对路径,而不是相对于当前的Selector。

如果只需要divs的直接子元素p,则用 divs.xpath('p')。

>>> from scrapy import Selector
>>> doc = """
... <div>
...     <ul>
...         <li class="item-0"><a href="link1.html">first item</a></li>
...         <li class="item-1"><a href="link2.html">second item</a></li>
...         <li class="item-inactive"><a href="link3.html">third item</a></li>
...         <li class="item-1"><a href="link4.html">fourth item</a></li>
...         <li class="item-0"><a href="link5.html">fifth item</a></li>
...     </ul>
... </div>
... """
>>> sel = Selector(text=doc, type="html")
>>> sel.xpath('//li//@href').extract()
[u'link1.html', u'link2.html', u'link3.html', u'link4.html', u'link5.html']
>>> sel.xpath('//li[re:test(@class, "item-\d$")]//@href').extract()
[u'link1.html', u'link2.html', u'link4.html', u'link5.html']

>>> for scope in sel.xpath('//div[@itemscope]'):
...     print "current scope:", scope.xpath('@itemtype').extract()
...     props = scope.xpath('''
...                 set:difference(./descendant::*/@itemprop,
...                                .//*[@itemscope]/*/@itemprop)''')
...     print "    properties:", props.extract()
...     print

  

scrapy