easy_install pip
pip install scrapy
2. Items, Spider, Item Pipeline 基本概念
scrapy startproject tutorial
vim items.py
# NOTE(review): Item and Field are not imported in this snippet; presumably
# items.py already has `from scrapy.item import Item, Field` at the top
# (confirm against the file generated by `scrapy startproject`).


class DmozItem(Item):
    """Container for one directory entry scraped by the dmoz spider."""

    # Define the fields for your item here, e.g.:
    # name = Field()
    title = Field()  # link text of the entry
    link = Field()   # href of the entry
    desc = Field()   # free text that accompanies the entry


class TorrentItem(Item):
    """Container for one scraped torrent record."""

    url = Field()
    name = Field()
    description = Field()
    size = Field()
vim spiders/dmoz_spider.py
# NOTE(review): BaseSpider and HtmlXPathSelector are the legacy Scrapy API
# names; later releases expose scrapy.Spider and response.xpath() instead.
# Confirm against the installed Scrapy version before changing them.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from tutorial.items import DmozItem


class DmozSpider(BaseSpider):
    """Spider that collects list entries from two dmoz.org Python pages."""

    # Identifier passed on the command line: `scrapy crawl dmoz`.
    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one DmozItem for every ``<li>`` nested in a ``<ul>``.

        Args:
            response: The downloaded page handed to the spider.

        Returns:
            A list of DmozItem objects. Each field holds the list returned
            by ``extract()`` for the matching XPath, so values are lists of
            strings rather than single strings.
        """
        sel = HtmlXPathSelector(response)
        sites = sel.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            # XPaths are relative to the current <li> node.
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items
scrapy crawl dmoz -o items.json -t json
3. 常见问题
a. spider的name不能与项目名字相同