创建项目
scrapy startproject dongguan
items.py
import scrapy


class DongguanItem(scrapy.Item):
    """Fields collected for one scraped question page."""

    # Title text of the page.
    title = scrapy.Field()
    # Body text of the page.
    content = scrapy.Field()
    # URL the item was scraped from.
    url = scrapy.Field()
    # Identifier of the question.
    number = scrapy.Field()
创建CrawlSpider,使用模版crawl
scrapy genspider -t crawl sun 'wz.sun0769.com'
sun.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    """Crawl the question listing on wz.sun0769.com and scrape each question.

    Starts from the first ``type=4`` listing page, follows the pagination
    links, and hands every question detail page to :meth:`parse_item`.
    """

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # Pagination links: no callback, so they are only followed.
        Rule(LinkExtractor(allow=r'type=4&page=\d+')),
        # Question detail pages. The dot is escaped so that only a literal
        # ".shtml" suffix matches.
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build a ``DongguanItem`` from one question detail page.

        Args:
            response: The downloaded detail page.

        Yields:
            DongguanItem: The item with ``title``, ``number``, ``content``
            and ``url`` filled in. Nothing is yielded when the page has no
            title node.
        """
        title = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract_first()
        if title is None:
            # Unexpected page layout: skip it instead of raising IndexError.
            self.logger.warning("No title found on %s; skipping", response.url)
            return

        item = DongguanItem()
        item['title'] = title
        # Number: the last space-separated token of the title, keeping only
        # the part after its last ":".
        item['number'] = title.split(' ')[-1].split(":")[-1]
        # Content: first text node of the body div; empty string if absent.
        item['content'] = response.xpath(
            '//div[@class="c1 text14_2"]/text()'
        ).extract_first(default='')
        # Link: URL of the page that was scraped.
        item['url'] = response.url
        yield item
pipelines.py
import json


class DongguanPipeline(object):
    """Item pipeline that writes every scraped item to ``dongguan.json``.

    Each item is serialized as one JSON object followed by a comma and a
    newline. Non-ASCII characters are written as-is, encoded as UTF-8.
    """

    def __init__(self):
        # NOTE: despite its name, ``filename`` holds the open file object;
        # the attribute name is kept for backward compatibility.
        # The file is opened in text mode with an explicit UTF-8 encoding so
        # that ``str`` can be written directly (writing encoded bytes to a
        # text-mode file raises TypeError on Python 3).
        self.filename = open("dongguan.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Args:
            item: The scraped item; anything ``dict()`` can convert.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        """Close the output file.

        Args:
            spider: The spider being closed (unused).
        """
        self.filename.close()
settings.py
# Scrapy settings for the dongguan project.

BOT_NAME = 'dongguan'

# Where Scrapy looks for spiders, and where new ones are generated.
SPIDER_MODULES = ['dongguan.spiders']
NEWSPIDER_MODULE = 'dongguan.spiders'

# Obey the robots.txt rules of the crawled site.
ROBOTSTXT_OBEY = True

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}

# Send log output to a file, at DEBUG level.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
执行
scrapy crawl sun