Scraping the Sunshine Q&A (阳光问政) platform

Create the project

scrapy startproject dongguan
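This generates the standard Scrapy project skeleton (the exact files vary slightly by Scrapy version):

dongguan/
    scrapy.cfg            # deploy configuration
    dongguan/
        __init__.py
        items.py          # item definitions (edited below)
        middlewares.py
        pipelines.py      # item pipelines (edited below)
        settings.py       # project settings (edited below)
        spiders/
            __init__.py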

items.py

import scrapy


class DongguanItem(scrapy.Item):
    # Fields scraped from each question detail page
    title = scrapy.Field()    # post title
    content = scrapy.Field()  # body text of the complaint
    url = scrapy.Field()      # detail-page URL
    number = scrapy.Field()   # issue number parsed from the title

Create a CrawlSpider using the crawl template

scrapy genspider -t crawl sun 'wz.sun0769.com'
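If you forget the template name, scrapy genspider -l prints the templates that ship with Scrapy:

scrapy genspider -l
# Available templates:
#   basic
#   crawl
#   csvfeed
#   xmlfeed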

sun.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # Follow pagination links; with no callback, follow defaults to True
        Rule(LinkExtractor(allow=r'type=4&page=\d+')),
        # Parse each question detail page
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = DongguanItem()

        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Issue number: the digits after the last colon in the title
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content
        item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
        # Link to the detail page
        item['url'] = response.url

        yield item
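A quick sanity check of the number extraction, using a made-up title in the format the chained splits assume (a trailing '编号:<digits>' segment separated by a space and an ASCII colon):

# Hypothetical title string, for illustration only
title = 'A sample question title 编号:191166'
print(title.split(' ')[-1].split(':')[-1])  # -> 191166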

pipelines.py

import json

class DongguanPipeline(object):
    def __init__(self):
        # Open the output file in text mode with an explicit encoding;
        # json.dumps returns str, so no manual encode() is needed (Python 3)
        self.file = open("dongguan.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(text)
        return item

    def close_spider(self, spider):
        self.file.close()
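Note that one object plus a trailing comma per line does not add up to a single valid JSON document. As an alternative sketch, Scrapy's built-in JsonLinesItemExporter writes one self-contained JSON object per line (the pipeline class name below is made up for this example; exporters expect a binary-mode file):

from scrapy.exporters import JsonLinesItemExporter


class DongguanJsonLinesPipeline(object):
    def open_spider(self, spider):
        self.file = open("dongguan.jl", "wb")  # exporters write bytes
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()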

settings.py

BOT_NAME = 'dongguan'

SPIDER_MODULES = ['dongguan.spiders']
NEWSPIDER_MODULE = 'dongguan.spiders'

# Set to False if the site's robots.txt blocks the crawl
ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}

LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

Run the spider

scrapy crawl sun
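Alternatively, for a quick dump without the custom pipeline, Scrapy's built-in feed export can write the items directly (the .jl extension selects JSON Lines output):

scrapy crawl sun -o dongguan.jl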

Reposted from www.cnblogs.com/wanglinjie/p/9211212.html