Scrapy Framework: Crawling Yunqi (yunqi.qq.com)


First, create the crawler project and generate a CrawlSpider skeleton:

scrapy startproject yunqiCrawl
cd yunqiCrawl
scrapy genspider -t crawl yunqi yunqi.qq.com
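Running genspider with the crawl template drops a yunqi.py skeleton into the project's spiders package. The four files edited below all live inside the generated yunqiCrawl package; the layout produced by scrapy startproject looks roughly like this:

yunqiCrawl/
├── scrapy.cfg
└── yunqiCrawl/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── yunqi.py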

yunqi.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from yunqiCrawl.items import YunqiBookListItem, YunqiBookDetailItem


class YunqiSpider(CrawlSpider):
    name = 'yunqi'
    allowed_domains = ['yunqi.qq.com']
    start_urls = ['http://yunqi.qq.com/bk/so2/n10p1']

    # Follow every paginated listing page (/bk/so2/n10p1, n10p2, ...) and parse it.
    rules = (
        Rule(LinkExtractor(allow=r'/bk/so2/n10p\d+'), callback='parse_book_list', follow=True),
    )

    def parse_book_list(self, response):
        # Each div under #detailedBookList is one book entry on the listing page.
        books = response.xpath('//*[@id="detailedBookList"]/div')
        for book in books:
            novelImageUrl = book.xpath('./a/img/@src').extract_first()
            novelId = book.xpath('./div/h3/a/@id').extract_first()
            novelLink = book.xpath('./div/h3/a/@href').extract_first()
            novelName = book.xpath('./div/h3/a/text()').extract_first()
            novelAuthor = book.xpath('./div/dl[1]/dd[1]/a/text()').extract_first()
            novelType = book.xpath('./div/dl[1]/dd[2]/a/text()').extract_first()
            novelStatus = book.xpath('./div/dl[1]/dd[3]/text()').extract_first()
            novelUpdateTime = book.xpath('./div/dl[2]/dd[1]/text()').extract_first()
            novelWords = book.xpath('./div/dl[2]/dd[2]/text()').extract_first()
            bookListItem = YunqiBookListItem(
                novelImageUrl=novelImageUrl,
                novelId=novelId,
                novelLink=novelLink,
                novelName=novelName,
                novelAuthor=novelAuthor,
                novelType=novelType,
                novelStatus=novelStatus,
                novelUpdateTime=novelUpdateTime,
                novelWords=novelWords)
            yield bookListItem
            # Follow each book's detail page; carry novelId along so the two items can be joined later.
            yield scrapy.Request(url=novelLink, callback=self.parse_book_detail,
                                 meta={'novelId': novelId})

    def parse_book_detail(self, response):
        # The novelInfo table on the detail page holds the click/popularity/comment counters.
        novelId = response.meta['novelId']
        novelLabel = response.xpath('//*[@class="tags"]/text()').extract_first()
        novelAllClick = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[1]/text()').extract_first()
        novelMonthClick = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[1]/text()').extract_first()
        novelWeekClick = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[1]/text()').extract_first()
        novelAllPopular = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[2]/text()').extract_first()
        novelMonthPopular = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[2]/text()').extract_first()
        novelWeekPopular = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[2]/text()').extract_first()
        novelCommentNum = response.xpath('//*[@id="novelInfo"]/table/tr[5]/td[2]/text()').extract_first()
        novelAllComm = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[3]/text()').extract_first()
        novelMonthComm = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[3]/text()').extract_first()
        novelWeekComm = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[3]/text()').extract_first()
        bookDetailItem = YunqiBookDetailItem(
            novelId=novelId,
            novelLabel=novelLabel,
            novelAllClick=novelAllClick,
            novelMonthClick=novelMonthClick,
            novelWeekClick=novelWeekClick,
            novelAllPopular=novelAllPopular,
            novelMonthPopular=novelMonthPopular,
            novelWeekPopular=novelWeekPopular,
            novelCommentNum=novelCommentNum,
            novelAllComm=novelAllComm,
            novelMonthComm=novelMonthComm,
            novelWeekComm=novelWeekComm)
        yield bookDetailItem
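Before running the full crawl, the XPath expressions above can be sanity-checked interactively in the Scrapy shell. A quick sketch (the selectors are the same ones used in parse_book_list):

scrapy shell 'http://yunqi.qq.com/bk/so2/n10p1'
>>> books = response.xpath('//*[@id="detailedBookList"]/div')
>>> len(books)                                             # number of book entries matched on the page
>>> books[0].xpath('./div/h3/a/text()').extract_first()    # title of the first book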

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YunqiBookListItem(scrapy.Item):
    # define the fields for your item here like:
    novelId = scrapy.Field()
    novelName = scrapy.Field()
    novelLink = scrapy.Field()
    novelAuthor = scrapy.Field()
    novelType = scrapy.Field()
    novelStatus = scrapy.Field()
    novelUpdateTime = scrapy.Field()
    novelWords = scrapy.Field()
    novelImageUrl = scrapy.Field()


class YunqiBookDetailItem(scrapy.Item):
    # define the fields for your item here like:
    novelId = scrapy.Field()
    novelLabel = scrapy.Field()
    novelAllClick = scrapy.Field()
    novelMonthClick = scrapy.Field()
    novelWeekClick = scrapy.Field()
    novelAllPopular = scrapy.Field()
    novelMonthPopular = scrapy.Field()
    novelWeekPopular = scrapy.Field()
    novelCommentNum = scrapy.Field()
    novelAllComm = scrapy.Field()
    novelMonthComm = scrapy.Field()
    novelWeekComm = scrapy.Field()
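Both item classes behave like dictionaries, which is what the pipeline below relies on when it calls dict(item) before inserting into MongoDB. A minimal illustration (the field values here are made up):

from yunqiCrawl.items import YunqiBookListItem

item = YunqiBookListItem(novelId='12345', novelName='example')
print(dict(item))   # {'novelId': '12345', 'novelName': 'example'}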

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import re
from yunqiCrawl.items import YunqiBookListItem, YunqiBookDetailItem


class YunqicrawlPipeline(object):

    def __init__(self, mongo_uri, mongo_db, replicaset):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.replicaset = replicaset

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'yunqi'),
            replicaset=crawler.settings.get('REPLICASET')
        )

    def open_spider(self, spider):
        # Pass the replica set name through only if one was configured in settings.
        if self.replicaset:
            self.client = pymongo.MongoClient(self.mongo_uri, replicaset=self.replicaset)
        else:
            self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Listing items and detail items go to different collections.
        if isinstance(item, YunqiBookListItem):
            self._process_booklist_item(item)
        else:
            self._process_bookDetail_item(item)
        return item

    def _process_booklist_item(self, item):
        # insert_one replaces the deprecated Collection.insert.
        self.db.bookInfo.insert_one(dict(item))

    def _process_bookDetail_item(self, item):
        # Normalize the label text and keep only the numeric part of each counter field.
        pattern = re.compile(r'\d+')
        item['novelLabel'] = (item['novelLabel'] or '').strip().replace('\n', '')

        numeric_fields = [
            'novelAllClick', 'novelMonthClick', 'novelWeekClick',
            'novelAllPopular', 'novelMonthPopular', 'novelWeekPopular',
            'novelAllComm', 'novelMonthComm', 'novelWeekComm',
            'novelCommentNum',
        ]
        for field in numeric_fields:
            match = pattern.search(item[field] or '')
            item[field] = match.group() if match else item[field]

        self.db.bookhot.insert_one(dict(item))
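The counters scraped from the detail page are presumably strings with the number embedded in surrounding text, which is why _process_bookDetail_item runs every field through a \d+ search before writing to MongoDB. A standalone sketch of that cleanup (the raw value below is hypothetical):

import re

pattern = re.compile(r'\d+')
raw = '总点击:12345'                      # hypothetical raw value as scraped from the page
match = pattern.search(raw)
print(match.group() if match else raw)    # -> '12345'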

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for yunqiCrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'yunqiCrawl'

SPIDER_MODULES = ['yunqiCrawl.spiders']
NEWSPIDER_MODULE = 'yunqiCrawl.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'yunqiCrawl.middlewares.YunqicrawlSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
#    'yunqiCrawl.middlewares.RandomUserAgent.RandomUserAgent': 410,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
MONGO_URI = 'mongodb://127.0.0.1:27017/'
MONGO_DATABASE = 'yunqi'
ITEM_PIPELINES = {
    'yunqiCrawl.pipelines.YunqicrawlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

#SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#SCHEDULER_PERSIST = True
#DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#REDIS_HOST = '127.0.0.1'
#REDIS_PORT = 6379
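The commented scrapy-redis settings at the end would turn this into a distributed crawl: the scheduler and duplicate filter move into Redis, so several crawler processes can share one request queue. To try it, the package has to be installed and a Redis server must be reachable at the configured host and port, e.g.:

pip install scrapy-redis
redis-server    # or point REDIS_HOST / REDIS_PORT at an existing instance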

Finally, start the crawler:

scrapy crawl yunqi
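Once the spider finishes, a quick way to confirm that data reached MongoDB is to query the two collections the pipeline writes to. A sketch assuming the local MongoDB instance configured in settings.py:

import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['yunqi']
print(db.bookInfo.count_documents({}))   # book list records
print(db.bookhot.count_documents({}))    # detail / hotness records
print(db.bookhot.find_one())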
