First, create the crawler project and generate a CrawlSpider skeleton:
scrapy startproject yunqiCrawl
cd yunqiCrawl
scrapy genspider -t crawl yunqi yunqi.qq.com
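This produces the standard Scrapy project layout; all the files edited below live under the inner yunqiCrawl/ package:

yunqiCrawl/
    scrapy.cfg
    yunqiCrawl/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            yunqi.py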
yunqi.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from yunqiCrawl.items import YunqiBookListItem, YunqiBookDetailItem


class YunqiSpider(CrawlSpider):
    name = 'yunqi'
    allowed_domains = ['yunqi.qq.com']
    start_urls = ['http://yunqi.qq.com/bk/so2/n10p1']

    # Follow every paginated list page (/bk/so2/n10p1, n10p2, ...).
    rules = (
        Rule(LinkExtractor(allow=r'/bk/so2/n10p\d+'),
             callback='parse_book_list', follow=True),
    )

    def parse_book_list(self, response):
        # Yield one summary item per book on the list page, plus a
        # follow-up request to the book's detail page.
        books = response.xpath('//*[@id="detailedBookList"]/div')
        for book in books:
            novelImageUrl = book.xpath('./a/img/@src').extract_first()
            novelId = book.xpath('./div/h3/a/@id').extract_first()
            novelLink = book.xpath('./div/h3/a/@href').extract_first()
            novelName = book.xpath('./div/h3/a/text()').extract_first()
            novelAuthor = book.xpath('./div/dl[1]/dd[1]/a/text()').extract_first()
            novelType = book.xpath('./div/dl[1]/dd[2]/a/text()').extract_first()
            novelStatus = book.xpath('./div/dl[1]/dd[3]/text()').extract_first()
            novelUpdateTime = book.xpath('./div/dl[2]/dd[1]/text()').extract_first()
            novelWords = book.xpath('./div/dl[2]/dd[2]/text()').extract_first()
            bookListItem = YunqiBookListItem(
                novelImageUrl=novelImageUrl,
                novelId=novelId,
                novelLink=novelLink,
                novelName=novelName,
                novelAuthor=novelAuthor,
                novelType=novelType,
                novelStatus=novelStatus,
                novelUpdateTime=novelUpdateTime,
                novelWords=novelWords)
            yield bookListItem
            # urljoin keeps this working even if @href is relative;
            # novelId travels in meta so the detail item can be joined back.
            yield scrapy.Request(url=response.urljoin(novelLink),
                                 callback=self.parse_book_detail,
                                 meta={'novelId': novelId})

    def parse_book_detail(self, response):
        # The #novelInfo table holds clicks, popularity and comment
        # counts, broken down by total / month / week.
        novelId = response.meta['novelId']
        novelLabel = response.xpath('//*[@class="tags"]/text()').extract_first()
        novelAllClick = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[1]/text()').extract_first()
        novelMonthClick = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[1]/text()').extract_first()
        novelWeekClick = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[1]/text()').extract_first()
        novelAllPopular = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[2]/text()').extract_first()
        novelMonthPopular = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[2]/text()').extract_first()
        novelWeekPopular = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[2]/text()').extract_first()
        novelCommentNum = response.xpath('//*[@id="novelInfo"]/table/tr[5]/td[2]/text()').extract_first()
        novelAllComm = response.xpath('//*[@id="novelInfo"]/table/tr[2]/td[3]/text()').extract_first()
        novelMonthComm = response.xpath('//*[@id="novelInfo"]/table/tr[3]/td[3]/text()').extract_first()
        novelWeekComm = response.xpath('//*[@id="novelInfo"]/table/tr[4]/td[3]/text()').extract_first()
        bookDetailItem = YunqiBookDetailItem(
            novelId=novelId,
            novelLabel=novelLabel,
            novelAllClick=novelAllClick,
            novelMonthClick=novelMonthClick,
            novelWeekClick=novelWeekClick,
            novelAllPopular=novelAllPopular,
            novelMonthPopular=novelMonthPopular,
            novelWeekPopular=novelWeekPopular,
            novelCommentNum=novelCommentNum,
            novelAllComm=novelAllComm,
            novelMonthComm=novelMonthComm,
            novelWeekComm=novelWeekComm)
        yield bookDetailItem
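Before running the full crawl, the XPath selectors above can be sanity-checked in an interactive session (assuming the list page is still reachable):

scrapy shell 'http://yunqi.qq.com/bk/so2/n10p1'
>>> response.xpath('//*[@id="detailedBookList"]/div/div/h3/a/text()').extract_first()
>>> response.xpath('//*[@id="detailedBookList"]/div/a/img/@src').extract_first()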
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YunqiBookListItem(scrapy.Item):
    # Summary fields scraped from the book list pages.
    novelId = scrapy.Field()
    novelName = scrapy.Field()
    novelLink = scrapy.Field()
    novelAuthor = scrapy.Field()
    novelType = scrapy.Field()
    novelStatus = scrapy.Field()
    novelUpdateTime = scrapy.Field()
    novelWords = scrapy.Field()
    novelImageUrl = scrapy.Field()


class YunqiBookDetailItem(scrapy.Item):
    # Click/popularity/comment statistics from the book detail pages.
    novelId = scrapy.Field()
    novelLabel = scrapy.Field()
    novelAllClick = scrapy.Field()
    novelMonthClick = scrapy.Field()
    novelWeekClick = scrapy.Field()
    novelAllPopular = scrapy.Field()
    novelMonthPopular = scrapy.Field()
    novelWeekPopular = scrapy.Field()
    novelCommentNum = scrapy.Field()
    novelAllComm = scrapy.Field()
    novelMonthComm = scrapy.Field()
    novelWeekComm = scrapy.Field()
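A scrapy.Item behaves like a dict, which is what lets the pipeline below persist it with dict(item). A quick illustration, using made-up field values:

from yunqiCrawl.items import YunqiBookListItem

item = YunqiBookListItem(novelId='12345', novelName='example')  # hypothetical values
item['novelAuthor'] = 'someone'  # dict-style assignment also works
print(dict(item))                # plain dict, ready for MongoDB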
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

import pymongo

from yunqiCrawl.items import YunqiBookListItem


class YunqicrawlPipeline(object):

    def __init__(self, mongo_uri, mongo_db, replicaset):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.replicaset = replicaset

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection settings from settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'yunqi'),
            replicaset=crawler.settings.get('REPLICASET')
        )

    def open_spider(self, spider):
        # Only pass replicaSet when one is configured; REPLICASET is not
        # set in the settings below, so this connects to the single node.
        if self.replicaset:
            self.client = pymongo.MongoClient(self.mongo_uri,
                                              replicaSet=self.replicaset)
        else:
            self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # List items go to the bookInfo collection, detail items to bookhot.
        if isinstance(item, YunqiBookListItem):
            self._process_booklist_item(item)
        else:
            self._process_bookDetail_item(item)
        return item

    def _process_booklist_item(self, item):
        self.db.bookInfo.insert_one(dict(item))

    def _process_bookDetail_item(self, item):
        # Normalize the label, then keep only the first number found in
        # each statistics field (e.g. '总点击:1234' becomes '1234').
        pattern = re.compile(r'\d+')
        item['novelLabel'] = (item['novelLabel'] or '').strip().replace('\n', '')
        for field in ('novelAllClick', 'novelMonthClick', 'novelWeekClick',
                      'novelAllPopular', 'novelMonthPopular', 'novelWeekPopular',
                      'novelAllComm', 'novelMonthComm', 'novelWeekComm',
                      'novelCommentNum'):
            value = item[field]
            match = pattern.search(value) if value else None
            item[field] = match.group() if match else value
        self.db.bookhot.insert_one(dict(item))
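One optional hardening step not in the pipeline above: re-running the crawl inserts the same books again, so a unique index on novelId makes deduplication straightforward. A minimal sketch, assuming the local MongoDB from the settings below:

import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['yunqi']
# With this index in place, _process_booklist_item could switch to
# db.bookInfo.replace_one({'novelId': item['novelId']}, dict(item), upsert=True)
db.bookInfo.create_index('novelId', unique=True)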
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for yunqiCrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'yunqiCrawl'
SPIDER_MODULES = ['yunqiCrawl.spiders']
NEWSPIDER_MODULE = 'yunqiCrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yunqiCrawl.middlewares.YunqicrawlSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
# 'yunqiCrawl.middlewares.RandomUserAgent.RandomUserAgent': 410,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
MONGO_URI = 'mongodb://127.0.0.1:27017/'
MONGO_DATABASE = 'yunqi'

ITEM_PIPELINES = {
    'yunqiCrawl.pipelines.YunqicrawlPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Optional scrapy_redis settings: uncomment to share the scheduler and
# duplicate filter through Redis for a distributed crawl.
#SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#SCHEDULER_PERSIST = True
#DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#REDIS_HOST = '127.0.0.1'
#REDIS_PORT = 6379
Finally, start the crawler:
scrapy crawl yunqi
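Once the crawl finishes, the results can be spot-checked from a Python shell (assuming the MONGO_URI and MONGO_DATABASE defaults above):

import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = client['yunqi']
print(db.bookInfo.count_documents({}))  # book summaries from the list pages
print(db.bookhot.count_documents({}))   # statistics from the detail pages
print(db.bookInfo.find_one())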