用scrapy爬取小说网站,并保存到数据库

spider.py

# -*- coding: utf-8 -*-
import scrapy
from datetime import datetime
from novel.items import NovelItem,ChapterItem
import hashlib
class A17kSpider(scrapy.Spider):
    """Crawl 17k.com's novel catalogue, then each novel's chapter list and
    chapter bodies.

    Flow: ``parse`` (catalogue page) -> ``chapter`` (chapter list) ->
    ``content`` (single chapter page).  Novels are keyed by an MD5 digest of
    the novel name; chapters reference that key through ``novel_id``.
    """
    name = '17k'
    allowed_domains = [
        'all.17k.com',
        'www.17k.com',
    ]
    start_urls = ['http://all.17k.com/lib/book/2_0_0_0_2_0_1_0_1.html']

    def parse(self, response):
        """Parse one catalogue page: yield a NovelItem per table row, follow
        each novel's chapter-list link, then follow the next catalogue page."""
        rows = response.css("table tbody tr")
        uuids = []  # uuid per data row, in page order, to pair with chapter links
        for row in rows[1:]:  # row 0 is the table header
            novel_name = row.css('.td3 a::text').extract_first()
            if not novel_name:
                continue  # malformed row; nothing we can key on
            # Fresh md5 per novel: reusing one digest object would accumulate
            # every previous name via update() and produce wrong ids.
            uuid_id = hashlib.md5(novel_name.encode("utf-8")).hexdigest()
            uuids.append(uuid_id)

            # Fresh item per row: Scrapy processes yielded items
            # asynchronously, so a single shared instance must not be mutated.
            novel_item = NovelItem()
            novel_item['uuid'] = uuid_id
            novel_item['novel_name'] = novel_name
            novel_item["author"] = row.css(".td6 a::text").extract_first()
            status = row.css(".td8 em::text").extract_first()
            novel_item["status"] = status.strip() if status else status
            novel_item["word_number"] = row.css(".td5::text").extract_first()
            novel_item["lastest_chapter"] = row.css(".td4 a::text").extract_first()
            novel_item["category"] = row.css(".td2 a::text").extract_first()
            novel_item["update_time"] = datetime.strptime(
                row.css(".td7::text").extract_first(), "%Y-%m-%d %H:%M")
            yield novel_item

        # Chapter-list links appear in the same order as the table rows, so
        # pair each link with its row's uuid.  (The original attached the
        # *last* row's uuid to every request.)  NOTE(review): assumes one
        # `.jt` link per row in row order — confirm against the page markup.
        chapter_links = response.css(".jt::attr(href)").re(r"\d+\.html")
        for uuid_id, link in zip(uuids, chapter_links):
            url = "http://www.17k.com/list/%s" % (link)
            yield scrapy.Request(url, callback=self.chapter, meta={'uuid': uuid_id})

        next_url = response.css(".page a:nth-last-child(4)::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)

    def chapter(self, response):
        """Parse a novel's chapter list and request every chapter page,
        carrying the novel uuid and the chapter's position in the list."""
        uuid_id = response.meta['uuid']
        chapter_urls = response.css(".Volume dd ::attr(href)").extract()
        for index, item_url in enumerate(chapter_urls):
            yield response.follow(item_url, callback=self.content,
                                  meta={'uuid': uuid_id, 'index': index})

    def content(self, response):
        """Extract one chapter's title and body into a ChapterItem."""
        chapter_item = ChapterItem()
        title = response.css(".readAreaBox.content h1::text").extract_first()
        chapter_item['chapter_name'] = title.strip() if title else title
        chapter_item['content'] = response.css(".readAreaBox.content .p").extract_first()
        chapter_item['novel_id'] = response.meta['uuid']
        chapter_item['sequence'] = response.meta['index']
        yield chapter_item
       

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
class NovelItem(scrapy.Item):
    """One novel scraped from the 17k.com catalogue listing."""
    uuid = scrapy.Field()
    novel_name = scrapy.Field()
    author = scrapy.Field()
    status = scrapy.Field()
    word_number = scrapy.Field()
    # Field name intentionally kept as "lastest_chapter" to match the
    # existing DB column app01_novelitem.lastest_chapter.
    lastest_chapter = scrapy.Field()
    category = scrapy.Field()
    update_time = scrapy.Field()

    def get_insert_sql(self):
        """Return an ``(sql, params)`` pair for inserting this novel."""
        insert_sql="""
        insert into app01_novelitem(uuid,novel_name,author,status,word_number,lastest_chapter,category,update_time)
        values(%s,%s,%s,%s,%s,%s,%s,%s) 
        """
        columns = ('uuid', 'novel_name', 'author', 'status',
                   'word_number', 'lastest_chapter', 'category', 'update_time')
        params = tuple(self[col] for col in columns)
        return insert_sql, params
class ChapterItem(scrapy.Item):
    """One chapter of a novel; ``novel_id`` references NovelItem.uuid."""
    chapter_name = scrapy.Field()
    content = scrapy.Field()
    novel_id = scrapy.Field()
    sequence = scrapy.Field()

    def get_insert_sql(self):
        """Return an ``(sql, params)`` pair for inserting this chapter."""
        insert_sql="""
        insert into app01_chapteritem(chapter_name,content,novel_id,sequence)
        values(%s,%s,%s,%s) 
        """
        columns = ('chapter_name', 'content', 'novel_id', 'sequence')
        params = tuple(self[col] for col in columns)
        return insert_sql, params



pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
class NovelPipeline(object):
    """No-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do; return the item so later pipelines still see it.
        return item
class MysqlTwistedPipeline(object):
    """Write items to MySQL through a Twisted adbapi connection pool, so
    database inserts run off the crawl's main loop."""

    def __init__(self, dbpool):
        # dbpool: a twisted.enterprise.adbapi.ConnectionPool (or compatible).
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from project settings (the MYSQL_* keys)."""
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset="utf8",
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        # Return the item so any later pipelines still receive it
        # (the original returned None, silently dropping it downstream).
        return item

    def handle_error(self, failure, item, spider):
        # Surface insert failures without aborting the crawl for one bad row.
        print(failure)

    def do_insert(self, cursor, item):
        # Runs in a pool thread; each item class supplies its own SQL
        # via get_insert_sql() (parameterized — no string-built SQL).
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

middlewares.py

from fake_useragent import UserAgent 
class RandomUserAgentMiddlerware(object):
    """Downloader middleware that stamps each outgoing request with a
    User-Agent drawn from fake_useragent."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddlerware, self).__init__()
        self.ua = UserAgent()
        # Which UA family to draw from (e.g. 'random', 'chrome').
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        user_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault("User-Agent", user_agent)

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for novel project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'novel'

SPIDER_MODULES = ['novel.spiders']
NEWSPIDER_MODULE = 'novel.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'novel (+http://www.yourdomain.com)'

# Deliberately ignore robots.txt for this scraper (the random User-Agent
# middleware below supplies per-request UA headers instead).
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'novel.middlewares.NovelSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Enable the random User-Agent middleware defined in novel/middlewares.py.
DOWNLOADER_MIDDLEWARES = {
   'novel.middlewares.RandomUserAgentMiddlerware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Items are persisted to MySQL asynchronously via Twisted adbapi.
ITEM_PIPELINES = {
   'novel.pipelines.MysqlTwistedPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# UA family for RandomUserAgentMiddlerware ('random' picks any browser UA).
RANDOM_UA_TYPE="random"
# MySQL connection settings consumed by MysqlTwistedPipeline.from_settings.
# NOTE: root with an empty password is for local development only — use a
# dedicated, password-protected account in production.
MYSQL_HOST='127.0.0.1'
MYSQL_USER='root'
MYSQL_PASSWORD=''
MYSQL_DBNAME='novel'

先写到这里,这儿主要在middleware里面添加了headers,到时候再添加布隆过滤,分布式,代理等技术

猜你喜欢

转载自blog.csdn.net/qq123aa2006/article/details/88953668