Python link extractor: CrawlSpider

Use case: crawling a site with an unknown number of pages whose data links follow a regular pattern.

Advantage: the regular URLs can be matched with regular expressions, XPath, or CSS selectors, and the matched pages handed off for parsing.

Simple demo: 

from scrapy.linkextractors import LinkExtractor

# allow: a regular expression; \d matches a digit, \d+ one or more digits
link = LinkExtractor(allow=r'book/1188_\d+\.html')
# restrict_xpaths: only extract links from the region matched by the XPath
# link1 = LinkExtractor(restrict_xpaths='//div[@id="su"]')
print(link.extract_links(response))  # response comes from a spider callback or scrapy shell
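To try the extractor without writing a full spider, one option is scrapy shell (a sketch; the page URL just follows the pattern above and assumes the site is reachable):

scrapy shell https://www.dushu.com/book/1188_1.html
>>> from scrapy.linkextractors import LinkExtractor
>>> link = LinkExtractor(allow=r'book/1188_\d+\.html')
>>> link.extract_links(response)   # a list of Link objects, one per pagination page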

Actual case

1. Create a project

# Create the project
scrapy startproject dushuproject
# Move into the spiders directory
cd dushuproject/dushuproject/spiders
# Generate a CrawlSpider-based spider
scrapy genspider -t crawl read www.dushu.com
# Then adjust settings.py, define the item in items.py, and configure pipelines.py

allow: a regular expression matched against each link's URL; only matching links are extracted.
follow: whether the rules are applied again to the pages a rule fetches. True means no limit: every crawled response is searched for further matching links, so all matching pages are eventually found. False means only the matching links found in the first response (the start page) are followed.
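For example, contrasting the two values with the same pattern the spider below uses (a sketch):

# follow=True: every crawled page is searched again, so pagination is discovered transitively
Rule(LinkExtractor(allow=r'/book/1107_\d+\.html'), callback='parse_item', follow=True)
# follow=False: only the matching links found on the start page are crawled
Rule(LinkExtractor(allow=r'/book/1107_\d+\.html'), callback='parse_item', follow=False)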

read.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from read_book.items import ReadBookItem


class ReadSpider(CrawlSpider):
    name = 'read'
    allowed_domains = ['www.dushu.com']
    # Note: start from 1107_1.html; the bare category home page would be
    # missed because the rules regex below does not match it.
    start_urls = ['https://www.dushu.com/book/1107_1.html']

    rules = (
        # allow: regular expression matched against link URLs
        # follow=True: search every crawled response for more matching links (no limit);
        # follow=False: only follow the matches found in the first response
        Rule(LinkExtractor(allow=r'/book/1107_\d+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # The cover images are lazy-loaded: the real URL sits in the
        # data-original attribute, and src only holds a placeholder.
        img_list = response.xpath("//div[@class='bookslist']/ul/li//div//img")
        for img in img_list:
            src = img.xpath("./@data-original").extract_first()
            if src is None:  # fall back for images that are not lazy-loaded
                src = img.xpath("./@src").extract_first()
            name = img.xpath("./@alt").extract_first()
            book = ReadBookItem(name=name, src=src)
            yield book
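Run the spider from inside the project:

scrapy crawl read

Scrapy applies the rule to the start response, follows the matching pagination links, and funnels every yielded ReadBookItem through the pipelines configured below.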


settings.py: enable the pipelines (including the MySQL one) and add the MySQL connection settings

# Scrapy settings for read_book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'read_book'

SPIDER_MODULES = ['read_book.spiders']
NEWSPIDER_MODULE = 'read_book.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'read_book (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'read_book.middlewares.ReadBookSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'read_book.middlewares.ReadBookDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'read_book.pipelines.ReadBookPipeline': 300,
   'read_book.pipelines.MysqlPipeline': 301,
}
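# Pipelines run in ascending order of the value: ReadBookPipeline (300)
# writes book.json first, then MysqlPipeline (301) stores the item in MySQL.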

# MySQL configuration (read by MysqlPipeline below)
DB_HOST = "127.0.0.1"
DB_PORT = 3306
DB_USER = "root"
DB_PASSWORD = "123456"
DB_CHARSET = "utf8"
DB_NAME = "spider"


# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql  # MySQL driver
from scrapy.utils.project import get_project_settings  # reads settings.py at runtime

class ReadBookPipeline:
    def open_spider(self, spider):
        print("Spider opened:", spider)
        self.fp = open("book.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        print("Spider closed:", spider)
        self.fp.close()
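    # Note: str(item) writes Python repr lines, so book.json is not strictly
    # valid JSON. A minimal alternative (a sketch; requires "import json" at
    # the top) using the ItemAdapter import above:
    #
    #     def process_item(self, item, spider):
    #         self.fp.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + "\n")
    #         return item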

# Store items in MySQL
class MysqlPipeline:
    def open_spider(self, spider):
        # Read the connection parameters defined in settings.py
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                    password=self.password, charset=self.charset,
                                    database=self.name)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Use a parameterized query; formatting values into the SQL string
        # directly would be vulnerable to SQL injection
        sql = "insert into book(src, name) values (%s, %s)"
        self.cursor.execute(sql, (item['src'], item['name']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

items.py


import scrapy


class ReadBookItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()  # book title (img alt attribute)
    src = scrapy.Field()   # cover image URL

 
