Scrapy: crawling images from the webmaster material site (sc.chinaz.com)

1. Create a project: scrapy startproject <project name>

2. Create a spider: scrapy genspider <spider name> <domain>

3. In settings.py, add a User-Agent (UA) spoof

4. Also set LOG_LEVEL and ROBOTSTXT_OBEY = False

5. In the spider file, crawl the target site and parse the data

6. In items.py, define fields for the scraped data

7. In settings.py, add the pipeline configuration

8. In pipelines.py, write a custom pipeline: from scrapy.pipelines.images import ImagesPipeline, override the method that sends requests for media resources and the file-storage method. Note: split('/')[-1] (illustrated after the pipelines.py listing below)

9. In settings.py, specify IMAGES_STORE = './imgLibs' (the path and name of the folder where images are stored)

10. In settings.py, enable the pipeline via ITEM_PIPELINES

11. Run the spider: scrapy crawl img (the full command sequence is sketched below)
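
Putting steps 1, 2, and 11 together, the command sequence for this project would look like the following (the project name imgPro1 and spider name img match the code below; the domain passed to genspider is the placeholder the author left in allowed_domains, and Scrapy's ImagesPipeline additionally requires the Pillow package):

scrapy startproject imgPro1
cd imgPro1
scrapy genspider img www.xxx.com
pip install Pillow
scrapy crawl img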

img.py

# -*- coding: utf-8 -*-
import scrapy
from imgPro1.items import Imgpro1Item


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://sc.chinaz.com/tupian/meinvtupian.html']

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # the page lazy-loads its images, so the real URL is kept
            # in the src2 pseudo-attribute rather than src
            img_src = div.xpath('./div/a/img/@src2').extract_first()
            # instantiate the item object and hand the URL to the pipeline
            item = Imgpro1Item()
            item['img_src'] = img_src
            yield item

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class Imgpro1Item(scrapy.Item):
# define the fields for your item here like:
img_src = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class Imgpro1Pipeline(ImagesPipeline):
    """Override the methods inherited from ImagesPipeline."""

    def get_media_requests(self, item, info):
        """Send the download request for the media resource; item is the
        item object submitted by the spider."""
        yield scrapy.Request(item['img_src'])

    def file_path(self, request, response=None, info=None):
        """File storage: use the last segment of the URL as the file name."""
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        # return the item so the next pipeline class in line can process it
        return item
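
To illustrate the split('/')[-1] note from step 8: file_path keeps only the last path segment of the image URL as the stored file name. A quick sketch (the URL here is a made-up example):

url = 'http://sc.chinaz.com/files/pic/202003/zzpic23584.jpg'  # hypothetical URL
print(url.split('/')[-1])  # -> zzpic23584.jpg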

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for imgPro1 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imgPro1'

SPIDER_MODULES = ['imgPro1.spiders']
NEWSPIDER_MODULE = 'imgPro1.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'imgPro1 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'imgPro1.middlewares.Imgpro1SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'imgPro1.middlewares.Imgpro1DownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'imgPro1.pipelines.Imgpro1Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# path and name of the folder where the downloaded images are stored
IMAGES_STORE = './imgLibs'
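
With this configuration the ImagesPipeline saves every downloaded image under ./imgLibs, using the file name returned by file_path, e.g. ./imgLibs/zzpic23584.jpg (file name hypothetical); the folder is created automatically on the first run.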
