Preface
Goal: collect movie stills to use as picture material for a public account.
Approach: use Scrapy's built-in image download pipeline (ImagesPipeline).
Target website
https://film-grab.com/
Crawling results
Because I already had a working crawler to build on, and the structure of this website is similar to that previous example, the goal was achieved quickly.
How fast was it?
Less than ten minutes.
Code reference
settings.py (redundant parts have been removed)
import os

# Project identity and the package Scrapy searches for spiders / places
# newly generated spiders in.
BOT_NAME = 'pictures'
SPIDER_MODULES = ['pictures.spiders']
NEWSPIDER_MODULE = 'pictures.spiders'

# robots.txt is not honoured by this crawler.
ROBOTSTXT_OBEY = False

# Default headers sent with every request; the User-Agent mimics a desktop
# Chrome browser.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}

# Only Scrapy's stock images pipeline is enabled; the custom pipeline entry
# below is intentionally left disabled.
ITEM_PIPELINES = {
    # 'imagedownload.pipelines.ImagedownloadPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Downloaded images are stored in an "images" directory located one level
# above the directory that contains this settings file.
IMAGES_STORE = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    'images',
)
picture.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders.crawl import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from ..items import PicturesItem
class PictureSpider(CrawlSpider):
    """Crawl film-grab.com and emit the image URLs found on linked pages.

    The crawl starts at the first listing page. Pagination links are
    followed to discover further pages; every other on-site link is fetched
    once and handed to :meth:`parse_detail`, which yields a ``PicturesItem``
    whose ``image_urls`` field lists the images to download.
    """

    name = 'picture'
    allowed_domains = ['film-grab.com']
    start_urls = ['https://film-grab.com/page/1/']

    rules = (
        # Pagination URLs: follow them to discover more links; no callback.
        # The dots in the host name are escaped so they match a literal "."
        # rather than any character.
        Rule(
            LinkExtractor(allow=r".+film-grab\.com/page/\d+"),
            follow=True,
        ),
        # Detail page URLs: parse them for images, do not follow further.
        Rule(
            LinkExtractor(allow=r"https://film-grab\.com/.+"),
            follow=False,
            callback="parse_detail",
        ),
    )

    def parse_detail(self, response):
        """Yield one ``PicturesItem`` with the image URLs of *response*.

        Only ``<img>`` elements whose ``width`` attribute equals ``1280``
        are collected; their ``src`` values become ``image_urls``.
        """
        image_urls = response.xpath("//img[@width='1280']/@src").getall()
        # Use the spider's logger instead of print() so the output respects
        # Scrapy's log level and log destination settings.
        self.logger.debug("%s -> image_urls=%s", response.url, image_urls)
        yield PicturesItem(image_urls=image_urls)
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PicturesItem(scrapy.Item):
    """Item carrying the images scraped from a single page."""

    # Source URLs of the images to download; filled in by the spider.
    image_urls = scrapy.Field()
    # Companion field to image_urls; not set by the spider itself
    # (presumably populated by the images pipeline — confirm against
    # the Scrapy media pipeline docs).
    images = scrapy.Field()
pipelines.py is left unchanged from the generated default.
summary
Scrapy really is great.