python3_scrapy crawls Douyu's mobile data API (anchor photos from the face-value ("颜值") module)

1. Capture the data packets on the phone (you can find a corresponding packet-capture tutorial on Baidu); a quick way to verify the captured endpoint is sketched below.
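
A minimal sketch (not part of the original project) to confirm the captured endpoint returns the expected JSON before writing the spider; it assumes the third-party requests library and that the endpoint below, taken from douyu.py, still responds:

import json

import requests

url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0"
# Mobile User-Agent, matching the one configured later in settings.py
headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X)"}

resp = requests.get(url, headers=headers)
data = json.loads(resp.text)
print(list(data.keys()))   # expect keys like "error" and "data"
print(data["data"][0])     # the first anchor record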


2. Implementation

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # 1. The category/section the anchor streams under
    game_name = scrapy.Field()

    # 2. The anchor's room number
    room_id = scrapy.Field()

    # 3. URL of the anchor's photo (the image the pipeline downloads)
    vertical_src = scrapy.Field()

    # 4. The anchor's nickname
    nickname = scrapy.Field()

    # 5. Photo name, assembled from the fields above (used when renaming the downloaded file)
    message = scrapy.Field()
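
A quick illustration (not from the original post) of how scrapy.Item behaves: it works like a dict, but only declared fields may be assigned:

from Douyu.items import DouyuItem

item = DouyuItem()
item["nickname"] = "test"
print(item)            # {'nickname': 'test'}
# item["foo"] = 1      # would raise KeyError: DouyuItem does not support field: foo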

douyu.py

# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['douyucdn.cn']

    # 0. API endpoint captured from the face-value ("颜值") category of the Douyu mobile app
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [baseURL+str(offset)]

    def parse(self, response):
        # 1. Decode the JSON response body into a Python object (here a dict);
        #    the value of the "data" key is a list, and each element is a dict
        data_list = json.loads(response.body)["data"]

        # 2. Stop when the data list is empty; otherwise keep paging
        if not data_list:
            return

        for data in data_list:
            item = DouyuItem()
            item["game_name"] = data["game_name"]
            item["room_id"] = data["room_id"]
            item["vertical_src"] = data["vertical_src"]
            item["nickname"] = data["nickname"]
            item["message"] = data["game_name"]+data["room_id"]+data["nickname"]

            # 3. Return the item object to the pipeline for processing
            yield item

        # 4. The response is JSON, not HTML, so xpath() cannot be used; page through the API by increasing the offset
        self.offset += 20
        # 5. Hand the next request back to the engine; the engine passes it to the scheduler, which queues it
        yield scrapy.Request(self.baseURL+str(self.offset), callback=self.parse)
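
For reference, a trimmed, hypothetical sketch of the response shape the spider relies on; the key names come from the code above, the values are made up:

# Hypothetical, trimmed response (values made up); the spider reads exactly
# these keys from each element of "data":
sample_response = {
    "error": 0,          # assumption: 0 means success
    "data": [
        {
            "game_name": "...",
            "room_id": "...",
            "vertical_src": "http://.../photo.jpg",
            "nickname": "...",
        },
    ],
}
# When "data" is an empty list, parse() returns and the paging stops.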

pipelines.py

# -*- coding: utf-8 -*-
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 1. Import the ImagesPipeline class from scrapy.pipelines.images (the images.py module in the scrapy package's pipelines folder)
import os
import scrapy
from Douyu.settings import IMAGES_STORE as images_store
from scrapy.pipelines.images import ImagesPipeline


class DouyuPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        """Picture download"""
        image_link = item["vertical_src"]
        # 2. The pipeline passes the image link to the downloader to download
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        """Picture rename"""
        # 3. Take out the picture information in the results: the storage path of the picture
        # results is a list, a tuple in the list, two elements in the tuple, and the second element is again a dictionary/md5 code
        # Deductive writing value (the outermost list is equivalent to the results list)
        img_path = [x["path"] for ok, x in results if ok]

        # 4. Rename the downloaded image
        # 4.os.rename(src_name, dst_name)
        os.rename(images_store+img_path[0], images_store + item["message"]+".jpg")

        return item
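
As an alternative to renaming in item_completed(), a subclass can override file_path() so the image is written under the desired name in the first place. This is a sketch, not the original project's approach; the keyword-only item argument is supplied by Scrapy 2.4 and later:

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DouyuRenamePipeline(ImagesPipeline):
    """Sketch: save each photo directly as item["message"] + ".jpg"."""

    def get_media_requests(self, item, info):
        yield scrapy.Request(item["vertical_src"])

    def file_path(self, request, response=None, info=None, *, item=None):
        # item is passed in by Scrapy >= 2.4
        return item["message"] + ".jpg"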

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for Douyu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Douyu'

SPIDER_MODULES = ['Douyu.spiders']
NEWSPIDER_MODULE = 'Douyu.spiders'


# 1. Set IMAGES_STORE, the directory where downloaded images will be saved
# 1. The path can also be written with escaped backslashes:
#    IMAGES_STORE = "E:\\Python\\course code\\Douyu\\Douyu\\spiders\\"
IMAGES_STORE = "E:/Python/course code/Douyu/Douyu/spiders/"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 2. Configure a mobile User-Agent so the API responds as it does for the app
USER_AGENT = 'Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1'

# Obey robots.txt rules
# 3. Disable the robots protocol (set it to False or simply comment the line out)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Douyu.middlewares.DouyuDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 4. Enable pipeline file
ITEM_PIPELINES = {
   'Douyu.pipelines.DouyuPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

start.py

# -*- coding:utf-8 -*-
from scrapy import cmdline


# Equivalent to running "scrapy crawl douyu" from the project root
cmdline.execute("scrapy crawl douyu".split())
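
An alternative launcher (a sketch, not from the original post) that runs the spider in-process via Scrapy's CrawlerProcess API:

# -*- coding:utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


# Load the project's settings.py and run the "douyu" spider in-process
process = CrawlerProcess(get_project_settings())
process.crawl("douyu")
process.start()   # blocks until the crawl finishes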

3. Crawl results: the downloaded anchor photos


