Python Crawler in Practice: Using Scrapy to Scrape, Rename, and Download Images of Every Streamer in Bilibili's Entertainment Live Section

Python crawler: Bilibili entertainment live cover images and keyframes, yours for the taking

I. Preparation

Create the Scrapy project

scrapy startproject BiliBili

cd BiliBili 

scrapy genspider bz "bilibili.com"

Create a Python launcher file
start.py

from scrapy import cmdline


cmdline.execute("scrapy crawl bz".split())

Modify and complete settings.py.
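The screenshots originally shown here covered these edits; in essence they are the following settings, all of which also appear in the complete settings.py in Part III: disable robots.txt, raise the log level, set default request headers with a browser User-Agent, register the image pipeline, and set the image store path.

ROBOTSTXT_OBEY = False

LOG_LEVEL = "ERROR"

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

ITEM_PIPELINES = {
   'BiliBili.pipelines.BilibiliPipeline': 300,
}

IMAGES_STORE = "Download"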

Preparation is done; on to the analysis.

II. Code and Page Analysis

1. Page analysis
Open Bilibili's entertainment live section. The sub-modules have different numbers of streamers, so we will simply crawl all of them.
Press F12 to open the developer tools and switch to the XHR tab.

Go through the requests one by one until you find the one that carries the data.

Requesting this URL in the browser returns the JSON data we need.

Copy the JSON into an online JSON tool to format and validate it.
Analyze the JSON data.
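As a quick check outside Scrapy, you can fetch the endpoint with requests and print the three fields this project uses. This is a minimal sketch; it assumes the layout described above, a data.list array whose entries contain uname, user_cover and system_cover, which is exactly what the spider below reads.

import requests

url = ("https://api.live.bilibili.com/room/v3/area/getRoomList"
       "?platform=web&parent_area_id=1&cate_id=0&area_id=0"
       "&sort_type=sort_type_152&page=1&page_size=30")
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, headers=headers)
data_list = resp.json()["data"]["list"]

# Streamer name, cover image URL and keyframe URL for the first few rooms
for room in data_list[:3]:
    print(room["uname"], room["user_cover"], room["system_cover"])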

Scroll down the page and more requests appear; keep scrolling until you find the second request that carries data.
Analyze the request URL.

Locate where the data sits in the response.
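Comparing the two request URLs shows that only the page parameter changes (page=1, page=2, ...) while everything else stays the same, so further pages can be built by substituting the page number. A small sketch (the limit of 4 pages matches the spider below):

base_url = ("https://api.live.bilibili.com/room/v3/area/getRoomList"
            "?platform=web&parent_area_id=1&cate_id=0&area_id=0"
            "&sort_type=sort_type_152&page={}&page_size=30")

# Pages 1 through 4, the same range the spider requests
page_urls = [base_url.format(page) for page in range(1, 5)]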

2. Code analysis
bz.py

# -*- coding: utf-8 -*-
import scrapy
import json
from BiliBili.items import BilibiliItem


class BzSpider(scrapy.Spider):
    name = 'bz'
    allowed_domains = ['bilibili.com']
    # API URL for the first page
    start_urls = ['https://api.live.bilibili.com/room/v3/area/getRoomList?platform=web&parent_area_id=1&cate_id=0&area_id=0&sort_type=sort_type_152&page=1&page_size=30']
    num = 1

    def parse(self, response):
        # Parse the JSON, then walk the list to pull out the fields we need
        data_list = json.loads(response.text)["data"]["list"]

        for data in data_list:
            uname = data['uname']  # streamer name
            user_cover = data["user_cover"]  # cover image URL
            system_cover = data["system_cover"]  # keyframe URL

            item = BilibiliItem(uname=uname, user_cover=user_cover, system_cover=system_cover)
            yield item

        # Request the following pages
        self.num += 1
        url = "https://api.live.bilibili.com/room/v3/area/getRoomList?platform=web&parent_area_id=1&cate_id=0&area_id=0&sort_type=sort_type_152&page=" + str(self.num) + "&page_size=30"

        # Limit the number of pages requested
        if self.num <= 4:
            yield scrapy.Request(url=url, callback=self.parse)
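A small aside: on Scrapy 2.2 and newer, json.loads(response.text) can also be written with the response's built-in JSON helper; inside parse the equivalent one-liner would be:

        data_list = response.json()["data"]["list"]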

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from BiliBili import settings
import os


class BilibiliPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        uname = item["uname"]
        img_cover = item["user_cover"]
        # Request the cover image URL, carrying the streamer name along
        yield scrapy.Request(img_cover, meta={"uname": uname})

        img_crux = item['system_cover']
        # Request the keyframe URL, carrying the streamer name along
        yield scrapy.Request(img_crux, meta={"uname": uname})

        # The requests are therefore issued in this order:
        #   streamer 1 cover URL
        #   streamer 1 keyframe URL
        #   streamer 2 cover URL
        #   streamer 2 keyframe URL
        #   ...

    def file_path(self, request, response=None, info=None):
        # Take the file name from the URL
        file_name = request.url.split('/')[-1]

        # Some names come back as xxx.jpg?xxxx, so strip the query string as well
        file_name = file_name.split("?")[0]

        # Streamer name passed along in the request meta
        category = request.meta['uname']
        # Save path configured in settings.py
        images_store = settings.IMAGES_STORE

        # Path of the per-streamer folder
        category_path = os.path.join(images_store, category)

        # Images are saved into a folder named after the streamer under the store path.
        # Because of the request order above, covers and keyframes can be told apart
        # by whether the streamer's folder already exists.
        if not os.path.exists(category_path):
            # Join the folder and file name
            image_name = os.path.join(category, file_name)
            # Replace the original base name with 封面图 (cover image)
            name = os.path.splitext(file_name)[0]
            image_name = image_name.replace(name, "封面图")
            # Return the path for the cover image
            return image_name
        else:
            image_name02 = os.path.join(category, file_name)
            # Replace the original base name with 关键帧 (keyframe)
            name1 = os.path.splitext(file_name)[0]
            image_name02 = image_name02.replace(name1, "关键帧")
            # Return the path for the keyframe
            return image_name02
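The folder-existence check works because the cover request for a streamer is always yielded before the keyframe request, but it does depend on download order and on the folder not being left over from an earlier run. A more explicit alternative (a sketch, not what the original code does) is to tag each request with its image type in meta and read it back in file_path:

    def get_media_requests(self, item, info):
        uname = item["uname"]
        # Tag each request with the name the downloaded file should get
        yield scrapy.Request(item["user_cover"], meta={"uname": uname, "kind": "封面图"})
        yield scrapy.Request(item["system_cover"], meta={"uname": uname, "kind": "关键帧"})

    def file_path(self, request, response=None, info=None):
        # Keep the original extension, dropping any ?query suffix
        ext = request.url.split('/')[-1].split("?")[0].split(".")[-1]
        return os.path.join(request.meta["uname"], request.meta["kind"] + "." + ext)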

The result: under the Download folder, a sub-folder is created for each streamer, holding the renamed cover image (封面图) and keyframe (关键帧).

III. Complete Code

bz.py

# -*- coding: utf-8 -*-
import scrapy
import json
from BiliBili.items import BilibiliItem


class BzSpider(scrapy.Spider):
    name = 'bz'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://api.live.bilibili.com/room/v3/area/getRoomList?platform=web&parent_area_id=1&cate_id=0&area_id=0&sort_type=sort_type_152&page=1&page_size=30']
    num = 1

    def parse(self, response):
        print(response)
        data_list = json.loads(response.text)["data"]["list"]

        for data in data_list:
            uname = data['uname']
            user_cover = data["user_cover"]
            system_cover = data["system_cover"]

            item = BilibiliItem(uname=uname,user_cover=user_cover,system_cover=system_cover)
            yield item
        self.num += 1
        url = "https://api.live.bilibili.com/room/v3/area/getRoomList?platform=web&parent_area_id=1&cate_id=0&area_id=0&sort_type=sort_type_152&page=" + str(self.num) + "&page_size=30"

        if self.num <= 4:
            yield scrapy.Request(url=url,callback=self.parse)


items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BilibiliItem(scrapy.Item):
    uname = scrapy.Field()   # streamer name
    user_cover = scrapy.Field()  # cover image URL
    system_cover = scrapy.Field()  # keyframe URL

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from BiliBili import settings
import os

class BilibiliPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        uname = item["uname"]
        img_cover = item["user_cover"]
        yield scrapy.Request(img_cover,meta={"uname":uname})

        img_crux = item['system_cover']
        yield scrapy.Request(img_crux,meta={"uname":uname})

    def file_path(self, request, response=None, info=None):
        file_name = request.url.split('/')[-1]
        file_name = file_name.split("?")[0]
        category = request.meta['uname']
        images_store = settings.IMAGES_STORE
        category_path = os.path.join(images_store,category)
        # print(category_path)
        # print("="*20)
        if not os.path.exists(category_path):
            image_name = os.path.join(category, file_name)
            name = os.path.splitext(file_name)[0]
            image_name = image_name.replace(name,"封面图")
            return image_name
        else:
            image_name02 = os.path.join(category, file_name)
            name1 = os.path.splitext(file_name)[0]
            image_name02 = image_name02.replace(name1, "关键帧")
            return image_name02

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for BiliBili project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BiliBili'

SPIDER_MODULES = ['BiliBili.spiders']
NEWSPIDER_MODULE = 'BiliBili.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BiliBili (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = "ERROR"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BiliBili.middlewares.BilibiliSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'BiliBili.middlewares.BiliBili': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'BiliBili.pipelines.BilibiliPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

IMAGES_STORE = "Download"

That wraps up this project. If you found it helpful, feel free to like, follow, and bookmark!

Follow the author; more articles will be published for you to read.


Reposted from blog.csdn.net/llllllkkkkkooooo/article/details/107091134