[2020-01-28 update (multiprocess version)] A Python crawler that downloads Bilibili videos

It has been about three days since I finished the original crawler. Today I took it out and ran it again, and it no longer worked; Bilibili really does update fast. That seemed a shame, so I revised the program and it can crawl Bilibili videos once more. Because of network speed and other factors the crawl was extremely slow, so I also added multiprocessing to speed the program up, along with comments. The one flaw is that with multiprocessing the progress bars of different processes print over each other; I have not dug into fixing that, so please forgive this shortcoming. (The program can only grab videos in the post-redesign .m4s file format, i.e. roughly videos from after July 2018; I have not checked the exact cutoff.)

Screenshot after the program finished (I tested with the keyword "韩女团饭拍视频库" (K-pop girl-group fancams), since Bilibili is prolific in such videos; a token sample was grabbed, all at the original HD quality):
The program's source code is below:

import requests
import re
import time
import os
from retrying import retry
from multiprocessing import Pool
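# Note: requests and retrying are third-party packages
# (pip install requests retrying); everything else is standard library.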


class BiliBili_Spider:
    def __init__(self, keyword, page, audio_condition):
        self.audio_condition = audio_condition
        self.page = page
        self.num = 0
        self.index_url = "https://search.bilibili.com/all?keyword={}&from_source=nav_search&page={}".format(keyword,
                                                                                                            page)
        self.index_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }
        self.seconed_headers = {
            "Accept-Encoding": "identity",
            "Accept": "*/*",
            "Connection": "keep-alive",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-Mode": "cors",
            "Origin": "https://www.bilibili.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
        self.File_name = "b站_{}_视频".format(keyword, page)
        if not os.path.exists(self.File_name):
            os.makedirs(self.File_name)

    @retry(stop_max_attempt_number=2)
    def get_request(self, url, headers, stream=False):
        res = requests.get(url, headers=headers, stream=stream)
        return res

    @retry(stop_max_attempt_number=2)
    def write_data(self, file_name, m4s_bytes, chunk_size, content_size):
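        # m4s_bytes is a streaming requests.Response; content_size is the total
        # byte count from Content-Range and drives the percentage/progress bar.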
        size = 0
        if os.path.exists(self.File_name + "/" + file_name):
            print("遇到重复视频,已自动选择跳过此下载")
            print(file_name)
        else:
            print("正在下载 {}:".format(file_name))
            with open(self.File_name + "/" + file_name, "wb") as f:
                for data in m4s_bytes.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    size = len(data) + size
                    print('\r' + int(size / content_size * 100) * "█"
                          + " [" + str(round(size / 1024 / 1024, 2)) + " MB]"
                          + " [" + str(round(size / content_size * 100, 2)) + " %]",
                          end="")

    def for_run_url(self, seconed_url_w):
        seconed_url = seconed_url_w[0]
        file_name = seconed_url_w[1]
        '''Request seconed_url to fetch the video page, then parse it'''
        html_str02 = self.get_request("http:" + seconed_url, self.index_headers).content.decode("utf-8")
        try:
            '''Match the URL of the m4s_30080 video file'''
            m4s_30080 = re.findall(r'''"baseUrl":"(.*?)"''', html_str02, re.S)[0]
        except Exception as e:
            print(e)
            self.num = self.num + 1
            return  # no baseUrl found; skip this video (the old 'continue')

        '''Match the URL of the mp3_30216 audio file'''
        if self.audio_condition == 'Y':
            mp3_30216 = re.findall(r'''"baseUrl":"(.*?)"''', html_str02, re.S)[-2]

        '''With the video/audio URLs in hand, build a probe request to discover the total file size'''
        Referer_key = seconed_url
        # probe request: ask for the first 6 bytes only, just to read the response headers
        Range_key = 'bytes=0-5'
        self.seconed_headers['Referer'] = 'https:' + Referer_key
        self.seconed_headers['Range'] = Range_key
        '''Read the total size from the Content-Range response header'''
        html_bytes = self.get_request(m4s_30080, headers=self.seconed_headers).headers['Content-Range']
        if self.audio_condition == 'Y':
            audio_bytes = self.get_request(mp3_30216, headers=self.seconed_headers).headers['Content-Range']
        '''Regex out the total value (the part after "/")'''
        total = re.findall(r"/(.*)", html_bytes, re.S)[0]
        if self.audio_condition == 'Y':
            audio_total = re.findall(r"/(.*)", audio_bytes, re.S)[0]
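        # Assumed header shape: the probe returns e.g. "Content-Range: bytes 0-5/34761121";
        # the regex keeps everything after "/", i.e. the file's total size in bytes.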

        '''With total known from the probe, build a Range header that requests the whole file'''
        self.seconed_headers['Range'] = 'bytes=0-' + str(int(total) - 1)
        '''Progress-bar parameters: stream, chunk_size, content_size'''
        stream = True
        chunk_size = 1024  # read 1024 bytes per chunk
        content_size = int(total)
        if self.audio_condition == 'Y':
            content_size_audio = int(audio_total)
            print("文件大小:" + str(round(float((content_size + content_size_audio) / chunk_size / 1024), 4)) + "[MB]")
        else:
            print("文件大小:" + str(round(float(content_size / chunk_size / 1024), 4)) + "[MB]")
        start = time.time()
        try:
            m4s_bytes = self.get_request(m4s_30080, headers=self.seconed_headers, stream=stream)
            self.write_data(file_name + ".mp4", m4s_bytes, chunk_size, content_size)
            if self.audio_condition == 'Y':
                print("\n")
                self.seconed_headers['Range'] = 'bytes=0-' + str(int(audio_total) - 1)
                mp3_bytes = self.get_request(mp3_30216, headers=self.seconed_headers, stream=stream)
                self.write_data(file_name + ".mp3", mp3_bytes, chunk_size, content_size_audio)
        except Exception as e:
            print(e)
            self.num = self.num + 1
            return  # download failed; skip this video (the old 'continue')
        end = time.time()
        print("总耗时:" + str(end - start) + "秒")
        self.num = self.num + 1

    def run(self):
        '''Fetch the initial search-results page'''
        # print("First request starting...")
        html_str01 = self.get_request(self.index_url, self.index_headers).content.decode("utf-8")
        # print(html_str01)

        '''Extract each video's link and title from the search results'''
        avi_href_name_list = re.findall(
            r'''<li class="video-item matrix"><a href="(.*?)" title="(.*?)" target="_blank" class="img-anchor">''',
            html_str01)
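        # Each match is an (href, title) tuple; the hrefs are protocol-relative
        # (e.g. "//www.bilibili.com/video/..."), hence the "http:" prefix added later.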
        # print(avi_href_name_list)

        '''This step had broken due to a Bilibili update; crawler rewritten 2020-1-8'''

        '''Guard: the regex can also match extra entries at the top of the page, which would break the requests'''
        if len(avi_href_name_list) > 20:
            count = len(avi_href_name_list) - 20
            avi_href_name_list = avi_href_name_list[count:]  # drop the extra leading matches

        '''Iterate over avi_href_name_list, taking out each url and name'''
        '''After the redesign Bilibili throttles downloads across the board; given network and download speeds, use a process pool'''
        pool = Pool(processes=20)
        for seconed_url_w in avi_href_name_list:
            pool.apply_async(self.for_run_url, args=(seconed_url_w,))
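        # Caveat: apply_async swallows any exception raised inside for_run_url
        # unless .get() is called on the returned AsyncResult, so failures only
        # surface through for_run_url's own print statements.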
        print("-" * 100)
        pool.close()
        pool.join()
        print("*" * 100)
        print("视频下载完成")


if __name__ == '__main__':
    # page = 1
    while True:
        keyword = input("请输入需要爬取的视频的名称:\n")
        page = input("请输入需要爬取的视频的页码(只能为数字):\n")
        try:
            int(page)
        except ValueError:
            print("Input is not a number, please try again!\n\n")
            continue
        audio_condition = input("是否需要音频一同下载('视音频分离下载,速度原因,请使用格式工厂合并视音频'):(请输入单个字符:'Y' or 'N')\n")
        if not audio_condition in ('Y', 'N'):
            print("输入的字符不为单个字符:'Y' or 'N',请重新输入:\n\n")
            continue
        # keyword = "舞蹈"
        # audio_condition = 'N'
        bilibili_spider = BiliBili_Spider(keyword, page, audio_condition)
        bilibili_spider.run()
        # page = page + 1
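
The input prompt above points to Format Factory for merging the separately downloaded video and audio. If ffmpeg is available on your PATH, a minimal sketch of the same merge from Python could look like this (the helper and the file names are hypothetical, not produced by the program verbatim; the ".mp3" file actually holds the stream pulled from the audio baseUrl, so a stream copy usually works):

import subprocess

def merge_av(video_path, audio_path, out_path):
    # Remux one video track and one audio track into a single .mp4;
    # "-c copy" copies the streams without re-encoding, so it is fast and lossless.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-i", audio_path,
         "-map", "0:v", "-map", "1:a", "-c", "copy", out_path],
        check=True)

# Hypothetical usage on the spider's output folder:
# merge_av("bilibili_舞蹈_page1_videos/title.mp4",
#          "bilibili_舞蹈_page1_videos/title.mp3",
#          "title_merged.mp4")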

I set the number of processes to 20 by default. If you modify this program, for example so that it downloads multiple pages automatically, you can raise that number to suit your machine's CPU usage.
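
As a sketch of that multi-page extension (a hypothetical helper, not part of the program above), you can run one spider per results page; each page's videos download in parallel while the pages themselves are crawled one after another:

def crawl_pages(keyword, first_page, last_page, audio_condition='N'):
    # One BiliBili_Spider per results page; each instance manages its own
    # 20-process pool, so pages run sequentially and videos in parallel.
    for page in range(first_page, last_page + 1):
        BiliBili_Spider(keyword, page, audio_condition).run()

# crawl_pages("舞蹈", 1, 3)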



Reposted from blog.csdn.net/weixin_44449518/article/details/104100104