Web crawler hands-on study notes 7 [Practice]: simulating the download of videos from a page (template)

import requests     # 导入requests模块
import re           # 导入re模块
import os           # 导入系统os模块

# Send an HTTP GET request and return the response body as text.
def send_request(url, headers, timeout=10):
    """Fetch ``url`` with a GET request and return the page HTML.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP request headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as text when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection errors or timeouts.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Make the failure result explicit instead of falling off the end.
        return None
    return response.text

# Extract video titles and addresses from the page, then download each video.
def download_video(html_str, headers=None, chunk_size=64 * 1024):
    """Download every video referenced in ``html_str`` into ``video/``.

    Video addresses are taken from ``<source src="...">`` attributes and
    titles from the ``<div class="content">`` blocks; the two lists are
    paired in order, and each video is saved as ``video/<title>.mp4``.

    Args:
        html_str: HTML source of the page. ``None`` or an empty string
            (e.g. the result of a failed page request) is treated as
            "nothing to download".
        headers: Optional mapping of HTTP request headers for the video
            requests. When omitted, a module-level ``headers`` variable is
            used if one exists, which keeps the original script behaviour.
        chunk_size: Number of bytes read from the network per iteration.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection errors or timeouts.
        OSError: If the target directory or file cannot be written.
    """
    if not html_str:
        return

    # Extract all video addresses on the current page.
    video_urls = re.findall('<source src="(.*?)"', html_str)
    # Extract the title text of every video.
    titles = re.findall('<div class="content">\n<span>\n\n\n(.*?)\n\n</span>\n\n</div>', html_str)
    downloads = list(zip(titles, video_urls))
    if not downloads:
        return

    if headers is None:
        # Backward compatibility: the script entry point defines a global
        # ``headers`` and calls this function without passing it.
        headers = globals().get('headers')

    # Create the output folder once, before any download starts.
    os.makedirs('video', exist_ok=True)

    for title, video_url in downloads:
        # Only protocol-relative addresses ("//host/path") need a scheme.
        if video_url.startswith('//'):
            video_url = 'https:' + video_url

        # Replace characters that are illegal in file names (or that would
        # escape the output folder) so the title is safe to use on disk.
        file_name = re.sub(r'[\\/:*?"<>|\r\n]+', '_', title).strip() or 'video'

        # stream=True avoids loading the whole video into memory at once.
        with requests.get(url=video_url, headers=headers, stream=True, timeout=10) as video_response:
            if video_response.status_code != 200:
                # Do not save an error page as an .mp4 file.
                print(title + '视频下载失败,状态码:%d' % video_response.status_code)
                continue

            # The server may omit Content-Length; 0 means "unknown size".
            content_size = int(video_response.headers.get('content-length', 0))
            size = 0  # Bytes written so far.
            with open(os.path.join('video', file_name + '.mp4'), 'wb') as f:
                # Write the video piece by piece.
                for data in video_response.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    size += len(data)
                    # Print the download progress on a single line.
                    if content_size:
                        print('\r' + title + '视频文件下载进度:%d%%(%d/%d)' % (size / content_size * 100, size, content_size), end=' ')
                    else:
                        print('\r' + title + '视频文件下载进度:(%d/?)' % size, end=' ')
        # Finish the progress line before the next video starts.
        print()

if __name__ == '__main__':
    # Address of the page to crawl.
    url = 'https://www.某视频网站.com/video/'
    # Request headers; the User-Agent is a desktop Chrome browser string.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4503.5 Safari/537.36'}
    # Fetch the page HTML.
    html_str = send_request(url=url, headers=headers)
    # send_request yields None when the response status is not 200; stop
    # with a clear message instead of handing None to the downloader.
    if html_str is None:
        raise SystemExit('Request to %s did not return HTTP 200; nothing to download.' % url)
    # Download every video found on the page.
    download_video(html_str=html_str)

Guess you like

Origin blog.csdn.net/qq_39237205/article/details/123807505