grab m3u8 video

grab m3u8 video

1. Thinking analysis

View URL: https://www.9meiju.cc/mohuankehuan/shandianxiadibaji/1-1.html

  1. Open the URL and determine whether the video is served as a single file or as multiple clips. If it is a single file, find its URL and download it directly. If it is split into clips, you need to find the file that lists the clips and process it. This article uses an m3u8 playlist as the example.
  2. After finding the m3u8 file, download it. After downloading, open the file and analyze whether a secret key is needed. If a secret key is needed, download the secret key according to the secret key address, and then download all ts files.
  3. Merge all videos

2. Realize

Analyze index.m3u8

  • Two m3u8 files were found through network search

    The urls are

    https://new.qqaku.com/20211117/iHVkqQMI/index.m3u8

    https://new.qqaku.com/20211117/iHVkqQMI/2523kb/hls/index.m3u8

    The url address of the second m3u8 request is included in the content returned by analyzing the first index.m3u8 request

    That is, the response to the first index.m3u8 request contains the address of the second index.m3u8 file; joining that address onto the first URL and requesting the second index.m3u8 returns the addresses of all the ts files.

    Now we have analyzed the address of the second real index.m3u8, but where does the first address come from? Don’t panic, let’s find out where the first url comes from.

    [The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-HD4wiRVb-1689332185511) (grab m3u8 video.assets/image-20220708105559202.png)]

    [The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-Zihoa6X8-1689332185512) (grab m3u8 video.assets/image-20220708105618510.png)]

  • Find the url address of the first index.m3u8

    open source

    It is found that the url exists in the js in the source code of the page, and the location is known, and it can be obtained through regular matching in the code

    Now let's review the approach: find the URL of the first index.m3u8 in the page source, request it to get content containing the path of the second index.m3u8, join the two into a full URL, then request the second m3u8, which returns the list of all ts files.

    [The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-ePdIlB5E-1689332185512) (grab m3u8 video.assets/image-20220708110048589.png)]

3. Code implementation

3.1 Get the last m3u8 url address

import re
from urllib.parse import urljoin

import requests

# Browser-like User-Agent sent with every request of this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# Visit the home page first so the session carries the site's cookies.
session = requests.Session()
session.get('https://www.9meiju.cc/', headers=headers, timeout=30)

url = 'https://www.9meiju.cc/mohuankehuan/shandianxiadibaji/1-2.html'
response = session.get(url, headers=headers, timeout=30)
response.raise_for_status()
response.encoding = 'UTF-8'
data = response.text
# print(data)

# The page embeds the player configuration in a <script> tag, for example:
#   var zanpiancms_player = {"player":"\/public\/",
#       "url":"https:\/\/new.qqaku.com\/20211124\/nLwncbZW\/index.m3u8", ...};
# (Kept as a comment: as a bare string literal the "\/" sequences are invalid
# escapes.)  Extract the escaped m3u8 url and drop the JSON backslashes.
match = re.search(r'"url":"(.+?index\.m3u8)"', data)
if match is None:
    # Fail with a readable message instead of AttributeError on None.
    raise SystemExit('no index.m3u8 url found in ' + url)
m3u8_uri = match.group(1).replace('\\', '')

# Keep a copy of the page source for offline analysis.
with open('99.html', 'w', encoding='UTF-8') as f:
    f.write(data)

# Request the first index.m3u8 and keep a copy of it as well.
response = session.get(m3u8_uri, headers=headers, timeout=30)
response.raise_for_status()
response.encoding = 'UTF-8'
data = response.text
with open('m3u8_uri.text', 'w', encoding='UTF-8') as f:
    f.write(data)

# The first playlist points at the real index.m3u8.  Take its first
# non-tag line (stripped, so no trailing newline leaks into the url) rather
# than splitting on '/', which only worked for one specific path depth.
entries = [line.strip() for line in data.splitlines()
           if line.strip() and not line.lstrip().startswith('#')]
url = entries[0] if entries else ''
print(data)
print('m3u8_uri', m3u8_uri)
print('url', url)
# urljoin resolves both rooted ("/a/b/index.m3u8") and relative entries.
print(urljoin(m3u8_uri, url))

3.2 Multi-threaded download of ts files and video merging

import time
import requests
import os
from concurrent.futures import ThreadPoolExecutor, wait

# Request headers shared by the download functions below: a desktop Chrome
# User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/100.0.4896.75 Safari/537.36"
    ),
}


def down_video(url, i):
    """Download one .ts segment and save it as ``<path>/<i>.ts``.

    Relies on the module-level ``headers`` and ``path`` (the segment
    directory set in the ``__main__`` block).

    :param url: absolute url of the segment.
    :param i: position of the segment in the playlist; used as file name.
    :return: None
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking a worker forever.
    resp = requests.get(url, headers=headers, timeout=30)
    # Check the status before writing so an error page is never saved as
    # if it were video data.
    resp.raise_for_status()
    with open(os.path.join(path, str(i) + '.ts'), mode="wb") as f3:
        f3.write(resp.content)
    print('{} 下载完成!'.format(url))


def download_all_videos(url, path):
    """Download the m3u8 playlist, then fetch all its segments in threads.

    The playlist is saved as ``index.m3u8`` in the current directory. Every
    line that does not start with ``#`` is treated as a segment url and
    handed to ``down_video`` together with its running index.

    :param url: url of the m3u8 playlist that lists the .ts segments.
    :param path: directory for the segments; created if missing.
    :return: None
    :raises requests.RequestException: if the playlist cannot be fetched.
    """
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    # Fix: this was saved as "first.m3u8", but do_m3u8_url() and merge()
    # both read "index.m3u8", so the later steps could not find it.
    with open("index.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.text)
    os.makedirs(path, exist_ok=True)

    tasks = []
    i = 0
    # The context manager shuts the pool down once all downloads finished.
    with ThreadPoolExecutor(max_workers=50) as pool:
        with open("index.m3u8", mode="r", encoding="utf-8") as f:
            for line in f:
                # Tag/comment lines are not urls.
                if line.startswith("#"):
                    continue
                print(line, i)
                tasks.append(pool.submit(down_video, line.strip(), i))
                i += 1
        print(i)
        # Wait for every download to finish.
        wait(tasks)

    # Exceptions raised inside worker threads stay hidden in the futures;
    # report them so missing segments are noticed before merging.
    for index, task in enumerate(tasks):
        error = task.exception()
        if error is not None:
            print('segment {} failed: {!r}'.format(index, error))


# Rewrite the segment urls of the m3u8 playlist to local file paths.
def do_m3u8_url(path, m3u8_filename="index.m3u8"):
    """Write a copy of the playlist into ``path`` that uses local segments.

    Lines starting with ``#`` are copied unchanged. Every other line is
    replaced, in order, by ``<cwd>/<path>/<n>.ts`` with n = 0, 1, 2, ...,
    the same numbering the downloader uses. ``#EXT-X-KEY`` lines are not
    rewritten, so encrypted playlists are not handled here.

    :param path: segment directory, relative to the current directory;
        created if missing.
    :param m3u8_filename: playlist read from the current directory and
        written under the same name inside ``path``.
    :return: None
    """
    os.makedirs(path, exist_ok=True)
    with open(m3u8_filename, mode="r", encoding="utf-8") as source:
        data = source.readlines()

    abs_path = os.getcwd()
    i = 0
    # Fix: the output file was opened without ever being closed; the
    # with-block guarantees it is flushed before ffmpeg reads it.
    with open(os.path.join(path, m3u8_filename), 'w', encoding="utf-8") as fw:
        for line in data:
            if line.startswith("#"):
                fw.write(line)
            else:
                fw.write(f'{abs_path}/{path}/{i}.ts\n')
                i += 1
    

def merge(filePath, filename='output'):
    """Merge the downloaded segments into ``<filename>.mp4`` with ffmpeg.

    ffmpeg is run inside ``filePath`` on the rewritten ``index.m3u8`` with
    ``-c copy`` (stream copy, no re-encoding); the original author
    recommends this approach to avoid audio/video desync.

    :param filePath: directory containing ``index.m3u8`` and the segments.
    :param filename: output name without the ``.mp4`` extension.
    :return: None
    :raises FileNotFoundError: if ffmpeg is not installed / not on PATH.
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    import subprocess  # local import: not part of this script's import block

    # Fix: the original called os.chdir(path) on the module-level ``path``
    # and ignored ``filePath``. Running ffmpeg with cwd= also avoids
    # changing the working directory of the whole process.
    # An argument list (no shell) keeps odd characters in ``filename`` from
    # being interpreted by the shell.
    command = ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', f'{filename}.mp4']
    subprocess.run(command, cwd=filePath, check=True)



if __name__ == '__main__':
    # Scrape "The Flash" from the 99 Meiju site.
    # Directory where the .ts segments are stored; down_video() reads this
    # module-level name directly.
    path = 'ts'
    url = 'https://new.qqaku.com/20211124/nLwncbZW/1100kb/hls/index.m3u8'
    # Download the m3u8 playlist and all of its .ts segments.
    download_all_videos(url, path)
    # Write a copy of the playlist into ``path`` that points at the local files.
    do_m3u8_url(path)
    # Merge the segments into ts2.mp4.
    merge(path, 'ts2')
    print('over')

Note: The tool currently used for video merging is ffmpeg. If you need to install it, check my other blog about the use of ffmpeg.

3.3 Combined code from the two snippets above

import re
from urllib.parse import urljoin
import requests
import os  # 执行cmd/控制台上的命令
from concurrent.futures import ThreadPoolExecutor, wait
from retrying import retry


def get_m3u8_url(url):
    """Return the url of the m3u8 playlist that lists the .ts segments.

    The episode page embeds the url of a first ``index.m3u8`` in its player
    script. That first playlist in turn points at the real playlist; its
    entry is resolved against the first url and returned.

    Uses the module-level ``headers``.

    :param url: url of the episode page.
    :return: absolute url of the playlist containing the .ts entries.
    :raises ValueError: if the page contains no ``index.m3u8`` url.
    :raises requests.RequestException: on network or HTTP errors.
    """
    session = requests.Session()
    # Visit the home page first to obtain cookies.
    session.get('https://www.9meiju.cc/', headers=headers, timeout=30)
    response = session.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    data = response.text

    # Raw string with an escaped dot; a missing match is reported clearly
    # instead of failing with AttributeError on None.
    match = re.search(r'"url":"(.+?index\.m3u8)"', data)
    if match is None:
        raise ValueError('no index.m3u8 url found in page ' + url)
    # The url is JSON-escaped ("https:\/\/..."), so drop the backslashes.
    m3u8_uri = match.group(1).replace('\\', '')

    # Request the first index.m3u8.
    response = session.get(m3u8_uri, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    data = response.text
    print(data)
    print('m3u8_uri', m3u8_uri)

    # Fix: take the first non-tag line instead of data.split('/', 3)[-1],
    # which only worked for one specific path depth. Lines are stripped so
    # no newline ends up in the url.
    entries = [line.strip() for line in data.splitlines()
               if line.strip() and not line.lstrip().startswith('#')]
    first = entries[0] if entries else ''
    points_to_playlist = ('#EXT-X-STREAM-INF' in data
                          or first.split('?', 1)[0].endswith('.m3u8'))
    if first and points_to_playlist:
        # urljoin handles rooted, relative and absolute entries alike.
        url = urljoin(m3u8_uri, first)
    else:
        # The first playlist already lists the segments itself.
        url = m3u8_uri
    print('url', url)
    return url

@retry(stop_max_attempt_number=3)
def down_video(url, i):
    """Download one .ts segment and save it as ``<path>/<i>.ts``.

    The ``retry`` decorator is configured with ``stop_max_attempt_number=3``,
    so a raised exception leads to a new attempt, up to three in total.
    Relies on the module-level ``headers`` and ``path``.

    :param url: absolute url of the segment.
    :param i: position of the segment in the playlist; used as file name.
    :return: None
    :raises requests.RequestException: on timeout, connection failure or a
        status other than 200 (after the retries are used up).
    """
    resp = requests.get(url, headers=headers, timeout=30)
    # Fix: the status was checked with ``assert`` *after* the body had been
    # written, so error pages were saved as segments and the check vanished
    # under ``python -O``. Validate first and raise a real exception.
    if resp.status_code != 200:
        raise requests.HTTPError(
            'unexpected status {} for {}'.format(resp.status_code, url),
            response=resp)
    with open(os.path.join(path, str(i) + '.ts'), mode="wb") as f3:
        f3.write(resp.content)


def download_all_videos(url, path):
    """Download the m3u8 playlist, then fetch all its segments in threads.

    The playlist is saved as ``index.m3u8`` in the current directory. Every
    line that does not start with ``#`` is treated as a segment url and
    handed to ``down_video`` together with its running index.

    :param url: url of the m3u8 playlist that lists the .ts segments.
    :param path: directory for the segments; created if missing.
    :return: None
    :raises requests.RequestException: if the playlist cannot be fetched.
    """
    resp = requests.get(url, headers=headers, timeout=30)
    # Do not save and parse an error page as if it were a playlist.
    resp.raise_for_status()
    with open("index.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.content.decode('UTF-8'))
    os.makedirs(path, exist_ok=True)

    tasks = []
    i = 0
    # The context manager shuts the pool down once all downloads finished.
    with ThreadPoolExecutor(max_workers=50) as pool:
        with open("index.m3u8", mode="r", encoding="utf-8") as f:
            for line in f:
                # Tag/comment lines are not urls.
                if line.startswith("#"):
                    continue
                print(line, i)
                tasks.append(pool.submit(down_video, line.strip(), i))
                i += 1
        print(i)
        # Wait for every download; pass timeout= (e.g. 1800) if this blocks.
        wait(tasks)

    # Exceptions raised inside worker threads stay hidden in the futures;
    # report them so missing segments are noticed before merging.
    for index, task in enumerate(tasks):
        error = task.exception()
        if error is not None:
            print('segment {} failed: {!r}'.format(index, error))


def do_m3u8_url(path, m3u8_filename="index.m3u8"):
    """Write a copy of the playlist into ``path`` that uses local segments.

    Lines starting with ``#`` are copied unchanged. Every other line is
    replaced, in order, by ``<cwd>/<path>/<n>.ts`` with n = 0, 1, 2, ...,
    the same numbering the downloader uses. ``#EXT-X-KEY`` lines are not
    rewritten, so encrypted playlists are not handled here.

    :param path: segment directory, relative to the current directory;
        created if missing.
    :param m3u8_filename: playlist read from the current directory and
        written under the same name inside ``path``.
    :return: None
    """
    os.makedirs(path, exist_ok=True)
    with open(m3u8_filename, mode="r", encoding="utf-8") as source:
        data = source.readlines()

    abs_path = os.getcwd()
    i = 0
    # Fix: the output file was opened without ever being closed; the
    # with-block guarantees it is flushed before ffmpeg reads it.
    with open(os.path.join(path, m3u8_filename), 'w', encoding="utf-8") as fw:
        for line in data:
            if line.startswith("#"):
                fw.write(line)
            else:
                fw.write(f'{abs_path}/{path}/{i}.ts\n')
                i += 1

def merge(path, filename='output'):
    """Merge the downloaded segments into ``<filename>.mp4`` with ffmpeg.

    ffmpeg is run inside ``path`` on the rewritten ``index.m3u8`` with
    ``-c copy`` (stream copy, no re-encoding); the original author
    recommends this approach to avoid audio/video desync.

    :param path: directory containing ``index.m3u8`` and the segments.
    :param filename: output name without the ``.mp4`` extension.
    :return: None
    :raises FileNotFoundError: if ffmpeg is not installed / not on PATH.
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    import subprocess  # local import: not part of this script's import block

    # An argument list (no shell) keeps spaces or shell metacharacters in
    # ``filename`` from breaking or altering the command, and cwd= runs
    # ffmpeg in ``path`` without changing the directory of this process.
    command = ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', f'{filename}.mp4']
    subprocess.run(command, cwd=path, check=True)

if __name__ == '__main__':
    # Module-level request headers; the functions above read this name
    # as a global.
    headers = {


        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    # Episode page url in, url of the segment-listing index.m3u8 out.
    url = get_m3u8_url('https://www.9meiju.cc/mohuankehuan/shandianxiadibaji/1-2.html')

    # Directory where the .ts segments are stored; down_video() reads this
    # module-level name directly.
    path = 'ts'
    # Download the m3u8 playlist and all of its .ts segments.
    download_all_videos(url, path)
    # Write a copy of the playlist into ``path`` that points at the local files.
    do_m3u8_url(path)
    # Merge the segments into one mp4 file.
    merge(path, '第二集')
    print('over')

4. Matters needing attention

4.1 Description

When you fetch the index.m3u8 file, some playlists list segment URLs ending in .jpg/.png instead of .ts. In that case the downloaded content needs extra processing, as follows:

[The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-EAZvDXnp-1689332185513) (grab m3u8 video.assets/image-20220708111809978.png)]

In this case, the above code cannot be used for normal merging, and the merged video cannot be played

Analysis with ffprobe shows that the downloaded segments are recognized as PNG images, which is why normal splicing fails

[The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-PD9hHJwg-1689332185514) (grab m3u8 video.assets/image-20220708112001167.png)]

In this case, you only need to fill the header part of the PNG file with FF to deal with the problem

The effect after filling is as shown in the figure

[The external link image transfer failed. The source site may have an anti-leeching mechanism. It is recommended to save the image and upload it directly (img-aopwgJdW-1689332185514) (grab m3u8 video.assets/image-20220708112029091.png)]

4.2 Processing with code

# Repair .ts segments that are disguised as PNG images.
def resolve_ts(src_path, dst_path):
    """Copy numbered segments to ``dst_path`` with the header bytes patched.

    Some playlists return segment urls such as
    https://p1.eckwai.com/ufile/adsocial/7ead0935-dd4f-4d2f-b17d-dd9902f8cc77.png
    A hex editor shows that such files start with a PNG file header, and
    they cannot be merged as-is. Each copy gets its first four bytes
    replaced by ``FF`` so the file is no longer detected as PNG.

    :param src_path: directory with the downloaded segments (``0.ts`` ...).
    :param dst_path: directory for the patched copies; created if missing.
    :return: None
    """
    os.makedirs(dst_path, exist_ok=True)
    # Fix: only files with a numeric stem are segments. Anything else in
    # the directory (a playlist, an earlier .mp4, hidden files) used to
    # crash the int() sort key.
    file_list = sorted(
        (name for name in os.listdir(src_path)
         if name.split('.')[0].isdecimal()),
        key=lambda name: int(name.split('.')[0]),
    )
    for name in file_list:
        origin_ts = os.path.join(src_path, name)
        resolved_ts = os.path.join(dst_path, name)
        try:
            with open(origin_ts, "rb") as infile:
                data = infile.read()
            with open(resolved_ts, "wb") as outfile:
                # Same result as writing the data and then overwriting
                # offset 0 with four FF bytes.
                outfile.write(b'\xff\xff\xff\xff' + data[4:])
        except OSError as error:
            # Fix: failures were swallowed by a bare ``except: pass`` and
            # still reported as success.
            print('resolve ' + origin_ts + ' failed: ' + str(error))
        else:
            print('resolve ' + origin_ts + ' success')

4.3 Complete code

import shutil
import time
from urllib.parse import urljoin

import requests
import os
import re
from concurrent.futures import ThreadPoolExecutor, wait


def get_m3u8_url(url):
    """Return the url of the m3u8 playlist that lists the .ts segments.

    The episode page embeds the url of a first ``index.m3u8`` in its player
    script. That first playlist in turn points at the real playlist; its
    entry is resolved against the first url and returned.

    Uses the module-level ``headers``.

    :param url: url of the episode page.
    :return: absolute url of the playlist containing the segment entries.
    :raises ValueError: if the page contains no ``index.m3u8`` url.
    :raises requests.RequestException: on network or HTTP errors.
    """
    session = requests.Session()
    # Visit the home page first to obtain cookies.
    session.get('https://www.9meiju.cc/', headers=headers, timeout=30)
    response = session.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    page = response.text

    # Raw string with an escaped dot; a missing match is reported clearly
    # instead of failing with AttributeError on None.
    found = re.search(r'"url":"(.+?index\.m3u8)"', page)
    if found is None:
        raise ValueError('no index.m3u8 url found in page ' + url)
    # The url is JSON-escaped ("https:\/\/..."), so drop the backslashes.
    m3u8_uri = found.group(1).replace('\\', '')

    # Request the first index.m3u8.
    response = session.get(m3u8_uri, headers=headers, timeout=30)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    data = response.text
    print(data)
    print('m3u8_uri', m3u8_uri)

    # Fix: take the first non-tag line instead of data.split('/', 3)[-1],
    # which only worked for one specific path depth. Lines are stripped so
    # no newline ends up in the url.
    entries = [line.strip() for line in data.splitlines()
               if line.strip() and not line.lstrip().startswith('#')]
    first = entries[0] if entries else ''
    points_to_playlist = ('#EXT-X-STREAM-INF' in data
                          or first.split('?', 1)[0].endswith('.m3u8'))
    if first and points_to_playlist:
        # urljoin handles rooted, relative and absolute entries alike.
        url = urljoin(m3u8_uri, first)
    else:
        # The first playlist already lists the segments itself.
        url = m3u8_uri
    print('url', url)
    return url

def down_video(url, i):
    """Download one segment and save it as ``<path>/<i>.ts``.

    Relies on the module-level ``headers`` and ``path`` (the segment
    directory set in the ``__main__`` block).

    :param url: absolute url of the segment.
    :param i: position of the segment in the playlist; used as file name.
    :return: None
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking a worker forever.
    resp = requests.get(url, headers=headers, timeout=30)
    # Check the status before writing so an error page is never saved as
    # if it were video data.
    resp.raise_for_status()
    with open(os.path.join(path, str(i) + '.ts'), mode="wb") as f3:
        f3.write(resp.content)


def download_all_videos(url, path):
    """Download the m3u8 playlist, then fetch all its segments in threads.

    The playlist is saved as ``index.m3u8`` in the current directory. Every
    line that does not start with ``#`` is treated as a segment url and
    handed to ``down_video`` together with its running index.

    :param url: url of the m3u8 playlist that lists the segments.
    :param path: directory for the segments; created if missing.
    :return: None
    :raises requests.RequestException: if the playlist cannot be fetched.
    """
    resp = requests.get(url, headers=headers, timeout=30)
    # Do not save and parse an error page as if it were a playlist.
    resp.raise_for_status()
    with open("index.m3u8", mode="w", encoding="utf-8") as f:
        f.write(resp.content.decode('UTF-8'))
    os.makedirs(path, exist_ok=True)

    tasks = []
    i = 0
    # The context manager shuts the pool down once all downloads finished.
    with ThreadPoolExecutor(max_workers=50) as pool:
        with open("index.m3u8", mode="r", encoding="utf-8") as f:
            for line in f:
                # Tag/comment lines are not urls.
                if line.startswith("#"):
                    continue
                print(line, i)
                tasks.append(pool.submit(down_video, line.strip(), i))
                i += 1
        print(i)
        # Wait for every download to finish.
        wait(tasks)

    # Exceptions raised inside worker threads stay hidden in the futures;
    # report them so missing segments are noticed before merging.
    for index, task in enumerate(tasks):
        error = task.exception()
        if error is not None:
            print('segment {} failed: {!r}'.format(index, error))



# Repair .ts segments that are disguised as PNG images.
def resolve_ts(src_path, dst_path):
    """Copy numbered segments to ``dst_path`` with the header bytes patched.

    Some playlists return segment urls such as
    https://p1.eckwai.com/ufile/adsocial/7ead0935-dd4f-4d2f-b17d-dd9902f8cc77.png
    A hex editor shows that such files start with a PNG file header, and
    they cannot be merged as-is. Each copy gets its first four bytes
    replaced by ``FF`` so the file is no longer detected as PNG.

    :param src_path: directory with the downloaded segments (``0.ts`` ...).
    :param dst_path: directory for the patched copies; created if missing.
    :return: None
    """
    os.makedirs(dst_path, exist_ok=True)
    # Fix: only files with a numeric stem are segments. Anything else in
    # the directory (a playlist, an earlier .mp4, hidden files) used to
    # crash the int() sort key.
    numbered = [entry for entry in os.listdir(src_path)
                if entry.split('.')[0].isdecimal()]
    numbered.sort(key=lambda entry: int(entry.split('.')[0]))
    for entry in numbered:
        origin_ts = os.path.join(src_path, entry)
        resolved_ts = os.path.join(dst_path, entry)
        try:
            with open(origin_ts, "rb") as infile:
                data = infile.read()
            with open(resolved_ts, "wb") as outfile:
                # Same result as writing the data and then overwriting
                # offset 0 with four FF bytes.
                outfile.write(b'\xff\xff\xff\xff' + data[4:])
        except OSError as error:
            # Fix: failures were swallowed by a bare ``except: pass`` and
            # still reported as success.
            print('resolve ' + origin_ts + ' failed: ' + str(error))
        else:
            print('resolve ' + origin_ts + ' success')
    # Optional follow-up left disabled by the original author: remove
    # src_path and rename dst_path (without its trailing '2') over it.


# Rewrite the segment urls of the m3u8 playlist to local file paths.
def do_m3u8_url(path, m3u8_filename="index.m3u8"):
    """Write a copy of the playlist into ``path`` that uses local segments.

    Lines starting with ``#`` are copied unchanged. Every other line is
    replaced, in order, by ``<cwd>/<path>/<n>.ts`` with n = 0, 1, 2, ...,
    the same numbering the downloader uses. ``#EXT-X-KEY`` lines are not
    rewritten, so encrypted playlists are not handled here.

    :param path: segment directory, relative to the current directory;
        created if missing.
    :param m3u8_filename: playlist read from the current directory and
        written under the same name inside ``path``.
    :return: None
    """
    os.makedirs(path, exist_ok=True)
    with open(m3u8_filename, mode="r", encoding="utf-8") as source:
        data = source.readlines()

    abs_path = os.getcwd()
    i = 0
    # Fix: the output file was opened without ever being closed; the
    # with-block guarantees it is flushed before ffmpeg reads it.
    with open(os.path.join(path, m3u8_filename), 'w', encoding="utf-8") as fw:
        for line in data:
            if line.startswith("#"):
                fw.write(line)
            else:
                fw.write(f'{abs_path}/{path}/{i}.ts\n')
                i += 1

def merge(path, filename='output'):
    """Merge the downloaded segments into ``<filename>.mp4`` with ffmpeg.

    ffmpeg is run inside ``path`` on the rewritten ``index.m3u8`` with
    ``-c copy`` (stream copy, no re-encoding); the original author
    recommends this approach to avoid audio/video desync.

    :param path: directory containing ``index.m3u8`` and the segments.
    :param filename: output name without the ``.mp4`` extension.
    :return: None
    :raises FileNotFoundError: if ffmpeg is not installed / not on PATH.
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    import subprocess  # local import: not part of this script's import block

    # An argument list (no shell) keeps spaces or shell metacharacters in
    # ``filename`` from breaking or altering the command, and cwd= runs
    # ffmpeg in ``path`` without changing the directory of this process.
    command = ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', f'{filename}.mp4']
    subprocess.run(command, cwd=path, check=True)


if __name__ == '__main__':
    # Scrape "The Flash" from the 99 Meiju site.
    # ``path`` holds the raw downloads and is read by down_video() as a
    # module-level name; the patched copies go to a sibling directory.
    path = 'ts'
    src_path, dst_path = path, f'{path}2'

    # Request headers used (as a global) by every function above.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/100.0.4896.75 Safari/537.36"
        ),
    }

    # Step 1: episode page -> url of the segment-listing playlist.
    url = get_m3u8_url(
        'https://www.9meiju.cc/mohuankehuan/shandianxiadibaji/1-20.html'
    )
    # Step 2: download the playlist and every segment it lists.
    download_all_videos(url, path)
    # Step 3: patch the PNG-disguised segments into dst_path.
    resolve_ts(src_path, dst_path)
    # Step 4: point the playlist at the patched files and merge them.
    do_m3u8_url(dst_path)
    merge(dst_path, '闪电侠')
    print('over')

5. Decryption processing

  • What we talked about above are unencrypted ts files. These files can be played directly after downloading, but files encrypted by AES-128 cannot be played after downloading, so they need to be decrypted.

  • How can you tell whether decryption is needed? Check whether the video site requests an m3u8 file, then download it and open it:

    No need to decrypt index.m3u8 file

    #EXTM3U
    #EXT-X-VERSION:3
    #EXT-X-TARGETDURATION:4
    #EXT-X-PLAYLIST-TYPE:VOD
    #EXT-X-MEDIA-SEQUENCE:0
    #EXTINF:3.086,
    https://hey05.cjkypo.com/20211215/FMbNtNzz/1100kb/hls/7qs6gJc0.ts
    #EXTINF:2.085,
    https://hey05.cjkypo.com/20211215/FMbNtNzz/1100kb/hls/rYpHhq0I.ts
    #EXTINF:2.085,
    https://hey05.cjkypo.com/20211215/FMbNtNzz/1100kb/hls/bfays5sw.ts
    

    Need to decrypt index.m3u8 file

    #EXT-X-VERSION:3
    #EXT-X-TARGETDURATION:1
    #EXT-X-PLAYLIST-TYPE:VOD
    #EXT-X-MEDIA-SEQUENCE:0
    #EXT-X-KEY:METHOD=AES-128,URI="/20220418/671fJxOB/2000kb/hls/key.key" # This path is the location of the decryption key. Build the full URL in code before requesting it: domain + /20220418/671fJxOB/2000kb/hls/key.key
    #EXTINF:1.235,
    /20220418/671fJxOB/2000kb/hls/kj6uqHoP.ts  # The ts URLs here must be completed to full URLs as well
    #EXTINF:1.001,
    /20220418/671fJxOB/2000kb/hls/ZXX8LYPa.ts
    #EXTINF:1.001,
    /20220418/671fJxOB/2000kb/hls/sOezpD2H.ts
    #EXTINF:1.001,
    ...
    
  • If your file is encrypted, you also need the key file. It is downloaded in the same way as the m3u8 file; key.key is the key file we need to download. Note that there are two m3u8 files here; the one to use is the m3u8 file that lists the ts files, like the one above.

  • Download all the ts files, then put the ts files, the m3u8 file, and key.key into one folder. Rename the m3u8 file to index.m3u8 and key.key to key.m3u8. In index.m3u8, change the key URI to the local path of the key file and change every ts entry to its local path.

    file path

    project/

    ​ ts/

    ​ 0.ts

    ​ 1.ts

    ​ …

    ​ index.m3u8

    ​ key.m3u8

    The modified index.m3u8 content is as follows:

    #EXTM3U
    #EXT-X-VERSION:3
    #EXT-X-TARGETDURATION:1
    #EXT-X-PLAYLIST-TYPE:VOD
    #EXT-X-MEDIA-SEQUENCE:0
    #EXT-X-KEY:METHOD=AES-128,URI="/Users/xialigang/PycharmProjects/爬虫/抓取带秘钥的电影/ts/key.m3u8"
    #EXTINF:1.235,
    /Users/xialigang/PycharmProjects/爬虫/抓取带秘钥的电影/ts/0.ts
    #EXTINF:1.001,
    /Users/xialigang/PycharmProjects/爬虫/抓取带秘钥的电影/ts/1.ts
    #EXTINF:1.001,
    /Users/xialigang/PycharmProjects/爬虫/抓取带秘钥的电影/ts/2.ts
    

    The code to process the contents of index.m3u8 is as follows

    import time
    from urllib.parse import urljoin
    
    import requests
    import os
    from concurrent.futures import ThreadPoolExecutor, wait
    import re
    
    # Request headers shared by the download functions below: a desktop
    # Chrome User-Agent string.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/100.0.4896.75 Safari/537.36"
        ),
    }
    
    
    def down_video(url, i):
        """Download one .ts segment and save it as ``<path>/<i>.ts``.

        Relies on the module-level ``headers`` and ``path`` (the segment
        directory set in the ``__main__`` block).

        :param url: absolute url of the segment.
        :param i: position of the segment in the playlist; used as file name.
        :return: None
        :raises requests.RequestException: on timeout, connection failure or
            an HTTP error status.
        """
        # A timeout keeps a stalled connection from blocking a worker forever.
        resp = requests.get(url, headers=headers, timeout=30)
        # Check the status before writing so an error page is never saved
        # as if it were video data.
        resp.raise_for_status()
        with open(os.path.join(path, str(i) + '.ts'), mode="wb") as f3:
            f3.write(resp.content)
    
    
    def download_all_videos(path, host):
        """Download every segment listed in the local ``index.m3u8``.

        The playlist must already exist in the current directory (it is
        written by ``get_m3u8_data``). Every line that does not start with
        ``#`` is resolved against ``host`` and handed to ``down_video``
        together with its running index.

        :param path: directory for the segments; created if missing.
        :param host: scheme and host (e.g. ``https://example.com``) used to
            complete the segment paths of the playlist.
        :return: None
        """
        os.makedirs(path, exist_ok=True)
        tasks = []
        i = 0
        # The context manager shuts the pool down once all downloads finished.
        with ThreadPoolExecutor(max_workers=50) as pool:
            with open("index.m3u8", mode="r", encoding="utf-8") as f:
                for line in f:
                    # Tag/comment lines are not urls.
                    if line.startswith("#"):
                        continue
                    # Fix: plain ``host + line`` only worked for rooted
                    # paths ("/a/b.ts"); urljoin gives the same result for
                    # those and also handles relative or absolute entries.
                    segment_url = urljoin(host, line.strip())
                    print(segment_url, i)
                    tasks.append(pool.submit(down_video, segment_url, i))
                    i += 1
            # Wait for every download to finish.
            wait(tasks)

        # Exceptions raised inside worker threads stay hidden in the
        # futures; report them so missing segments are noticed.
        for index, task in enumerate(tasks):
            error = task.exception()
            if error is not None:
                print('segment {} failed: {!r}'.format(index, error))
    
    
    # Rewrite the key and segment urls of the m3u8 playlist to local paths.
    def do_m3u8_url(url, path, m3u8_filename="index.m3u8"):
        """Write a local copy of the playlist into ``path`` and fetch its key.

        Segment lines are replaced, in order, by ``<cwd>/<path>/<n>.ts``.
        For an ``#EXT-X-KEY:METHOD=AES-128,URI="..."`` line the key is
        downloaded to ``<path>/key.m3u8`` and the URI is replaced by that
        local path. All other tag lines are copied unchanged.

        :param url: url of the playlist; the key URI is resolved against it.
        :param path: segment directory, relative to the current directory;
            created if missing.
        :param m3u8_filename: playlist read from the current directory and
            written under the same name inside ``path``.
        :return: None
        """
        os.makedirs(path, exist_ok=True)
        with open(m3u8_filename, mode="r", encoding="utf-8") as source:
            data = source.readlines()

        abs_path = os.getcwd()
        local_key = os.path.join(abs_path, path) + '/key.m3u8'
        key_tag = re.compile(r'(#EXT-X-KEY:METHOD=AES-128,URI=")(.*?)"')
        i = 0
        # Fix: the output was opened without encoding and never closed.
        with open(os.path.join(path, m3u8_filename), 'w', encoding="utf-8") as fw:
            for line in data:
                if line.startswith("#"):
                    found = key_tag.search(line) if line.find('URI') != -1 else None
                    if found is not None:
                        # Fix: fetch the key from the URI the playlist
                        # actually names (resolved against the playlist
                        # url) instead of guessing "<playlist dir>/key.key".
                        download_m3u8(urljoin(url, found.group(2)),
                                      os.path.join(path, 'key.m3u8'))
                        # A function replacement keeps backslashes in the
                        # local path from being read as regex escapes.
                        line = key_tag.sub(
                            lambda m: m.group(1) + local_key + '"', line)
                    fw.write(line)
                else:
                    fw.write(f'{abs_path}/{path}/{i}.ts\n')
                    i += 1
    
    def download_m3u8(url, m3u8_filename="index.m3u8", state=0):
        """Download ``url`` and store the response body in ``m3u8_filename``.

        Besides playlists this is also used to fetch the AES key file, so
        the body is written as raw bytes. Uses the module-level ``headers``.

        :param url: url to download.
        :param m3u8_filename: path of the file to write.
        :param state: unused; kept for compatibility with existing callers.
        :return: None
        :raises requests.RequestException: on network or HTTP errors.
        """
        print('正在下载index.m3u8文件')
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        # Fix: the body was written as decoded text re-encoded to UTF-8,
        # which corrupts a binary AES key; write the bytes unchanged.
        with open(m3u8_filename, mode="wb") as f:
            f.write(resp.content)
    
    def merge(filePath, filename='output'):
        """Merge the downloaded segments into ``<filename>.mp4`` with ffmpeg.

        ffmpeg is run inside ``filePath`` on the rewritten ``index.m3u8``
        with ``-c copy`` (stream copy, no re-encoding); the original author
        recommends this approach to avoid audio/video desync.

        :param filePath: directory containing ``index.m3u8``, the key file
            and the segments.
        :param filename: output name without the ``.mp4`` extension.
        :return: None
        :raises FileNotFoundError: if ffmpeg is not installed / not on PATH.
        :raises subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        import subprocess  # local import: not in this script's import block

        # Fix: the original called os.chdir(path) on the module-level
        # ``path`` and ignored ``filePath``. Running ffmpeg with cwd= also
        # avoids changing the working directory of the whole process, and
        # the argument list keeps ``filename`` away from the shell.
        command = ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', f'{filename}.mp4']
        subprocess.run(command, cwd=filePath, check=True)
    
    def get_m3u8_data(first_m3u8_url):
        """Save the segment-listing playlist as ``index.m3u8``; return its url.

        The first playlist normally just points at a second one that lists
        the .ts files. That second playlist is downloaded and written to
        ``index.m3u8`` in the current directory. If the first playlist
        already lists the segments, it is saved as-is.

        Uses the module-level ``headers``.

        :param first_m3u8_url: url of the first (outer) index.m3u8.
        :return: url of the playlist that was saved.
        :raises requests.RequestException: on network or HTTP errors.
        """
        session = requests.Session()

        # First request: the outer m3u8.
        resp = session.get(first_m3u8_url, headers=headers, timeout=30)
        resp.raise_for_status()
        resp.encoding = 'UTF-8'
        data = resp.text

        # Fix: take the first non-tag line instead of
        # data.split('/', 3)[-1], which only worked for one path depth.
        entries = [line.strip() for line in data.splitlines()
                   if line.strip() and not line.lstrip().startswith('#')]
        first = entries[0] if entries else ''
        points_to_playlist = ('#EXT-X-STREAM-INF' in data
                              or first.split('?', 1)[0].endswith('.m3u8'))
        if first and points_to_playlist:
            # Second request: the playlist that lists all the ts files.
            second_m3u8_url = urljoin(first_m3u8_url, first)
            resp = session.get(second_m3u8_url, headers=headers, timeout=30)
            resp.raise_for_status()
        else:
            # The first playlist already is the segment list.
            second_m3u8_url = first_m3u8_url
        with open('index.m3u8', 'wb') as f:
            f.write(resp.content)
        return second_m3u8_url
    
    
    if __name__ == '__main__':
        from urllib.parse import urlsplit

        # Directory where the .ts segments are stored; down_video() reads
        # this module-level name directly.
        path = 'ts'
        # Url of the outer index.m3u8 of an encrypted stream.
        url = 'https://s7.fsvod1.com/20220622/5LnZiDXn/index.m3u8'
        meu8_url = get_m3u8_data(url)
        # Scheme + host used to complete the segment and key paths. Derived
        # from the playlist url instead of being hard-coded a second time,
        # so changing ``url`` above is enough.
        location = urlsplit(meu8_url)
        host = '{}://{}'.format(location.scheme, location.netloc)
        # Download all .ts segments listed in index.m3u8.
        download_all_videos(path, host)
        # Fetch the key and write the local playlist into ``path``.
        do_m3u8_url(meu8_url, path)

        # Merge the segments into one mp4 file.
        merge(path, '奇异博士')
        print('over')
    
  • This is done! We successfully decrypted and used ffmpeg to merge these ts video clips. The actual application scenario may be different from this. Specific analysis will be done on specific websites.

Guess you like

Origin blog.csdn.net/weixin_53909748/article/details/131729958