使用python下载简单的m3u8视频
流程
准备一些数据
import re
import requests
@property
def url(self):
    """The m3u8 URL this downloader was constructed with (read-only)."""
    return self._url
def _get_m3u8_content(self):
    """Fetch and cache the raw m3u8 playlist text.

    Returns the playlist body as str; raises RuntimeError when the
    HTTP request does not succeed.
    """
    if self._m3u8_content is None:
        headers = {
            # Some servers reject requests without a browser user-agent.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        page = requests.get(self._url, headers=headers)
        # requests.get never returns None, so the original `is None`
        # check could never fire; test the HTTP status instead.
        if not page.ok:
            raise RuntimeError("can't get url's content")
        self._m3u8_content = page.text
    return self._m3u8_content
def _get_head_url(self):
if self._head_url is None:
find = re.findall(r'(.*/).*\.m3u8?', self._url)
if find:
self._head_url = find[0]
else:
raise RuntimeError("can't get head url")
return self._head_url
def _get_urls(self):
if self._urls is None:
urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
if urls:
if not (re.match(r'^http', urls[0]) or re.match(r'^https', urls[0])):
head_url = self._get_head_url()
urls = list(map(lambda x: head_url + x, urls))
else:
raise RuntimeError("can't find urls")
self._urls = urls
return self._urls
解密
def _get_is_encrypted(self):
if self._is_encrypted is None:
if re.match(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):
self._is_encrypted = True
else:
self._is_encrypted = False
return self._is_encrypted
def _get_encrypted_line(self):
if self._encrypted_line is None:
find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
if find:
self._encrypted_line = find[0]
else:
raise RuntimeError("can't get encrypted line")
return self._encrypted_line
def _get_encrypted_key(self):
    """Download and cache the AES key referenced by the #EXT-X-KEY line.

    Falls back to '<head_url>key.key' when no URI attribute is present.
    Raises RuntimeError when the key cannot be downloaded.
    """
    if self._encrypted_key is None:
        find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        head_url = self._get_head_url()
        if find:
            key = find[0]
            # Prepend the playlist base only for relative key URIs.
            # (The original passed the *list* `find` to re.match, which
            # raised TypeError whenever a URI was found.)
            if not key.startswith(head_url):
                key = head_url + key
        else:
            key = head_url + 'key.key'
        req = requests.get(key)
        if req:
            self._encrypted_key = req.content
        else:
            raise RuntimeError("can't get encrypted key")
    return self._encrypted_key
def _get_encrypted_iv(self):
if self._encrypted_iv is None:
find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
if find:
self._encrypted_iv = find[0]
else:
raise RuntimeError("can't get encrypted iv")
return self._encrypted_iv
def _get_decrypt_content(self, content, key, iv=None):
    """AES-decrypt one segment: ECB when no IV is given, CBC otherwise.

    The IV extracted from the playlist is a hex string like '0x1234...';
    AES.new requires 16 raw bytes, so str IVs are converted here.
    """
    if iv is None:
        aes = AES.new(key, AES.MODE_ECB)
    else:
        if isinstance(iv, str):
            hex_iv = iv[2:] if iv.lower().startswith('0x') else iv
            # Left-pad to 16 bytes, matching the HLS IV convention.
            iv = bytes.fromhex(hex_iv.zfill(32))
        aes = AES.new(key, AES.MODE_CBC, iv)
    return aes.decrypt(content)
def _decrypt_content(self, content):
if self._get_is_encrypted():
key = self._encrypted_key()
iv = self._encrypted_iv()
content = self._get_decrypt_content(content, key, iv)
return content
多进程异步下载
- 将程序分为三个部分,异步下载段视频、合并段视频、写入到文件
- 使用进程是为了利用多核cpu,每个进程负责一个部分
进程间共用的数据
self._data = m.dict({
'get': [
],
'max_count': len(self._get_urls()),
'write_count': None,
})
异步下载段视频部分
async def _get_rep(self, session, segment_url, count):
if count <= 0:
return None
count -= 1
try:
async with session.get(segment_url) as rep:
return rep
except RuntimeError:
return self._get_rep(session, segment_url, count)
def _process_bar(self, cur, end):
print("\r", end='')
print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
'▋' * int((cur / end) * 100),
end='')
sys.stdout.flush()
async def _async_get_segment_content(self, segment_url, semaphore, index):
    """Download one .ts segment, decrypt it, and append it to the shared list.

    On any failure an empty placeholder is recorded so the merger/writer
    never stalls on a missing index.  Returns the segment bytes (b'' on
    failure).  The original left `content` unbound — and returned it —
    when the response was falsy or an exception fired before assignment.
    """
    content = b''  # placeholder; also the recorded value on failure
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(segment_url) as rep:
                    if rep:
                        content = await rep.read()
                        content = self._decrypt_content(content)
                with self._lock:
                    data = self._data['get']
                    data.append({'start': index, 'end': index, 'content': content})
                    self._data['get'] = data
            except Exception:
                # Record an empty segment so indices stay contiguous.
                with self._lock:
                    data = self._data['get']
                    data.append({'start': index, 'end': index, 'content': b''})
                    self._data['get'] = data
            self._process_bar(index + 1, self._data['max_count'])
    return content
def _get_all_contents(self):
loop = asyncio.new_event_loop()
semaphore = asyncio.Semaphore(self._run_count)
urls = self._get_urls()
tasks = []
for k, v in enumerate(urls):
task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
- 遗憾:
- 当session.get发生错误时,没有实现重试若干次再放弃的逻辑
- 无法用列表生成式生成任务
- aiohttp.ClientSession()返回的变量没有复用
合并段视频部分
def _merge_segment_content(self):
while True:
if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
break
if len(self._data['get']) <= 1:
time.sleep(2)
continue
with self._lock:
for k, v in enumerate(self._data['get']):
for i, j in enumerate(self._data['get']):
if k == i:
continue
if v['end'] + 1 == j['start']:
get_data = self._data['get']
get_data[k]['content'] += get_data[i]['content']
get_data[k]['end'] = get_data[i]['end']
get_data.pop(i)
self._data['get'] = get_data
break
else:
continue
break
time.sleep(1)
写入到文件部分
def _write_segment_content(self):
while True:
if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
break
if len(self._data['get']) <= 0:
continue
content = None
with self._lock:
for k, v in enumerate(self._data['get']):
if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
content = v['content']
self._data['write_count'] = v['end']
get_list = self._data['get']
get_list.pop(k)
self._data['get'] = get_list
break
if content:
with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
f.write(content)
time.sleep(0.05)
全部代码
import asyncio
import os.path
import re
import sys
import time
from multiprocessing import Lock, Manager, Process
import aiohttp
import requests
from Crypto.Cipher import AES
class M3u8(object):
    """Download a simple m3u8 video.

    Parameters
    ----------
    m3u8_url : str
        URL of the .m3u8 playlist.
    run_count : int
        Maximum number of concurrent requests.
    save_file_dir : str, optional
        Directory to save the output file (default './').
    save_file_name : str, optional
        Name of the output file (default 'mv.mp4').
    """

    def __init__(self, m3u8_url, run_count, save_file_dir=None, save_file_name=None) -> None:
        # Apply defaults *before* storing them: the original stored the raw
        # arguments first and only defaulted the locals, leaving
        # self._save_file_dir / self._save_file_name as None when omitted.
        if save_file_dir is None:
            save_file_dir = './'
        if save_file_name is None:
            save_file_name = 'mv.mp4'
        self._m3u8_url = m3u8_url
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        self._run_count = run_count
        # Lazily-populated caches used by the _get_* helpers.
        self._m3u8_content = None
        self._head_url = None
        self._is_encrypted = None
        self._encrypted_line = None
        self._encrypted_key = None
        self._encrypted_iv = None
        self._urls = None
        if not os.path.exists(save_file_dir):
            os.mkdir(save_file_dir)
        # Start from a clean output file: segments are appended in 'ab' mode.
        target = os.path.join(save_file_dir, save_file_name)
        if os.path.exists(target):
            os.remove(target)
def _get_m3u8_content(self):
    """Fetch and cache the raw m3u8 playlist text.

    Returns the playlist body as str; raises RuntimeError when the HTTP
    request does not succeed.  Fixes the unterminated user-agent string
    literal (a syntax error) and replaces the dead `page is None` check
    (requests.get never returns None) with a status test.
    """
    if self._m3u8_content is None:
        headers = {
            # Some servers reject requests without a browser user-agent.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        page = requests.get(self._m3u8_url, headers=headers)
        if not page.ok:
            raise RuntimeError("can't get url's content")
        self._m3u8_content = page.text
    return self._m3u8_content
def _get_head_url(self):
if self._head_url is None:
find = re.findall(r'(.*/).*\.m3u8?', self._m3u8_url)
if find:
self._head_url = find[0]
else:
raise RuntimeError("can't get head url")
return self._head_url
def _get_is_encrypted(self):
if self._is_encrypted is None:
if re.match(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):
self._is_encrypted = True
else:
self._is_encrypted = False
return self._is_encrypted
def _get_encrypted_line(self):
if self._encrypted_line is None:
find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
if find:
self._encrypted_line = find[0]
else:
raise RuntimeError("can't get encrypted line")
return self._encrypted_line
def _get_encrypted_key(self):
    """Download and cache the AES key referenced by the #EXT-X-KEY line.

    Falls back to '<head_url>key.key' when no URI attribute is present.
    Raises RuntimeError when the key cannot be downloaded.  The original
    passed the *list* `find` to re.match (TypeError whenever a URI was
    found); the intent — prefix only relative key URIs — is kept.
    """
    if self._encrypted_key is None:
        find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        head_url = self._get_head_url()
        if find:
            key = find[0]
            if not key.startswith(head_url):
                key = head_url + key
        else:
            key = head_url + 'key.key'
        req = requests.get(key)
        if req:
            self._encrypted_key = req.content
        else:
            raise RuntimeError("can't get encrypted key")
    return self._encrypted_key
def _get_encrypted_iv(self):
if self._encrypted_iv is None:
find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
if find:
self._encrypted_iv = find[0]
else:
raise RuntimeError("can't get encrypted iv")
return self._encrypted_iv
def _get_decrypt_content(self, content, key, iv=None):
    """AES-decrypt one segment: ECB when no IV is given, CBC otherwise.

    The IV extracted from the playlist is a hex string like '0x1234...';
    AES.new requires 16 raw bytes, so str IVs are converted here.
    """
    if iv is None:
        aes = AES.new(key, AES.MODE_ECB)
    else:
        if isinstance(iv, str):
            hex_iv = iv[2:] if iv.lower().startswith('0x') else iv
            # Left-pad to 16 bytes, matching the HLS IV convention.
            iv = bytes.fromhex(hex_iv.zfill(32))
        aes = AES.new(key, AES.MODE_CBC, iv)
    return aes.decrypt(content)
def _decrypt_content(self, content):
if self._get_is_encrypted():
key = self._encrypted_key()
iv = self._encrypted_iv()
content = self._get_decrypt_content(content, key, iv)
return content
def _get_urls(self):
if self._urls is None:
urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
if urls:
if not (re.match(r'^http', urls[0]) or re.match(r'^https', urls[0])):
head_url = self._get_head_url()
urls = list(map(lambda x: head_url + x, urls))
else:
raise RuntimeError("can't find urls")
self._urls = urls
return self._urls
def _process_bar(self, cur, end):
print("\r", end='')
print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
'▋' * int((cur / end) * 100),
end='')
sys.stdout.flush()
async def _async_get_segment_content(self, segment_url, semaphore, index):
    """Download one .ts segment, decrypt it, and append it to the shared list.

    On any failure an empty placeholder is recorded so the merger/writer
    never stalls on a missing index.  Returns the segment bytes (b'' on
    failure).  The original left `content` unbound — and returned it —
    when the response was falsy or an exception fired before assignment.
    """
    content = b''  # placeholder; also the recorded value on failure
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(segment_url) as rep:
                    if rep:
                        content = await rep.read()
                        content = self._decrypt_content(content)
                with self._lock:
                    data = self._data['get']
                    data.append({'start': index, 'end': index, 'content': content})
                    self._data['get'] = data
            except Exception:
                # Record an empty segment so indices stay contiguous.
                with self._lock:
                    data = self._data['get']
                    data.append({'start': index, 'end': index, 'content': b''})
                    self._data['get'] = data
            self._process_bar(index + 1, self._data['max_count'])
    return content
def _get_all_contents(self):
loop = asyncio.new_event_loop()
semaphore = asyncio.Semaphore(self._run_count)
urls = self._get_urls()
tasks = []
for k, v in enumerate(urls):
task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
def _merge_segment_content(self):
while True:
if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
break
if len(self._data['get']) <= 1:
time.sleep(2)
continue
with self._lock:
for k, v in enumerate(self._data['get']):
for i, j in enumerate(self._data['get']):
if k == i:
continue
if v['end'] + 1 == j['start']:
get_data = self._data['get']
get_data[k]['content'] += get_data[i]['content']
get_data[k]['end'] = get_data[i]['end']
get_data.pop(i)
print('\n合并视频-'+str(v['start'])+'<<'+str(j['start']))
self._data['get'] = get_data
break
else:
continue
break
time.sleep(1)
def _write_segment_content(self):
while True:
if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
break
if len(self._data['get']) <= 0:
continue
content = None
with self._lock:
for k, v in enumerate(self._data['get']):
if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
content = v['content']
self._data['write_count'] = v['end']
get_list = self._data['get']
get_list.pop(k)
print('\n写入视频-' + str(v['start']))
self._data['get'] = get_list
break
if content:
with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
f.write(content)
time.sleep(0.05)
def run(self):
    """Spawn downloader / merger / writer processes and wait for completion.

    The three workers communicate through a Manager dict guarded by a
    multiprocessing Lock.
    """
    self._lock = Lock()
    with Manager() as manager:
        # Shared state: finished segments, total count, last index written.
        self._data = manager.dict({
            'get': [],
            'max_count': len(self._get_urls()),
            'write_count': None,
        })
        workers = [
            Process(target=target)
            for target in (self._get_all_contents,
                           self._merge_segment_content,
                           self._write_segment_content)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
if __name__ == '__main__':
    # Example: download to ./mv/mv.mp4 with at most 3 concurrent requests.
    source_url = r'xxx/index.m3u8'
    downloader = M3u8(source_url, 3, './mv', 'mv.mp4')
    downloader.run()
- 遗憾:
- 无法使用进程池
- 无法直接修改进程共享变量的最深层数据
补充
同步请求
def _download_single_mv(self, url):
    """Download one segment synchronously and return the decrypted bytes.

    Raises RuntimeError when the request fails.  The original f-string
    was broken across two lines inside single quotes (a syntax error)
    and misspelled "single".
    """
    req = requests.get(url)
    if not req:
        raise RuntimeError(f"can't download single mv, url : {url}")
    return self._decrypt_content(req.content)
def _download_all_ts(self, save_file_dir):
urls = self._get_urls()
for k, url in enumerate(urls):
content = self._download_single_mv(url)
with open(save_file_dir + '/0' + str(k) + '.ts', 'wb') as f:
f.write(content)
def _download_all_ts_and_merge(self, save_file_dir, save_file_name):
urls = self._get_urls()
with open(save_file_dir + '/' + save_file_name, 'ab') as f:
for url in urls:
content = self._download_single_mv(url)
f.write(content)
异步请求
def _async_write_file(self, content, index):
_save_file_dir = self._save_file_dir
with open(_save_file_dir + '/0' + str(index) + '.ts', 'wb') as f:
f.write(content)
async def _async_get(self, url, semaphore, index):
    """Fetch one segment (bounded by *semaphore*), decrypt it, write it to disk.

    Returns the decrypted bytes.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as rep:
                payload = await rep.read()
                payload = self._decrypt_content(payload)
                self._async_write_file(payload, index)
                return payload
def _async_run(self, count):
loop = asyncio.new_event_loop()
semaphore = asyncio.Semaphore(count)
tasks = []
urls = self._get_urls()
for k, url in enumerate(urls):
task = asyncio.ensure_future(self._async_get(url, semaphore, k), loop=loop)
tasks.append(task)
loop.run_until_complete(asyncio.wait(tasks))
合并
def _del_all_ts_files(self, del_files_dir):
files = [x for x in os.listdir(del_files_dir) if os.path.isfile(del_files_dir + '/' + x) and os.path.splitext(del_files_dir + '/' + x)[1] == '.ts']
for file in files:
os.remove(del_files_dir + '/' + file)
- zipfile
def _zipfile_merge(self, save_file_dir, save_file_name, is_del=True):
files = os.listdir(save_file_dir)
with zipfile.ZipFile(os.path.join(save_file_dir, save_file_name), 'a') as z:
for file in files:
z.write(save_file_dir + '/' + file)
if is_del is True:
self._del_all_ts_files(save_file_dir)
- copy 命令 合并文件
def _copy_command_merge(self, save_file_dir, save_file_name, is_del=True):
    """Merge all .ts files via the Windows `copy /b` shell command.

    The working directory is now always restored (try/finally) — the
    original only chdir'ed back when *is_del* was True, leaving the
    process in *save_file_dir* otherwise.  Windows-only (`copy`).
    """
    cur_dir = os.path.abspath('.')
    os.chdir(save_file_dir)
    try:
        # Binary-concatenate every .ts, then give the result its final name.
        os.system("copy /b *.ts new.tmp")
        os.rename("new.tmp", save_file_name)
    finally:
        os.chdir(cur_dir)
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
- ffmpeg 合并文件
def _ffmpeg_merge(self, save_file_dir, save_file_name, is_del=True):
m3u8_content = self._get_m3u8_content()
m3u8_content = re.sub(r'(#EXT-X-KEY:METHOD.*\n)', '', m3u8_content)
urls = re.findall(r'(h.*\.ts)', m3u8_content)
for k, v in enumerate(urls):
m3u8_content = re.sub(v, k, m3u8_content, count=1)
cur_dir = os.path.abspath('.')
os.chdir(save_file_dir)
with open('index.m3u8', 'wb') as f:
f.write(m3u8_content)
os.system("ffmpeg -i index.m3u8 -c copy " + save_file_name)
os.chdir(cur_dir)
if is_del is True:
self._del_all_ts_files(save_file_dir)
- 下载段视频到内存后直接追加到磁盘中的视频文件,不使用临时.ts文件