Finding a website on the Internet and downloading its videos one by one by hand is too tedious. What can you do? Let a web crawler do the work.
Steps for downloading videos with a multi-threaded web crawler:
- Import requests to fetch web content, and use regular expressions to extract the URLs
- Parse the HTML page to get the MP4 address
- Store the URLs in a thread-safe queue
- Have multiple threads take MP4 addresses from the queue and download them concurrently
import requests
import re
import os
import queue
import threading
import shutil
def download_start():
    """Crawl the video site page by page and download every MP4 it finds.

    The calling thread walks the listing pages (starting at page 10), opens
    each linked play page, extracts the ``data-code`` values and asks the MP4
    API for the video URL that belongs to each code. The URLs are pushed onto
    a bounded queue that five worker threads drain, saving each video under
    ``store_location``.

    The crawl loop has no stop condition, so this function does not return;
    interrupt the process to end it. A page that raises an error is reported
    and retried after a short pause.
    """
    import time  # local import: only needed for the retry pause below

    # Bounded queue: put() blocks once 3 URLs are waiting, so the crawler
    # cannot run far ahead of the downloads.
    download_url_queue = queue.Queue(3)
    mp4_code_set = set()  # codes that have already been resolved in this run
    page = 10
    store_location = '/Users/Downloads/.dyxx/'  # directory the videos are stored in
    download_site_home = "https://xxxxxx.com/"  # site to download from; you have to find a real one yourself
    mp4_api_url = 'https://api.xxxxxx.com/get-mp4-url?code='  # the play page yields a code; the code yields the mp4 URL
    request_timeout = 30  # seconds; keeps a stalled connection from hanging a thread forever
    worker_count = 5

    # Compile once instead of on every page / play page.
    re_href = re.compile(r'href="/\d{4}/[^"]*')
    re_play_code = re.compile(r'data-code="[^"]*')

    # Without the target directory every open() in the workers would fail.
    os.makedirs(store_location, exist_ok=True)

    def download():
        """Worker loop: take MP4 URLs from the queue and save them to disk."""
        while True:
            # Blocking get() sleeps until a URL is available instead of
            # spinning on queue.empty().
            mp4_url = download_url_queue.get()
            # The last 15 characters of the URL serve as the file name
            # (assumes the URL tail is a usable, unique name - TODO confirm).
            file_path = store_location + mp4_url[-15:]
            # Download into a per-thread temporary file so that a failed or
            # interrupted transfer never leaves something that looks like a
            # finished video and gets skipped on the next run.
            part_path = file_path + '.' + str(threading.get_ident()) + '.part'
            try:
                if os.path.exists(file_path):
                    continue
                print('Download start::::' + mp4_url)
                res_header = requests.head(mp4_url, timeout=request_timeout)
                # .get() so a missing header means "not a video" rather than
                # a KeyError.
                if res_header.headers.get('Content-Type') != 'video/mp4':
                    continue
                with requests.get(mp4_url, stream=True,
                                  timeout=request_timeout) as res:
                    res.raise_for_status()
                    with open(part_path, "wb") as f:
                        shutil.copyfileobj(res.raw, f)
                os.replace(part_path, file_path)
                print('Download end::::' + mp4_url)
            except Exception as ee:
                # Best effort: report the failure and keep the worker alive.
                print(str(ee))
                try:
                    os.remove(part_path)
                except OSError:
                    pass  # nothing was written, or it is already gone

    for _ in range(worker_count):
        # Daemon threads: the crawl loop below never returns, and daemon
        # workers let Ctrl-C actually terminate the process.
        threading.Thread(target=download, daemon=True).start()

    while True:
        try:
            download_pages = download_site_home + '?page=' + str(page)
            res = requests.get(download_pages, timeout=request_timeout)
            if res.status_code == 200:
                # The first and last 15 matches are dropped (presumably
                # navigation links rather than videos - verify against the
                # site's markup). Order is irrelevant: the result is a set.
                all_href_set = set(re_href.findall(res.text)[15:-15])
                for href_item in all_href_set:
                    play_page = download_site_home + href_item.replace('href="/', '')
                    play_page_res = requests.get(play_page, timeout=request_timeout)
                    if play_page_res.status_code != 200:
                        continue
                    mp4_play_codes_set = set(re_play_code.findall(play_page_res.text))
                    for code in mp4_play_codes_set:
                        param_code = code.replace('data-code="', '')
                        # Skip only this code; the other codes on the page
                        # may still be new.
                        if param_code in mp4_code_set:
                            continue
                        mp4res = requests.get(mp4_api_url + param_code,
                                              timeout=request_timeout)
                        # Mark the code as seen only after the API answered,
                        # so a failed request is retried later.
                        mp4_code_set.add(param_code)
                        if mp4res.status_code != 200:
                            continue
                        file_path = store_location + mp4res.text[-15:]
                        if os.path.exists(file_path):
                            continue
                        print(mp4res.text + ' ' + param_code)
                        # Blocks while the queue is full.
                        download_url_queue.put(mp4res.text)
            page = page + 1
        except Exception as e:
            # The page counter was not advanced, so the same page is retried.
            # Report the error and pause briefly instead of hammering the
            # site in a silent tight loop.
            print('Crawl error on page ' + str(page) + ': ' + str(e))
            time.sleep(1)
if __name__ == "__main__":
    # Start the crawler only when this file is run as a script,
    # not when it is imported as a module.
    download_start()