Python web crawler: downloading videos with multiple threads

Finding a website and downloading its videos one by one by hand is far too much trouble. What can you do? Write a crawler and let it do the work.
The multi-threaded video-download crawler works in four steps:

  • Use requests to fetch page content and regular expressions to extract the URLs
  • Parse the HTML page to obtain the mp4 address
  • Store the URLs in a thread-safe queue
  • Have multiple threads take mp4 addresses from the queue and download them concurrently (see the sketch after this list)
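Before the full script, here is a minimal, hypothetical sketch of the producer-consumer pattern those steps describe, using only the standard-library queue and threading modules (the URLs are placeholders):

import queue
import threading

url_queue = queue.Queue(maxsize=3)      # bounded, thread-safe queue

def worker():
    while True:
        url = url_queue.get()           # blocks until a URL is available
        print('downloading', url)       # real code would fetch and save the file here
        url_queue.task_done()

for _ in range(5):
    threading.Thread(target=worker, daemon=True).start()

for u in ['http://example.com/a.mp4', 'http://example.com/b.mp4']:
    url_queue.put(u)                    # blocks when the queue is full

url_queue.join()                        # wait until every queued URL is processed

The bounded queue is what couples the two sides: when the downloaders fall behind, put() blocks and the producer is automatically throttled. The full script below follows the same shape.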
import requests
import re
import os
import queue
import threading
import shutil

def download_start():
    download_url_queue = queue.Queue(3)   # bounded queue shared between the crawl loop and the download threads
    mp4_code_set = set()                  # codes already seen, to avoid re-queuing duplicates
    page = 10
    store_location = '/Users/Downloads/.dyxx/'    # where downloaded files are stored
    download_site_home = "https://xxxxxx.com/"   # the video site to crawl; you will have to find one yourself
    mp4_api_url = 'https://api.xxxxxx.com/get-mp4-url?code='  # the site's API: extract a code from the play page, then exchange it for the playable mp4 URL

    def download():
        # Worker loop: block until a URL arrives, then stream it to disk.
        while True:
            mp4_url = download_url_queue.get()   # a blocking get avoids busy-waiting on an empty queue
            try:
                file_path = store_location + mp4_url[-15:]   # name the file after the last 15 characters of the URL
                if not os.path.exists(file_path):
                    print('Download start::::' + mp4_url)
                    res_header = requests.head(mp4_url)   # cheap HEAD request to verify this really is a video
                    if res_header.headers.get('Content-Type') == 'video/mp4':
                        with open(file_path, "wb") as f, requests.get(mp4_url, stream=True) as res:
                            shutil.copyfileobj(res.raw, f)   # stream the body to disk without loading it all into memory
                    print('Download end::::' + mp4_url)
            except Exception as ee:
                print(str(ee))

    # Start five download workers that run alongside the crawl loop below.
    for t in range(5):
        threading.Thread(target=download, daemon=True).start()

    # Crawl loop: walk the listing pages, collect play-page links, and resolve mp4 addresses.
    while True:
        try:
            download_pages = download_site_home + '?page=' + str(page)
            res = requests.get(download_pages)
            if res.status_code == 200:
                re_href = re.compile(r'href="/\d{4}/[^"]*')   # play-page links start with a four-digit path segment
                all_href = re_href.findall(res.text)
                all_href.reverse()
                all_href_set = set(all_href[15:-15])   # drop the first and last 15 matches (presumably navigation links)
                for href_item in all_href_set:
                    play_page = download_site_home + href_item.replace('href="/', '')
                    play_page_res = requests.get(play_page)
                    if play_page_res.status_code == 200:
                        play_page_text = play_page_res.text
                        re_play_code = re.compile(r'data-code="[^"]*')   # each video on the play page carries a data-code attribute
                        mp4_play_codes = re_play_code.findall(play_page_text)
                        mp4_play_codes_set = set(mp4_play_codes)
                        for code in mp4_play_codes_set:
                            param_code = code.replace('data-code="', '')
                            if param_code in mp4_code_set:
                                continue   # already handled this video; skip it rather than abandoning the page
                            mp4_code_set.add(param_code)
                            mp4url = mp4_api_url + param_code   # exchange the code for the playable mp4 URL
                            mp4res = requests.get(mp4url)
                            if mp4res.status_code == 200:
                                file_path = store_location + mp4res.text[-15:]
                                if os.path.exists(file_path):
                                    continue   # file already on disk
                                print(mp4res.text + '   ' + param_code)
                                download_url_queue.put(mp4res.text)   # blocks when the queue is full, throttling the crawl
        except Exception as e:
            print(str(e))   # log the error so a bad page cannot silently kill the crawl
        page = page + 1     # always advance, otherwise an exception would retry the same page forever


if __name__ == '__main__':
    download_start()
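One caveat on the download step: shutil.copyfileobj(res.raw, f) copies the raw socket stream, which bypasses requests' content decoding, so if the server compresses the response the file on disk stays compressed. A common alternative, sketched here as a hypothetical helper under the same assumptions as the script above, is to write the body in chunks with iter_content, which decodes transfer encodings and keeps memory use constant:

import requests

def save_mp4(mp4_url, file_path):
    # Stream the video in 1 MiB chunks instead of copying the raw socket.
    with requests.get(mp4_url, stream=True, timeout=30) as res:
        res.raise_for_status()          # fail loudly on HTTP errors
        with open(file_path, 'wb') as f:
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

With a helper like this, the worker's with-block reduces to a single save_mp4(mp4_url, file_path) call, and the timeout keeps a stalled server from hanging a download thread forever.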

Origin: blog.csdn.net/zhou8622/article/details/126805695