Part 1: Ways to improve the efficiency of a requests-based crawler
1. Multi-threading and multi-processing (not recommended)
2. Thread pools or process pools (use where appropriate)
3. Single-threaded + asynchronous coroutines (recommended for crawlers)
Part 2: Crawling videos and saving them locally
import random
import re
import time

import requests
from lxml import etree

# Listing page for the "funny" video category on pearvideo.com.
URL = "https://www.pearvideo.com/category_3"

# Spoof a desktop browser so the site serves the normal HTML page.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

# The real .mp4 address is embedded in inline JavaScript on the detail
# page as:  srcUrl="...",vdoUrl=srcUrl  — capture the quoted URL.
VIDEO_SRC_PATTERN = 'srcUrl="(.*?)",vdoUrl=srcUrl'


def request_video(video_url):
    """Download the video at *video_url* and return its raw bytes."""
    return requests.get(url=video_url, headers=HEADERS).content


def save_video(content):
    """Write binary video *content* to a local .mp4 file.

    The filename is a random 3-digit number, so collisions are possible
    when downloading many videos — acceptable for this demo script.
    """
    video_name = str(random.randint(100, 999)) + ".mp4"
    with open(video_name, "wb") as f:
        f.write(content)


def main():
    """Crawl the category page and download every listed video."""
    start_time = time.time()

    # Fetch the category listing and extract one <li> per video entry.
    page_text = requests.get(url=URL, headers=HEADERS).text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="listvideo-list clearfix"]/li')

    video_url_list = []
    for li in li_list:
        detail_url = "https://www.pearvideo.com/" + li.xpath("./div/a/@href")[0]

        # The video source URL only appears in the detail page's inline JS,
        # so it must be pulled out with a regex rather than XPath.
        detail_page_text = requests.get(url=detail_url, headers=HEADERS).text
        matches = re.findall(VIDEO_SRC_PATTERN, detail_page_text, re.S)
        if not matches:
            # Page layout changed or entry has no embedded source; skip it
            # instead of crashing with an IndexError.
            continue
        video_url = matches[0]
        video_url_list.append(video_url)

        save_video(request_video(video_url))

    print("time-consuming:", time.time() - start_time)


if __name__ == "__main__":
    main()