Improving the crawling efficiency of the requests module

Part 1: ways to improve the crawling efficiency of the requests module

  1. Multi-threading and multi-processing (not recommended)

  2. Thread pools or process pools (use where appropriate)

  3. Single-threading + asynchronous coroutines (recommended for crawlers)

Part 2: crawling videos and saving them locally

import re
import time
import random
import requests
from lxml import etree

start_time = time.time()

url = "https://www.pearvideo.com/category_3"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}

ex = 'srcUrl="(.*?)",vdoUrl=srcUrl'

def request_video(url):
    "" " 
    Sends a request to the video-linked 
    " "" 
    return requests.get (URL = URL, headers = headers) .content 

DEF save_video (Content):
     "", " 
    save the binary data to the local video 
    " "" 
    video_name = STR ( the random.randint (100, 999)) + " .mp4 " 
    with Open (video_name, ' WB ' ) AS F: 
        f.write (Content) 

        
# Get Home Source 
page_text = requests.get (URL = URL, headers = headers) .text 

Tree = etree.HTML (page_text) 
li_list = tree.xpath ( ' // UL [@ class = "listvideo-list clearfix"]/li')

video_url_list = List ()
 for Li in li_list: 
    detail_url = " https://www.pearvideo.com/ " + li.xpath ( ' ./div/a/@href ' ) [0] 
    
    # Get the source of the video page 
    = requests.get detail_page_text (URL = detail_url, headers = headers) .text 
    
    # regular URL matching video 
    video_url = the re.findall (EX, detail_page_text, re.S) [0] 
    video_url_list.append (video_url) 
    
    Content = request_video (video_url ) 
    save_video (Content) 


Print ( " time-consuming: ", time.time() - start_time)

You may also like

Origin www.cnblogs.com/youhongliang/p/12177380.html