Crawling a small video site and downloading the videos - batch downloading a website's video library

First, a disclaimer: the "small videos" here are not that other kind of small video; you know what I mean.

Target: https://www.vmovier.com/ (the site seems to be called Xinpianchang, 新片场)

 

"""
First sends a request to https://www.vmovier.com/
Get a response, parse the response
All title and filter out links
To send a request to filter out links to download video
Sending a request to the src attribute, acquisition response, saves the contents of the local
Note: This is a dynamic page is js data capture interface required
Use the packet capture tool Fiddler
Interface information: https:? //Www.vmovier.com/post/getbytab tab = new & page = 1 & pagepart = 2 & type = 0 & controller = index & last_postid = 58197
"""
import requests
from bs4 import BeautifulSoup
import time
import json
from lxml import etree
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Create an Options object so Chrome can be driven in headless (no-window) mode
chrome_options = Options()
# headless mode
chrome_options.add_argument('--headless')
# disable the GPU
chrome_options.add_argument('--disable-gpu')
# chromedriver path
path = r'D:\chromedriver\chromedriver.exe'

# headers used globally
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

# There is a redirect here that feels like an anti-crawl mechanism, so filter the page once
# more with a regex to get the url that the redirect leads to
def FanPa(r):
    # regex that pulls the redirect target out of the meta tag
    match_obj = re.compile(r'<meta property="article:published_first" content="新片场,(.*)" />')
    url = re.findall(match_obj, r.text)
    # print(url)
    # exit()
    return url


# Parse the home page interface, collect every title and link, and download each video
def handle_title():
    # enter the page number you want to crawl
    page = int(input("Please enter the page number you want to crawl: "))
    print("Start crawling page %s ....." % page)
    # this is a dynamic page, so the data interface was captured and the requests go to it
    # page = the page number, pagepart = which refresh of that page; every page refreshes
    # three times (1, 2, 3), which is why we loop over them
    for t in range(1, 4):
        url = "https://www.vmovier.com/post/getbytab?tab=new&page=%s&pagepart=%d" % (page, t)
        # print(url)
        # exit()
        r = requests.get(url=url, headers=headers)
        # parse the content; the interface returns json, so parse it as json first
        # it later turns out the payload is not strictly json, so the data section is pulled
        # out of the json object and filtered with regexes
        # the title and link we need are the h1 tag's title and href
        # turn the json data into a Python object
        obj = json.loads(r.text)
        # print(obj)
        # exit()
        # take out the 'data' field that holds all the video titles and urls, then use regexes
        data = obj['data']
        # print(data)
        # exit()
        match_obj_url = re.compile(r'<a href="(.*)" title=".*全文" target="_blank">阅读全文...</a>')
        url = re.findall(match_obj_url, data)
        # print(url)
        # print(len(url))
        match_obj_title = re.compile(r'<a href=".*" title="(.*)全文" target="_blank">阅读全文...</a>')
        title = re.findall(match_obj_title, data)
        # print(title)
        # print(len(title))
        # exit()
        # loop over both lists and pull out each video's information
        # the title can be used directly, but the spliced url is not final: there are 2 more hops
        for i in range(0, 15):  # both lists hold 15 items each
            end_title = title[i]
            # print(end_title)
            a_href = "https://www.vmovier.com" + url[i]
            # print(a_href)
            # exit()
            # this is not the final video url, it still needs a redirect
            r = requests.get(url=a_href, headers=headers)
            end_href = FanPa(r)  # this is the second-to-last layer of the url
            # print(end_href)
            # exit()
            video_src = get_video_url(end_href)
            # print(video_src)
            # exit()
            # once the original address is found, start downloading the video
            print("Start downloading %s ..." % end_title)
            filepath = 'shipin/' + end_title + '.mp4'  # the shipin/ directory must already exist
            r = requests.get(url=video_src)
            with open(filepath, 'wb') as fp:
                fp.write(r.content)
            print("%s download complete" % end_title)


# Send a request to the intermediate page, parse the content, and get the video src
def get_video_url(end_href):
    # print(end_href)  # end_href is a list with only one element, hence the [0] subscript
    # exit()
    # r = requests.get(url=end_href[0], headers=headers)
    # print(r.text)
    # exit()
    # printing the page returned by requests showed no video link at all,
    # most likely another anti-crawl mechanism, so use the ultimate weapon: a headless browser
    browser = webdriver.Chrome(executable_path=path, options=chrome_options)
    browser.get(end_href[0])
    time.sleep(3)
    # get the page source, build a soup tree from it, then look for the video's src attribute
    # print(browser.page_source)
    # exit()
    soup = BeautifulSoup(browser.page_source, 'lxml')
    video_src = soup.find('video', id="xpc_video")['src']
    # print(video_src)
    # exit()
    return video_src

def main():
    # parse the home page, return all the titles and links, and download the videos
    handle_title()


if __name__ == '__main__':
    main()

 

The main issues to be aware of are:

1. The page lazy-loads its content, so capture its data interface; the titles and urls are then filtered out of what the interface returns (see the first sketch below).

2. The site has a redirect step that feels like an anti-crawl mechanism; printing the HTML at each hop is an easy way to confirm that the page you got is the one you actually need.

3. Some of the returned data, such as the 'data' field, is not strictly JSON, so pull it out and filter it with regular expressions (also covered in the first sketch below).

4. Sometimes a request sent with the requests module simply cannot get the page we want; in that case a browser driver (headless Chrome) may be needed to fetch the rendered page content (see the second sketch below).
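
A minimal sketch of points 1 and 3: query the captured getbytab interface and filter its not-quite-JSON 'data' field with regular expressions. The query parameters and regex patterns here are illustrative, simplified from the full script above; adjust them if the interface or markup has changed.

import json
import re

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
# interface captured with Fiddler; the page/pagepart values are just examples
api = 'https://www.vmovier.com/post/getbytab?tab=new&page=1&pagepart=1'

r = requests.get(url=api, headers=headers)
obj = json.loads(r.text)   # the response body itself parses as JSON
data = obj['data']         # but 'data' is an HTML fragment, not nested JSON

# so the article links and titles are pulled out of the fragment with regexes
links = re.findall(r'<a href="(.*?)" title=".*?" target="_blank">', data)
titles = re.findall(r'<a href=".*?" title="(.*?)" target="_blank">', data)
print(list(zip(titles, links)))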
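
And a minimal sketch of point 4, the headless-browser fallback, assuming Selenium 3 with a local chromedriver. The post url and chromedriver path are placeholders, and the xpc_video element id is the one the full script above relies on.

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Selenium 3 style driver setup; the chromedriver path is an example
browser = webdriver.Chrome(executable_path=r'D:\chromedriver\chromedriver.exe',
                           options=chrome_options)
post_url = 'https://www.vmovier.com/xxxxx'  # placeholder: one of the urls returned by FanPa()
browser.get(post_url)
time.sleep(3)  # crude wait for the JS player to render

# once the JS has run, the <video> tag exists and its src can be read
soup = BeautifulSoup(browser.page_source, 'lxml')
video_src = soup.find('video', id='xpc_video')['src']
browser.quit()
print(video_src)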

result:

 

 
