The previous part obtained the download URL of each episode; this part uses those URLs to download the videos.
I wanted the download to show something like a progress bar. After some searching online I found a library that can do this: tqdm. For details on how to use it, see this blog post: https://www.jianshu.com/p/1ed2a8b2c77b
In the original class, add the following method for downloading files:
# Add this to the DownloadTV class as a @staticmethod.
def download_file(url, name):
    """Download ``url`` to the local file ``name``, showing a tqdm progress bar.

    Args:
        url: Direct download link of the file.
        name: Destination path; it is opened in binary write mode.

    Returns:
        None. Request failures (including HTTP error statuses) are reported
        on stdout instead of being raised. A response that declares a
        ``Content-Length`` of 0 is skipped and no file is created.
    """
    try:
        # The ``with`` block releases the streamed connection even if writing fails.
        with requests.get(url=url, stream=True) as response:
            # Do not save an HTTP error page as if it were the requested file.
            response.raise_for_status()

            # File size in KiB, taken from the response header. The header is
            # optional, so the size stays None (unknown) when it is absent.
            length = response.headers.get('Content-Length')
            content_size = int(length) / 1024 if length is not None else None
            if content_size == 0:
                return

            with open(name, "wb") as f:
                if content_size is not None:
                    print("total: ", content_size, 'k')
                # Each chunk is at most 1 KiB, so one iteration is roughly one 'k';
                # total=None makes tqdm show a plain counter.
                for data in tqdm(iterable=response.iter_content(chunk_size=1024),
                                 total=content_size, unit='k'):
                    f.write(data)
            print("\n done " + name)
    except RequestException as e:
        # Use % formatting so the exception is actually rendered into the message.
        print("接口错误信息为 %r" % e)
Then add a main function on top of the original code:
# Add this method to the DownloadTV class.
def main(self):
    """Look up a show's episode links and download every episode.

    Files are written to a ``Movies`` directory located in the parent of the
    directory that contains this file; the directory is created if needed.
    """
    root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
    print(root_dir)

    # NOTE(review): this search keyword looks machine-translated; the original
    # post presumably searched for the show's Chinese title - confirm.
    download_url = self.get_tv_url("blood of infected season")  # episode URLs

    # Create the target directory once; exist_ok avoids the check-then-create race.
    file_path = os.path.join(root_dir, "Movies")
    os.makedirs(file_path, exist_ok=True)

    for t in download_url:
        # The text after the last '/' of the link is used as the file name.
        name = t.split('/')[-1]
        print("Downloading [{}]".format(name))
        self.download_file(t, os.path.join(file_path, name))
Finally, the complete code is as follows:
# coding: utf-8
"""
author: hmk
describe: crawler for the 80s movie site (www.y80s.com)
create_time: 2019/01/18
"""

import os
import re

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from tqdm import tqdm


class DownloadTV:
    """Search www.y80s.com for a TV series and download its episodes."""

    @staticmethod
    def get_html(url, data=None, header=None, method=None):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request.
            data: Query parameters for a GET request, form data otherwise.
            header: Optional HTTP headers.
            method: ``"get"`` issues a GET request; any other value issues a POST.

        Returns:
            The decoded response text, or None when the request fails or the
            status code is not 200.
        """
        # The request itself must sit inside the try block, otherwise a
        # RequestException can never reach the handler.
        try:
            if method == "get":
                response = requests.get(url, params=data, headers=header)
            else:
                response = requests.post(url, data=data, headers=header)
        except RequestException:
            print("request failed")
            return None

        if response.status_code != 200:
            return None
        # Let requests guess the encoding from the body before decoding it.
        response.encoding = response.apparent_encoding
        return response.text

    def get_tv_id(self, tv_name):
        """Return the site id of the series named ``tv_name``.

        Returns:
            The first run of digits found in the href of the first matching
            search result, as a string, or None when nothing matches.
        """
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        # NOTE(review): the field names and the "Search" value were mangled by
        # machine translation in the post; presumably the site expects the
        # original (Chinese) button text - confirm against the search form.
        data = {
            "search_typeid": "1",
            "skey": tv_name,  # the series name being searched for
            "Input": "Search",
        }
        url = "http://www.y80s.com/movie/search/"  # search endpoint

        html = self.get_html(url, data, headers, "post")
        if html is None:
            return None

        soup = BeautifulSoup(html, "html.parser")
        # All <a> tags whose title attribute equals the series name.
        name_label = soup.find_all("a", title=tv_name)
        if not name_label:
            return None

        # Extract the numeric part of the link; a missing href counts as no match.
        href_value = re.search(r'\d+', name_label[0].get('href') or '')
        if href_value is None:
            return None

        tv_id = href_value.group()
        print("query television drama corresponding id is: {}".format(tv_id))
        return tv_id

    def get_tv_url(self, tv_name):
        """Return the list of episode download URLs for ``tv_name``.

        Returns:
            A list of href strings; empty when the series or its page cannot
            be found.
        """
        tv_id = self.get_tv_id(tv_name)
        if tv_id is None:
            # Nothing found: avoid concatenating None into the URL.
            return []

        # NOTE(review): the path segment was mangled in the post ("/ Ju /");
        # lower-case "ju" is assumed - confirm.
        url = "http://www.y80s.com/ju/" + tv_id

        # The method must be the lower-case "get" that get_html compares against.
        html = self.get_html(url, method="get")
        if html is None:
            return []

        soup = BeautifulSoup(html, "html.parser")
        # NOTE(review): "Local Download" looks machine-translated; the page is
        # presumably in Chinese, so the real title attribute value needs to be
        # confirmed against the site.
        a_tv_url = soup.find_all('a', title="Local Download")
        # Keep only tags that actually carry an href.
        tv_url = [t.get('href') for t in a_tv_url if t.get('href')]
        print(tv_url)
        return tv_url

    @staticmethod
    def download_file(url, name):
        """Download ``url`` to the local file ``name``, showing a tqdm progress bar.

        Args:
            url: Direct download link of the file.
            name: Destination path; it is opened in binary write mode.

        Returns:
            None. Request failures (including HTTP error statuses) are
            reported on stdout instead of being raised. A response that
            declares a ``Content-Length`` of 0 is skipped and no file is
            created.
        """
        try:
            # The ``with`` block releases the streamed connection even if writing fails.
            with requests.get(url=url, stream=True) as response:
                # Do not save an HTTP error page as if it were the requested file.
                response.raise_for_status()

                # File size in KiB, taken from the response header. The header
                # is optional, so the size stays None (unknown) when absent.
                length = response.headers.get('Content-Length')
                content_size = int(length) / 1024 if length is not None else None
                if content_size == 0:
                    return

                with open(name, "wb") as f:
                    if content_size is not None:
                        print("total: ", content_size, 'k')
                    # Each chunk is at most 1 KiB, so one iteration is roughly
                    # one 'k'; total=None makes tqdm show a plain counter.
                    for data in tqdm(iterable=response.iter_content(chunk_size=1024),
                                     total=content_size, unit='k'):
                        f.write(data)
                print("\n done " + name)
        except RequestException as e:
            # Use % formatting so the exception is actually rendered into the message.
            print("接口错误信息为 %r" % e)

    def main(self):
        """Look up a show's episode links and download every episode.

        Files are written to a ``Movies`` directory located in the parent of
        the directory that contains this file; it is created if needed.
        """
        root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
        print(root_dir)

        # NOTE(review): this search keyword looks machine-translated; the
        # original post presumably searched for the show's Chinese title - confirm.
        download_url = self.get_tv_url("blood of infected season")  # episode URLs

        # Create the target directory once; exist_ok avoids the check-then-create race.
        file_path = os.path.join(root_dir, "Movies")
        os.makedirs(file_path, exist_ok=True)

        for t in download_url:
            # The text after the last '/' of the link is used as the file name.
            name = t.split('/')[-1]
            print("Downloading [{}]".format(name))
            self.download_file(t, os.path.join(file_path, name))


if __name__ == '__main__':
    test = DownloadTV()
    test.main()
Let's run it: