Ximalaya (xmly) web crawler example

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time: 2020/3/25 19:09
# @Author: Meng
# @File: paxmly.py
# @Software: PyCharm
. 7  
import requests  # sends HTTP requests like a browser would, used to fetch the pages
import parsel  # parses the fetched HTML
# Request headers, as a dict, that make the crawler look like a regular browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
 13 is Response = requests.get (URL = ' HTTPS: //www.ximalaya. COM / youshengshu / 34,013,148 / P2 / ' , headers = headers) # crawling pages specified playlist 
14 SEL = parsel.Selector (response.text)   # parsing web 
15 sound_list = sel.css ( ' .sound list UL-Li a ' ) # Get a page 30 linked 
16 list_all = []   # stored list of all results 
. 17  for Sound in sound_list [0:30]: # slice
18 is      # Print (Sound) 
. 19      MEDIA_URL sound.css = ( ' A :: attr (the href) ' ) .extract_first () # remove link 
20 is      # Print (MEDIA_URL) 
21 is      MEDIA_URL media_url.split = ( ' / ' ) [-. 1 ]   # remove the audio ID 
22 is      # Print (MEDIA_URL) 
23 is      Media_name sound.css = ( " a :: attr (title) " ) .extract_first ()   # removed name audio 
24      # Print (Media_name) 
25      list_all.append ((MEDIA_URL , Media_name)) # append to the list 
26 # Print (list_all) 
# Look up the real audio link for a track, wrapped in a function.
def media_api(track_id):
    """Return the real audio URL of a track.

    Queries the Ximalaya play API for ``track_id`` and returns the value
    stored under ``data`` -> ``src`` in the JSON response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    api_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
    response = requests.get(api_url, headers=headers, timeout=10)
    response.raise_for_status()
    # print(response.json())
    # The JSON body decodes to a dict; index into it with [].
    data_json = response.json()
    src = data_json['data']['src']
    return src
# Downloader: saves the audio as an .mp4 file.
def download_meida(media_url, media_name):
    """Download ``media_url`` and save it as ``<media_name>.mp4``.

    The file is written relative to the current working directory.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never saved as if it were audio.
    """
    response = requests.get(media_url, headers=headers, timeout=10)
    response.raise_for_status()
    with open(f'{media_name}.mp4', mode='wb') as f:  # the track title is the file name
        f.write(response.content)
 41 is  IF  the __name__ == ' __main__ ' :
 42 is      for URL in list_all:
 43 is          Print (URL [. 1 ])
 44 is         = media_api true_url (URL [0])   # representative of our audio id, a unique identifier 
45          name URL = [. 1 ]
 46 is          download_meida (true_url, name)

 

 1 #能发送http请求的库
 2 import requests
 3 import parsel
 4 headers = {
 5 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
 6 }
 7 
 8 def download_meida(media_url,media_name):
 9     response = requests.get(media_url,headers = headers);
10     with open(f'{media_name}.mp4',mode='wb') as f:
11         f.write(response.content)
12 
def media_api(track_id):
    """Return the real audio URL of a track.

    Queries the Ximalaya play API for ``track_id`` and returns the value
    stored under ``data`` -> ``src`` in the JSON response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    api_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
    response = requests.get(api_url, headers=headers, timeout=10)
    response.raise_for_status()
    # print(response.json())
    # The JSON body decodes to a dict; index into it with [].
    data_json = response.json()
    src = data_json['data']['src']
    return src
21 is  
# print(media_api(98791745))
23 is  
def get_total_page(page_url):
    """Yield ``(track id, title)`` pairs for the tracks listed on one page.

    Fetches ``page_url``, selects the ``a`` tags under the ``ul``/``li``
    elements of the ``.sound-list`` element, and yields one pair per link
    for at most the first 30 links.

    Raises:
        requests.HTTPError: if the page request answers with an error status.
    """
    # Request the page.
    response = requests.get(page_url, headers=headers, timeout=10)
    response.raise_for_status()
    # Parse the HTML content of the page.
    sel = parsel.Selector(response.text)
    # CSS selector: a tags inside ul > li under the element with class sound-list.
    sound_list = sel.css('.sound-list ul li a')
    # Only the first 30 links on the page are taken.
    for sound in sound_list[:30]:
        # extract_first() pulls the text out of the selector object;
        # here it reads the href attribute of the a tag.
        media_url = sound.css('a::attr(href)').extract_first()
        if media_url is None:
            # An anchor without href carries no track id; skip it rather than crash.
            continue
        # e.g. /youshengshu/16411402/98791745 -- keep only the trailing id.
        media_url = media_url.split('/')[-1]
        # Read the title attribute of the a tag.
        media_name = sound.css('a::attr(title)').extract_first()
        # Hand the pair back to the caller one at a time.
        yield media_url, media_name
 42 is  
if __name__ == '__main__':
    # Download page by page; the range selects which pages are fetched.
    for page in range(1, 3):
        meidas = get_total_page(f'https://www.ximalaya.com/yinyue/23841180/p{page}')
        for media_id, media_name in meidas:
            # print(media_id, media_name)
            media_url = media_api(media_id)
            print(media_url)
            download_meida(media_url, media_name)

 

Guess you like

Origin www.cnblogs.com/987m/p/12591417.html