#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time    : 2020/3/25 19:09
# @Author  : Meng
# @File    : paxmly.py
# @Software: PyCharm
"""Download the audio tracks linked from one Ximalaya album page."""

import re

import requests  # HTTP client; fetches pages the way a browser would
import parsel  # HTML parsing via CSS selectors

# Request headers that make the crawler look like a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Seconds to wait on any single request; without a timeout a stalled
# connection would hang the script forever.
REQUEST_TIMEOUT = 10

# Crawl the playlist page and collect (track_id, title) pairs.
response = requests.get(
    url='https://www.ximalaya.com/youshengshu/34013148/p2/',
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
response.raise_for_status()
sel = parsel.Selector(response.text)
sound_list = sel.css('.sound-list ul li a')
list_all = []  # every (track_id, title) found on the page
for sound in sound_list[0:30]:  # only the first 30 links are kept
    href = sound.css('a::attr(href)').extract_first()
    if not href:
        # An anchor without an href carries no track id; skip it rather
        # than crash on None.split().
        continue
    media_id = href.rstrip('/').split('/')[-1]  # last path segment is the audio id
    media_name = sound.css('a::attr(title)').extract_first() or media_id
    list_all.append((media_id, media_name))


def media_api(track_id):
    """Return the real audio URL for *track_id*.

    Queries the play endpoint and returns ``data['src']`` from its JSON
    reply. The value is returned as-is, so it may be empty/None if the
    endpoint supplies no source (presumably restricted tracks -- verify).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        KeyError: if the reply lacks the ``data``/``src`` keys.
    """
    api_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
    api_response = requests.get(api_url, headers=headers, timeout=REQUEST_TIMEOUT)
    api_response.raise_for_status()
    data_json = api_response.json()  # JSON object -> dict, indexed with []
    return data_json['data']['src']


def download_meida(media_url, media_name):
    """Download *media_url* into ``<media_name>.mp4`` in the working directory.

    Characters that are illegal in file names on common platforms are
    replaced with ``_`` so a title such as ``a/b`` cannot break ``open``.

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(media_name)).strip() or 'untitled'
    # Stream to disk in chunks instead of buffering the whole file in memory.
    with requests.get(media_url, headers=headers, timeout=REQUEST_TIMEOUT,
                      stream=True) as media_response:
        media_response.raise_for_status()
        with open(f'{safe_name}.mp4', mode='wb') as f:
            for chunk in media_response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)


if __name__ == '__main__':
    for track_id, name in list_all:
        print(name)
        true_url = media_api(track_id)  # track_id uniquely identifies the audio
        if not true_url:
            # No playable source returned; nothing to download.
            print(f'skipped (no source): {name}')
            continue
        download_meida(true_url, name)
# Library that can send HTTP requests
import re

import requests
import parsel

# Request headers that make the crawler look like a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

# Seconds to wait on any single request; without a timeout a stalled
# connection would hang the script forever.
REQUEST_TIMEOUT = 10


def download_meida(media_url, media_name):
    """Download *media_url* into ``<media_name>.mp4`` in the working directory.

    Characters that are illegal in file names on common platforms are
    replaced with ``_`` so a title such as ``a/b`` cannot break ``open``.

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(media_name)).strip() or 'untitled'
    # Stream to disk in chunks instead of buffering the whole file in memory.
    with requests.get(media_url, headers=headers, timeout=REQUEST_TIMEOUT,
                      stream=True) as response:
        response.raise_for_status()
        with open(f'{safe_name}.mp4', mode='wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)


def media_api(track_id):
    """Return the real audio URL for *track_id*.

    Queries the play endpoint and returns ``data['src']`` from its JSON
    reply. The value is returned as-is, so it may be empty/None if the
    endpoint supplies no source (presumably restricted tracks -- verify).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        KeyError: if the reply lacks the ``data``/``src`` keys.
    """
    api_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1'
    response = requests.get(api_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # The JSON reply is a dict; extract values with [].
    data_json = response.json()
    return data_json['data']['src']


def get_total_page(page_url):
    """Yield ``(track_id, title)`` for the tracks linked from *page_url*.

    Only the first 30 matching links are considered. Links without an
    ``href`` are skipped; a missing title falls back to the track id.

    Raises:
        requests.HTTPError: if the page answers with an error status.
    """
    # Request the page and parse the returned HTML.
    response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    sel = parsel.Selector(response.text)
    # <a> tags under ul > li inside the element with class "sound-list".
    sound_list = sel.css('.sound-list ul li a')
    for sound in sound_list[:30]:
        # extract_first() pulls the text out of the selector result.
        href = sound.css('a::attr(href)').extract_first()
        if not href:
            continue
        # e.g. /youshengshu/16411402/98791745 -- keep only the trailing id.
        media_id = href.rstrip('/').split('/')[-1]
        media_name = sound.css('a::attr(title)').extract_first() or media_id
        yield media_id, media_name


if __name__ == '__main__':
    # The range bounds select which album pages are downloaded.
    for page in range(1, 3):
        meidas = get_total_page(f'https://www.ximalaya.com/yinyue/23841180/p{page}')
        for media_id, media_name in meidas:
            media_url = media_api(media_id)
            print(media_url)
            if not media_url:
                # No playable source returned; nothing to download.
                continue
            download_meida(media_url, media_name)