# 喜马拉雅音频爬取(仅供参考学习)

import requests
from lxml import etree
from urllib import request
import os

# Browser-like User-Agent header sent with every request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
# Album page whose track links are scraped.
url = 'https://www.ximalaya.com/lishi/4164479/'
# Without a timeout requests waits indefinitely on a stalled connection;
# raise_for_status() stops the script on an HTTP error instead of silently
# parsing an error page and ending up with an empty link list.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html_str = response.text

html_ele = etree.HTML(html_str)

# href attributes of the per-track links; the last path segment of each href
# is used further down as the track id.
href_list = html_ele.xpath('//ul[@class="dOi2"]/li/div[2]/a/@href')
# exist_ok=True replaces the check-then-create sequence, which could fail if
# the directory appeared between the check and the mkdir call.
os.makedirs('mjx', exist_ok=True)

# Translation table that replaces characters which are illegal in file names
# (path separators and the Windows-reserved set) with an underscore.
_invalid_filename_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

# For every scraped link: look up the track's metadata, then download its
# audio file into the 'mjx' directory. A failure on one track is reported and
# skipped so that the remaining tracks are still downloaded.
for href in href_list:

    # Strip a trailing '/' first, otherwise the last segment would be empty.
    next_href = href.rstrip('/').split('/')[-1]
    if not next_href:
        continue

    xiangqing_url = 'https://www.ximalaya.com/revision/play/tracks?trackIds=' + str(next_href)
    print(xiangqing_url)
    try:
        response = requests.get(xiangqing_url, headers=headers, timeout=10)
        response.raise_for_status()
        json_dict = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON body on older requests versions.
        print('skipping track %s: %s' % (next_href, exc))
        continue

    data = json_dict.get('data') if isinstance(json_dict, dict) else None
    tracks = (data or {}).get('tracksForAudioPlay') or []
    if not tracks:
        print('skipping track %s: no playable track in response' % next_href)
        continue

    src_str = tracks[0].get('src')
    if not src_str:
        # The API can return an entry without a media URL; urlretrieve would
        # crash on None.
        print('skipping track %s: response has no audio URL' % next_href)
        continue

    # Fall back to the track id when the name is missing, and make the name
    # safe to use as a file name.
    trackName = str(tracks[0].get('trackName') or next_href)
    trackName = trackName.translate(_invalid_filename_chars).strip()

    try:
        request.urlretrieve(src_str, os.path.join('mjx', trackName + '.m4a'))
    except OSError as exc:
        # urllib's URLError / ContentTooShortError are OSError subclasses.
        print('failed to download track %s: %s' % (next_href, exc))



# 猜你喜欢

# 转载自blog.csdn.net/majiexiong/article/details/81949388