Python爬虫--喜马拉雅音频爬取

爬取喜马拉雅三国中的前十章音频:

#导入requests模块
import requests
#导入正则表达式
import re
# Send a browser-like User-Agent with every request to get past the site's
# basic anti-scraping check.
header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}

# IDs of the first ten chapters, taken from the album page's source.
sound_ids = ( 64686514, 64689648, 64695831, 64695832, 3218935, 3822581, 3419626, 3513844, 3593277, 3773655)

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30

# Captures the track ID and its 64 kbps audio URL from the track's JSON text.
# Compiled once here instead of on every loop iteration.
_TRACK_RE = re.compile(r'"id":(.*?),"play_path_64":"(.*?)"')


def get_find_url(text):
    """Return the ``(track_id, audio_url)`` pairs found in *text*.

    Args:
        text: Body of a track's ``.json`` response, as a string.

    Returns:
        A list of 2-tuples of strings, one per regex match; empty when
        nothing matches.
    """
    return _TRACK_RE.findall(text)


def main():
    """Download every chapter in ``sound_ids`` to the current directory.

    Each chapter is saved as ``第<n>节<suffix>``, where ``n`` is its 1-based
    position in ``sound_ids`` and ``suffix`` is the last four characters of
    the audio URL. Chapters whose metadata or audio request returns a
    non-success HTTP status are reported and skipped.
    """
    # One pass over the IDs: the original nested a second, unused loop over
    # sound_ids here, which repeated every request ten times.
    for number, sound_id in enumerate(sound_ids, start=1):
        # Per-track metadata endpoint.
        url = 'http://www.ximalaya.com/tracks/' + str(sound_id) + '.json'
        page = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        if not page.ok:
            print('<第', number, '节获取失败> ', url)
            continue

        for _track_id, audio_url in get_find_url(page.text):
            print('<正在下载第', number, '节> ', audio_url)
            audio = requests.get(audio_url, headers=header, timeout=REQUEST_TIMEOUT)
            if not audio.ok:
                # Do not save an error page as if it were audio.
                print('<第', number, '节下载失败> ', audio_url)
                continue
            # The last four characters of the URL (e.g. ".m4a") are used as
            # the file extension.
            suffix = audio_url[-4:]
            with open('第' + str(number) + '节' + suffix, 'wb') as f:
                f.write(audio.content)


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/Botree_chan/article/details/79513444