Python爬取喜马拉雅有声书

# 导入第三方库

import requests
import parsel

# Request headers sent with every HTTP call so the crawler presents itself
# as a desktop Chrome browser.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}


def download_media(media_url, media_name):
    """Download one media file and save it as ``<media_name>.mp4``.

    Args:
        media_url: Direct URL of the media file to fetch.
        media_name: Output file name without extension; the file is created
            relative to the current working directory.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be created or written.
    """
    # stream=True avoids holding the whole file in memory, and the timeout
    # keeps a stalled connection from hanging the crawl forever.
    with requests.get(media_url, headers=headers, stream=True, timeout=30) as response:
        # Check the status before creating the file so that an error page is
        # never saved to disk as if it were media.
        response.raise_for_status()
        with open(f'{media_name}.mp4', mode="wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)


def media_api(track_id):
    """Look up the download address of a track through the site's play API.

    Args:
        track_id: Track identifier inserted into the API query string.

    Returns:
        The value stored under ``data["data"]["src"]`` in the JSON response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks the ``data`` / ``src`` keys.
    """
    api_url = f"https://www.ximalaya.com/revision/play/v1/audio?id={track_id}&ptype=1"
    # The timeout prevents an unresponsive API call from blocking the crawl.
    response = requests.get(api_url, headers=headers, timeout=30)
    # Surface HTTP errors directly instead of failing later while parsing
    # an error page as JSON.
    response.raise_for_status()
    return response.json()["data"]["src"]


def get_total_page(page_url):
    """Yield the tracks listed on one album page.

    This is a generator, so the page is only requested once iteration starts.

    Args:
        page_url: URL of the album listing page to fetch.

    Yields:
        ``(track_id, title)`` tuples for at most the first 30 links matched
        by ``.sound-list ul li a``. ``track_id`` is the last ``/``-separated
        segment of the link's ``href``; ``title`` is the link's ``title``
        attribute, or ``None`` when the attribute is absent.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(page_url, headers=headers, timeout=30)
    response.raise_for_status()
    selector = parsel.Selector(response.text)
    for link in selector.css(".sound-list ul li a")[:30]:
        href = link.css("a::attr(href)").extract_first()
        if not href:
            # An anchor without an href carries no track id; skip it rather
            # than crash on None.split().
            continue
        title = link.css("a::attr(title)").extract_first()
        yield href.split("/")[-1], title


if __name__ == '__main__':
    # Main program: walk album pages 1 through 23 and download every track
    # listed on each page.
    for page in range(1, 24):
        medias = get_total_page(f"https://www.ximalaya.com/youshengshu/20642967/p{page}")
        for media_id, media_name in medias:
            media_url = media_api(media_id)
            if not media_url:
                # The API returned no address for this track (presumably it
                # is not freely playable -- unconfirmed). Skip it so that one
                # track does not abort the whole crawl.
                print(f"skipped {media_name}: no download address")
                continue
            download_media(media_url, media_name)

猜你喜欢

转载自www.cnblogs.com/panda009079/p/12345085.html
今日推荐