python爬虫-喜马拉雅_晚安妈妈睡前故事

这里先说下思路:

1、首先要获取当前书的音频信息

        '''获取当前书的音频信息'''
        # Collect the playback URL and title of every track listed on the album pages.
        all_list = []
        for url in self.book_url:
            r = requests.get(url, headers=self.headers)
            ret = r.content.decode()
            # ret is the response body fetched by requests; it is JSON text
            pyhton_dict = json.loads(ret)  # json.loads(ret) turns the JSON string into a Python dict
            # print(pyhton_dict)
            pythonData = pyhton_dict['data']['tracksAudioPlay']
            # print(pythonData)
            for book in pythonData:
                # Pull out the playback URL and the name of each track
                # NOTE(review): the local name `list` shadows the builtin; it actually holds a dict
                list = {}
                list['src'] = book['src']
                list['name'] = book['trackName']
                print(list)
                all_list.append(list)
        return all_list  # info for every track, as a plain list of dicts

2、然后遍历保存

  # Download every track in all_list and write it to a .m4a file under D:\xima\.
  for i in all_list:
            # Handle one track per iteration: clean its name, download it, save it
            print(i)
            i['name'] = re.sub('"', '', i['name'])  # remove every double quote from the name (e.g. a title ending in ")
            with open('D:\\xima\\{}.m4a'.format(self.name + i['name']), 'ab') as f:  # 'wb' would overwrite existing data; 'ab' appends to it instead
                r = requests.get(i['src'], headers=self.headers)
                ret = r.content
                f.write(ret)
        print("下载完毕")

3、最后直接上代码啦!

import json
import os
import re

import requests
from lxml import etree

class Xima(object):
    """Download the audio tracks of a Ximalaya album as ``.m4a`` files.

    The album's paginated JSON listing is fetched page by page, the playback
    URL and title of every track are collected, and each track is then
    downloaded into ``save_dir``.

    Args:
        name: Prefix prepended to every saved file name.
        album_id: Value substituted for ``albumId`` in the listing URL.
        pages: Number of listing pages to request (page numbers start at 1).
        save_dir: Directory the ``.m4a`` files are written to; created on
            demand by :meth:`save`.
        timeout: Timeout in seconds passed to every HTTP request.
    """

    # Characters Windows does not allow in file names; stripped from track titles.
    _INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, name, album_id=260744, pages=30, save_dir='D:\\xima', timeout=30):
        self.name = name
        self.save_dir = save_dir
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
        }
        # The remaining "{}" is the page-number placeholder filled in below.
        self.start_url = (
            "https://www.ximalaya.com/revision/play/album"
            "?albumId=%s&pageNum={}&sort=-1&pageSize=30" % album_id
        )
        self.book_url = [self.start_url.format(page) for page in range(1, pages + 1)]
        print(self.book_url)
        print(len(self.book_url))

    @staticmethod
    def _extract_tracks(payload):
        """Return ``[{'src': ..., 'name': ...}, ...]`` from one decoded listing page."""
        # `or []` tolerates a page whose track list is null or empty.
        tracks = payload['data']['tracksAudioPlay'] or []
        return [{'src': book['src'], 'name': book['trackName']} for book in tracks]

    @classmethod
    def _clean_name(cls, title):
        """Return *title* with characters that are illegal in Windows file names removed."""
        return cls._INVALID_FILENAME_CHARS.sub('', title)

    def get_book_msg(self):
        """Fetch every listing page and collect the track information.

        Returns:
            A list of dicts, one per track, with the keys ``'src'`` (playback
            URL) and ``'name'`` (track title).

        Raises:
            requests.HTTPError: If a listing page answers with an error status.
        """
        all_list = []
        for url in self.book_url:
            r = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Fail loudly instead of trying to parse an error page as JSON.
            r.raise_for_status()
            tracks = self._extract_tracks(r.json())
            for track in tracks:
                print(track)
            all_list.extend(tracks)
        return all_list

    def save(self, all_list):
        """Download every track in *all_list* into ``self.save_dir``.

        Args:
            all_list: Track dicts as returned by :meth:`get_book_msg`.

        Raises:
            requests.HTTPError: If a track download answers with an error status.
        """
        print("开始下载")
        os.makedirs(self.save_dir, exist_ok=True)
        for i in all_list:
            print(i)
            if not i.get('src'):
                # Nothing to download for an entry without a playback URL.
                print("跳过(没有播放地址): {}".format(i.get('name')))
                continue
            file_name = '{}.m4a'.format(self.name + self._clean_name(i['name']))
            path = os.path.join(self.save_dir, file_name)
            # Download first so a failed request does not leave an empty file behind.
            r = requests.get(i['src'], headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
            # 'wb', not 'ab': appending on a re-run would duplicate the audio
            # bytes inside an existing file and corrupt it.
            with open(path, 'wb') as f:
                f.write(r.content)
        print("下载完毕")

    def run(self):
        """Collect the track list and download every track."""
        all_list = self.get_book_msg()
        self.save(all_list)

# Script entry point: download the whole album when the file is run directly.
if __name__ == "__main__":
    Xima('晚安妈妈睡前故事').run()

猜你喜欢

转载自www.cnblogs.com/lixy-88428977/p/9366913.html