python爬虫80行代码拿下喜马拉雅有声书

声明：本文仅供学习爱好者参考，请勿用于商业用途或恶意攻击源网站，本文所有解释权归作者所有。
本文没有使用爬虫框架，仅用了三个 Python 常用库（requests、lxml、os）。
本文适合新手参考，代码中有大量注释，便于理解。

# 爬喜马拉雅
import requests
from lxml import etree
import os
# Request headers sent with every HTTP call in this script: a desktop-browser
# User-Agent plus a Referer pointing at the site root.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
    "Referer": "https://www.ximalaya.com/",
}

# Start URL: page 1 of the category listing (the site normally omits "p1").
# URL layout example:     <category>/<album id>/<track id>
#   https://www.ximalaya.com/youshengshu/14968275/83332135
# To crawl a different category, change the URL below.
url = "https://www.ximalaya.com/youshengshu/p1/"
# 列表页函数
def listing(url):
    """Download every track listed on an album page, then follow pagination.

    The page at *url* is fetched and parsed for the track links, the track
    names and the album title. Each track is downloaded through the external
    mirror ``https://link.hhtjim.com/ximalaya/<track id>.mp3`` and written to
    ``./<album title>/<track name>.mp3``. If the page has a "next page" link,
    the function calls itself on that link.

    Args:
        url: Absolute URL of one album listing page.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Fetch the listing page and build a parse tree from it.
    response = requests.get(url=url, headers=headers).text
    etrees = etree.HTML(response)

    # Track page links on this page.
    page_listing_url = etrees.xpath('//div[@class="sound-list _yo5_"]/ul/li/div[@class="text _yo5_"]/a/@href')
    # Track display names on this page.
    page_listing_name = etrees.xpath('//div[@id="anchor_sound_list"]/div/ul/li/div[@class="text _yo5_"]/a/span/text()')
    # Album title; used as the output directory name.
    titles = etrees.xpath('//div[@class="info _J460"]/h1/text()')
    if not titles:
        # Without a title there is no target directory; skip this page
        # instead of failing with IndexError on an unexpected page layout.
        print("no album title found, skipping %s" % url)
        return
    all_name = titles[0]

    file_path = "./%s/" % all_name
    # Create the directory when missing, but download regardless: later pages
    # of the same album arrive here after page 1 has already created it.
    os.makedirs(file_path, exist_ok=True)

    # Links and names are pairwise aligned; zip stops at the shorter list.
    for j, v in zip(page_listing_url, page_listing_name):
        target = file_path + v + ".mp3"
        if os.path.exists(target):
            # Already fetched on an earlier run; avoid downloading it again.
            continue
        # The last path segment of the track link is the track id.
        song_id = j.split("/")[-1]
        # Tracks are fetched from the external mirror, not the detail page.
        track_url = "https://link.hhtjim.com/ximalaya/" + song_id + ".mp3"
        content = linking(track_url)
        with open(target, "wb") as f:
            f.write(content)
        print("%s，%s下载成功"%(all_name,v))

    # Follow the "next page" link of this album, if there is one.
    next_url = etrees.xpath('//div[@class="pagination _yo5_"]/nav/ul/li[@class="page-next page-item _dN2"]/a/@href')
    if next_url:
        listing("https://www.ximalaya.com" + next_url[0])
# 外链页函数
def linking(url):
    """Fetch *url* with the shared request headers and return the raw body bytes."""
    reply = requests.get(url=url, headers=headers)
    return reply.content
# 分类首页函数
def category_index(url):
    """Walk the category index pages and hand every album to ``listing``.

    Starting at *url*, each index page is fetched, the album links on it are
    printed and passed one by one to ``listing`` as absolute URLs, and the
    walk continues with the page's "next page" link until none is found.

    Args:
        url: Absolute URL of the first category index page to process.

    Returns:
        None.
    """
    site_root = "https://www.ximalaya.com"
    page_url = url
    while True:
        # Fetch the current index page and parse it.
        html = requests.get(url=page_url, headers=headers).text
        tree = etree.HTML(html)

        # Relative album links on this page, e.g. /youshengshu/25407248/
        page_url_list = tree.xpath('//div[@class="content"]/ul/li/div/a[@class="album-title line-1 lg bold _qie"]/@href')
        print(page_url_list)

        # Process every album on this page before moving to the next page.
        for album_href in page_url_list:
            listing(site_root + album_href)

        # Relative link to the next index page, e.g. /youshengshu/p2/
        next_links = tree.xpath('//div[@class="pagination-wrap"]/nav/ul/li[@class="page-next page-item _dN2"]/a/@href')
        if not next_links:
            break
        page_url = site_root + next_links[0]
# Entry point: start the crawl from the category index page stored in `url`.
category_index(url)

Python 键盘上的舞者

发布了30 篇原创文章 · 获赞 5 · 访问量 3332

私信关注

python爬虫80行代码拿下喜马拉雅有声书

猜你喜欢