"""Download every audio track of a ximalaya.com album.

The script walks the album's paging links, collects the ``sound_id`` of every
track listed on each page, looks up each track's JSON metadata and saves the
audio stream referenced by its ``play_path`` field into a local folder.
"""
import json
import os

import bs4
import requests
from lxml import etree  # NOTE(review): not used by the code in this file.

# Simulate real browser headers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

BASE_URL = "http://www.ximalaya.com"
DEFAULT_FOLDER = "Guo Degang cross talk"  # custom folder name
# Seconds to wait for the server before giving up, so that a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def get_album(url):
    """Download every page of the album located at ``url``.

    The album page is fetched once to discover its paging links; each page is
    then handed to :func:`get_url`. When the page exposes no paging links at
    all, ``url`` itself is scraped as the only page.

    Args:
        url: Address of the album's first page.
    """
    res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # pagingBar_page is the page number identifier viewed in developer mode
    elems = soup.select('a[class^="pagingBar_page"]')
    # The paging bar also carries a "next page" link that is not a page of
    # its own.
    # NOTE(review): the live site may label this link in Chinese; confirm the
    # literal below matches what the page really contains.
    pages = [elem for elem in elems if elem.text != "next page"]

    if not pages:
        # No paging bar found: treat the given URL as the single page instead
        # of silently downloading nothing.
        print("There are {} pages in this channel".format(1))
        print("Downloading the {}/{}th page".format(1, 1))
        get_url(url)
        return

    total = len(pages)
    print("There are {} pages in this channel".format(total))
    for page in pages:
        print("Downloading the {}/{}th page".format(page.text, total))
        # Page 1 is the album URL itself; other pages use their link target.
        if page.text == "1":
            page_url = url
        else:
            page_url = BASE_URL + page.attrs["href"]
        get_url(page_url)


def get_url(url):
    """Download every track listed on one album page.

    Each ``<li sound_id=...>`` element on the page identifies a track. Its
    metadata is fetched from ``/tracks/<sound_id>.json`` and the audio
    referenced by the ``play_path`` field is saved via :func:`get_m4a`.
    A failure on one track is reported and does not stop the others.

    Args:
        url: Address of the album page to scrape.
    """
    res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # sound_id is the track identifier viewed in developer mode
    elems = soup.select('li[sound_id]')
    total = len(elems)
    for index, elem in enumerate(elems, start=1):
        sound_id = elem.attrs["sound_id"]
        title = None  # stays None when the metadata could not be read
        try:
            murl = BASE_URL + '/tracks/{}.json'.format(sound_id)
            html = requests.get(
                murl, headers=header, timeout=REQUEST_TIMEOUT
            ).text
            dic = json.loads(html)
            title = dic["title"]
            print("Downloading the first {}/{} file, file name {}:{}.".format(
                index, total, sound_id, title))
            get_m4a(dic["play_path"], sound_id)
        except (requests.RequestException, OSError, KeyError,
                TypeError, ValueError) as exc:
            # Best effort: report the failing track and carry on with the
            # rest. ValueError covers malformed JSON.
            print("Failed to download {}/{} file, filename {}:{}.".format(
                index, total, sound_id, title))
            print("    reason: {!r}".format(exc))


def get_m4a(url, id, folder=DEFAULT_FOLDER):
    """Save the audio stream at ``url`` as ``<folder>/<id>``.

    The target folder is created when missing. The file is named after the
    base name of ``id`` with no extension added.

    Args:
        url: Direct address of the audio file.
        id: Track identifier used as the file name.
        folder: Directory that receives the file.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an HTTP error status.
        OSError: If the folder or file cannot be written.
    """
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, os.path.basename(id))
    # stream=True keeps the whole audio file from being buffered in memory
    # before it is written out chunk by chunk.
    res = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
    try:
        # Do not store an HTTP error page on disk as if it were audio.
        res.raise_for_status()
        with open(target, 'wb') as file:
            for chunk in res.iter_content(100000):
                file.write(chunk)
    finally:
        res.close()


if __name__ == '__main__':
    url = "http://www.ximalaya.com/1000202/album/2667276/"  # Album address
    get_album(url)