Python web scraping

import json
import os

import bs4
import requests
from lxml import etree

# Request headers that present the scraper as a desktop Chrome browser
header = {
    'User-Agent': ' '.join((
        'Mozilla/5.0 (Windows NT 10.0; WOW64)',
        'AppleWebKit/537.36',
        '(KHTML, like Gecko)',
        'Chrome/50.0.2661.102',
        'Safari/537.36',
    )),
}

# Walk every page of an album and download the tracks listed on each page
def get_album(url):
    """Download all tracks of the album at *url*, page by page.

    Fetches the album page, collects its pagination links and calls
    ``get_url`` once per page. An album whose page has no pagination
    links is treated as a single page, so its tracks are still downloaded.

    Args:
        url: Address of the album's first page.

    Raises:
        requests.RequestException: If the album page cannot be fetched
            (connection error, timeout, or HTTP error status).
    """
    # Without a timeout a stalled connection would block forever
    res = requests.get(url, headers=header, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    # Pagination links have a class starting with "pagingBar_page". The
    # "next page" link shares that class but is not a page of its own.
    # NOTE(review): the label compared below must match the link text the
    # site actually serves - confirm against the live page.
    pages = [link for link in soup.select('a[class^="pagingBar_page"]')
             if link.text != "next page"]

    if not pages:
        # No pagination bar: the album fits on the page we already have
        print("There are {} pages in this channel".format(1))
        print("Downloading the {}/{}th page".format(1, 1))
        get_url(url)
        return

    total = len(pages)
    print("There are {} pages in this channel".format(total))
    for page in pages:
        print("Downloading the {}/{}th page".format(page.text, total))

        # Page 1 is the album address itself; other pages link by site-relative href
        if page.text == "1":
            page_url = url
        else:
            page_url = "http://www.ximalaya.com" + page.attrs["href"]

        get_url(page_url)

def get_url(url):
    """Download every track listed on a single album page.

    Each ``<li>`` element with a ``sound_id`` attribute is treated as one
    track. The track's JSON metadata is fetched to obtain its title and
    audio address, then the audio is saved with ``get_m4a``. A failure on
    one track is reported and the remaining tracks are still processed.

    Args:
        url: Address of the album page to scan.

    Raises:
        requests.RequestException: If the album page itself cannot be
            fetched (connection error, timeout, or HTTP error status).
    """
    res = requests.get(url, headers=header, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # sound_id is the track ID attribute, as seen in the browser's developer tools
    tracks = soup.select('li[sound_id]')
    total = len(tracks)

    for index, track in enumerate(tracks, start=1):
        sound_id = track.attrs["sound_id"]
        # Placeholder so the failure message never depends on the metadata
        # that may be exactly what failed to load
        title = "<unknown>"
        try:
            murl = 'http://www.ximalaya.com/tracks/{}.json'.format(sound_id)
            html = requests.get(murl, headers=header, timeout=30).text
            dic = json.loads(html)
            title = dic["title"]
            print("Downloading the first {}/{} file, file name {}:{}.".format(
                index, total, sound_id, title))
            get_m4a(dic["play_path"], sound_id)
        except (requests.RequestException, ValueError, KeyError, TypeError, OSError):
            # Network, malformed-metadata and file errors skip this track only;
            # anything else (e.g. KeyboardInterrupt, NameError) still propagates
            print("Failed to download {}/{} file, filename {}:{}.".format(
                index, total, sound_id, title))


def get_m4a(url, id):
    """Download the audio file at *url* into the local album folder.

    The file is written to ``<folder>/<basename of id>``; the folder is
    created if it does not exist yet.

    Args:
        url: Direct address of the audio file.
        id: Track ID used as the local file name. (The name shadows the
            ``id`` builtin but is kept so existing callers still work.)

    Raises:
        requests.RequestException: On connection error, timeout, or HTTP
            error status.
        OSError: If the folder or file cannot be created or written.
    """
    folder = "Guo Degang cross talk"  # custom folder name
    # open() does not create missing directories, so make sure it exists
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, os.path.basename(id))

    # stream=True fetches the body lazily, so iter_content really reads it
    # chunk by chunk instead of holding the whole file in memory
    with requests.get(url, stream=True, timeout=30) as res:
        # Fail here rather than saving an HTTP error page as an audio file
        res.raise_for_status()
        with open(target, 'wb') as file:
            for chunk in res.iter_content(100000):
                file.write(chunk)


# Script entry point: when run directly, download one hard-coded album
if __name__ == '__main__':
    url = "http://www.ximalaya.com/1000202/album/2667276/"  # Address of the album page to download
    get_album(url)

 

You may also like

Origin http://43.154.161.224:23101/article/api/json?id=326062275&siteId=291194637