# Crawl NetEase Cloud Music (including lyrics and comments)

# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=35
# http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=70

from urllib import request,parse
import pymysql
from bs4 import BeautifulSoup
import re
import random
from selenium import webdriver
import time
import json
import requests


def singer():
    """Crawl the hot male-singer listing page and store each singer.

    Fetches http://music.163.com/discover/artist/cat?id=1001, extracts every
    singer's display name and numeric id from the artist links, inserts one
    row per singer into singer_tb, then crawls that singer's albums via
    ``album()``.

    Side effects: network requests, INSERTs into singer_tb, and (through
    ``album()``) the rest of the crawl cascade.
    """
    base_url = 'http://music.163.com/discover/artist/cat?id=1001'

    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): this is an account/session-specific cookie captured from a
        # browser; it expires and should be refreshed before running the crawler.
        # The original source had a corrupted (unterminated) cookie literal here.
        'Cookie': '_ntes_nnid=9590bff1d254dd7fa3f25d8e0d311522,1510122577046; '
                  '_ntes_nuid=9590bff1d254dd7fa3f25d8e0d311522; _iuqxldmzr_=32; '
                  '__utma=94650624.199382336.1520261898.1523349418.1523355839.5; '
                  '__utmb=94650624.21.10.1523355839; __utmc=94650624; '
                  '__utmz=94650624.1523349418.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
        'Host': 'music.163.com',
        'Referer': 'http://music.163.com/',
        # Header values must be strings for urllib; the original passed int 1.
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': random.choice(user_agents),
    }

    req = request.Request(base_url, headers=headers)
    html = request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    singer_list = soup.select('.m-sgerlist')[0]

    # The numeric artist id is embedded in each link's href (e.g. /artist?id=6452).
    id_pattern = re.compile('[0-9]+')

    # One connection for the whole listing instead of one per singer,
    # and always closed (the original leaked a connection per row).
    conn = pymysql.connect('127.0.0.1', 'root', 'zb376100870', '163_song', charset='utf8')
    try:
        cursor = conn.cursor()
        for li in singer_list.find_all('li'):
            link = li.select('a[class="nm nm-icn f-thide s-fc0"]')[0]
            singer_name = link.text
            singer_id = id_pattern.findall(link.attrs['href'])[0]
            sger_num = int(singer_id)

            sql = "insert into singer_tb(sger_name,sger_num) VALUE (%s,%s)"
            cursor.execute(sql, (singer_name, sger_num))
            conn.commit()

            # Crawl this singer's albums (which cascades to songs/lyrics/comments).
            album(singer_id, headers)
    finally:
        conn.close()

def album(id, headers):
    """Crawl up to 10 pages (12 albums each) of one singer's album listing.

    Args:
        id: the singer's numeric id, as a string or int.
        headers: HTTP headers (with session cookie) reused for every request.

    Inserts one row per album into album_tb, then crawls each album's
    tracks via ``song()``.
    """
    sger_num = int(id)
    id_pattern = re.compile('[0-9]+')

    # Single connection for all pages of this singer, always closed
    # (the original opened one unclosed connection per album row).
    conn = pymysql.connect('127.0.0.1', 'root', 'zb376100870', '163_song', charset='utf8')
    try:
        cursor = conn.cursor()
        for page in range(10):
            offset = page * 12
            sger_url = ('http://music.163.com/artist/album?id=%d&limit=12&offset=%d'
                        % (sger_num, offset))
            req = request.Request(sger_url, headers=headers)
            html = request.urlopen(req).read().decode('utf-8')
            soup = BeautifulSoup(html, 'lxml')

            modules = soup.select('#m-song-module')
            if not modules:
                # Page past the end of the listing (or markup changed): skip it.
                continue

            for li in modules[0].find_all('li'):
                link = li.select('a[class="tit s-fc0"]')[0]
                album_name = link.text
                album_num = int(id_pattern.findall(link.attrs['href'])[0])

                sql = "insert into album_tb(album_name,album_num,sger_num) VALUE (%s,%s,%s)"
                cursor.execute(sql, (album_name, album_num, sger_num))
                conn.commit()

                # Crawl the album's track list.
                song(album_num, headers, sger_num)
    finally:
        conn.close()

def song(album_num, headers, sger_num):
    """Crawl one album page and process every track on it.

    Args:
        album_num: numeric album id (int).
        headers: HTTP headers reused for the request.
        sger_num: numeric singer id, forwarded so the song row can join back
            to singer_tb.

    For each track, prints its identifiers and delegates lyric/comment
    fetching plus the song_tb insert to ``word_cmts()``.
    """
    song_url = 'http://music.163.com/album?id=%d' % album_num
    req = request.Request(song_url, headers=headers)
    html = request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')

    track_lists = soup.select('ul[class="f-hide"]')
    if not track_lists:
        # Album page without the hidden track list (removed album or markup
        # change) — the original crashed with IndexError here.
        return

    id_pattern = re.compile('[0-9]+')
    # NOTE: the loop variable is deliberately not named `song` — the original
    # shadowed this function's own name.
    for track in track_lists[0].find_all('a'):
        song_name = track.text
        song_num = int(id_pattern.findall(track.attrs['href'])[0])

        print(song_name, song_num, album_num, sger_num)

        # word_cmts fetches lyric + hot comments and inserts the song row.
        word_cmts(song_name, song_num, album_num, sger_num, headers)

def word_cmts(song_name, song_num, album_num, sger_num, headers):
    """Fetch a song's lyric and hot comments, then insert the song row.

    Args:
        song_name: display name of the track.
        song_num: numeric song id used by both the lyric and comment APIs.
        album_num: numeric album id (for the album_tb join).
        sger_num: numeric singer id (for the singer_tb join).
        headers: HTTP headers reused for both API requests.

    Uses the public lyric API (id=...&lv=1&kv=1&tv=-1) because the page
    itself obfuscates lyrics with JS. Emoji beyond the BMP are replaced
    before the INSERT because the table uses utf8 (3-byte) encoding, which
    cannot store 4-byte characters.
    """
    print(song_num)

    # --- lyric -----------------------------------------------------------
    word_url = ('http://music.163.com/api/song/lyric?id=%s&lv=1&kv=1&tv=-1'
                % song_num)
    req = request.Request(word_url, headers=headers)
    payload = json.loads(request.urlopen(req).read().decode('utf-8'))

    # Some tracks (instrumentals, missing data) have no 'lrc' or a null
    # lyric body; the original could KeyError on the nested lookup.
    lrc = ''
    if 'lrc' in payload:
        raw_lyric = payload['lrc'].get('lyric') or ''
        # Strip the [mm:ss.xx] timestamps that prefix every lyric line.
        lrc = re.sub(r'\[.*\]', '', raw_lyric).strip()

    # --- hot comments ----------------------------------------------------
    cmts_url = ('http://music.163.com/api/v1/resource/comments/R_SO_4_%s'
                '?id=%s&lv=1&kv=1&tv=-1' % (song_num, song_num))
    req = request.Request(cmts_url, headers=headers)
    comment_data = json.loads(request.urlopen(req).read().decode('utf-8'))

    # 'hotComments' may be absent for unpopular tracks — default to empty.
    comments = [cmt['content'].strip()
                for cmt in comment_data.get('hotComments', [])]
    cmmt = str(comments)

    try:
        # UCS-4 build: match astral-plane characters (emoji) directly.
        highpoints = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # UCS-2 build: astral characters appear as surrogate pairs.
        highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    # Replace 4-byte characters so the utf8 (3-byte) column accepts the row.
    cmmt = highpoints.sub(u'??', cmmt)

    # --- persist ---------------------------------------------------------
    conn = pymysql.connect('127.0.0.1', 'root', 'zb376100870', '163_song', charset='utf8')
    try:
        cursor = conn.cursor()
        sql = ("insert into song_tb(song_name,song_num,album_num,sger_num,song_lrc,song_cmts) "
               "VALUE (%s,%s,%s,%s,%s,%s)")
        cursor.execute(sql, (song_name, song_num, album_num, sger_num, lrc, cmmt))
        conn.commit()
    finally:
        # The original never closed the connection — one leak per song.
        conn.close()

if __name__ == '__main__':
    # Entry point: start the crawl cascade (singers -> albums -> songs ->
    # lyrics/comments), writing results into the 163_song MySQL database.
    singer()

  

Field Description

Singer_tb is a singer table, including the following fields:

Sger_id: for counting and sorting

Sger_name: singer name

Sger_num: The number corresponding to the singer's name, which can be used for multi-table joint query

Number of singers: 6

Album_tb: is the album table, including the fields:

Album_id: for counting and sorting

Album_name: Album name

Album_num: album number, which can be used for multi-table joint query with the song_tb song table

Sger_num: The number corresponding to the singer's name, which can be used for multi-table joint query with the singer_tb singer table,

Number of albums: 223

Song_tb: is the song table, including the fields:

Song_id: for counting and sorting

Song_name: Song name

Song_num: Song number

Album_num: album number, which can be used for multi-table joint query with the album_tb album table

Sger_num: The number corresponding to the singer's name, which can be used for multi-table joint query with the singer_tb singer table,

Song_lrc: Lyrics

Song_cmts: Song reviews

Number of songs: 2286

Simple description of multi-table join query

The correspondence between each table and the table has been matched, which can be realized by multi-table joint query (the specific operation can be realized in the database),

Looking at Jay Chou's profile, you can see all Jay Chou's songs and albums, which can be searched with multiple tables.

For example, enter in the database terminal:

select song_name, sger_name from song_tb, singer_tb where song_tb.sger_num = singer_tb.sger_num;

This is just one example of a multi-table join query.

 

Problems encountered:

  1. The interface problem of lyrics, because the js encryption method is more complicated, what I found is the api interface, which can directly get the lyrics
  2. The problem of inserting comments into the database is that the emoji expression cannot be inserted, and the expression of the comment needs to be removed.

The reason is the encoding of the database. The default encoding of the database is utf8. Generally, one character occupies three bytes, but the emoji expression occupies 4 bytes, which leads to the situation that cannot be inserted. The regular method is used to remove the emoji expression.

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324645026&siteId=291194637