Crawling NetEase Cloud Music comments and displaying them with a word cloud

  Recently I heard a song I like very much, Xu Wei's "I Thought", and its comments are also very interesting, so I wanted to crawl all the comments under the song and display them with a word cloud tool.

  Using the Chrome developer tools, we find that the song's comments are hidden in XHR requests whose names start with R_SO_4.

Next, the idea is very clear, get the file, parse the json of the file, and get all the comments.

We can see that the file has two encrypted parameters params and encSecKey using JS. For these two encrypted parameters, please refer to the answer of Zhihu users: https://www.zhihu.com/question/36081767  .

 

Steps:

1. Import the necessary modules:

from Crypto.Cipher import AES
from wordcloud import WordCloud
# The following two lines must come BEFORE importing pyplot, otherwise on
# macOS you get: "matplotlib: RuntimeError: Python is not installed as a framework".
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import base64
import requests
import json
import codecs
import time
import jieba

Note: I use macOS; in this environment an error is reported unless you add:

import matplotlib
matplotlib.use('TkAgg')

 

2. Write the request header:

# Request headers that make the POST look like a normal browser visit to the
# song page (Referer/Origin are checked by the weapi endpoint).
headers = {
    'Host': 'music.163.com',
    'Origin': 'https://music.163.com',
    'Referer': 'https://music.163.com/song?id=28793052',
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/65.0.3325.181 Safari/537.36'),
}

 

 

3. Parse the two parameters params and encSecKey:

# Fixed parameters of NetEase's "weapi" request encryption
# (see https://www.zhihu.com/question/36081767).
# NOTE: the original had stray spaces inside these string literals, which made
# the AES key 18 characters (invalid) and the hex constants unparsable.
# First parameter: the default request payload (page 1)
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
# Second parameter: RSA public exponent, hex
second_param = "010001"
# Third parameter: RSA modulus, hex
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Fourth parameter: NetEase's fixed 16-byte AES key (nonce)
forth_param = "0CoJUm6Qyw8W8jud"

#Get parameters 
# Build the encrypted "params" form field
def get_params(page):
    """Build the encrypted `params` field for the given 1-based page number.

    The JSON-ish payload is AES-CBC encrypted twice: first with NetEase's
    fixed key, then with a constant 16-char 'F' key (whose RSA-encrypted
    form is the hard-coded encSecKey returned by get_encSecKey()).
    """
    # Fixed 16-byte IV used by NetEase's JS. NOTE: the original had stray
    # spaces inside the literal, making it 18 chars — an invalid AES IV.
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'
    if page == 1:
        payload = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        # 20 comments per page; offset selects where this page starts.
        offset = str((page - 1) * 20)
        payload = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
    h_encText = AES_encrypt(payload, first_key, iv)
    h_encText = AES_encrypt(h_encText, second_key, iv)
    return h_encText

#Get encSecKey 
def get_encSecKey():
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
    return encSecKey


#Decryption process 
# Encryption helper (the original comment mislabeled this as "decryption")
def AES_encrypt(text, key, iv):
    """AES-CBC encrypt *text* with *key*/*iv*; return the base64-encoded str.

    All three arguments are str; they are encoded to UTF-8 bytes because
    pycryptodome under Python 3 requires bytes for the key, IV and plaintext.
    Padding is computed on the ENCODED byte length (the original padded the
    str length, which is wrong for non-ASCII text such as Chinese payloads).
    """
    data = text.encode("utf-8")
    pad = 16 - len(data) % 16          # PKCS#7-style padding to a 16-byte block
    data += bytes([pad]) * pad
    encryptor = AES.new(key.encode("utf-8"), AES.MODE_CBC, iv.encode("utf-8"))
    encrypt_text = encryptor.encrypt(data)
    # Decode to str so the value can be sent as a POST form field.
    return base64.b64encode(encrypt_text).decode("utf-8")

 

4. Get the json data and grab the comments:

#Get comment json data 
def get_json(url, params, encSecKey):
    data = {
         "params": params,
         "encSecKey": encSecKey
    }
    response = requests.post(url, headers=headers, data=data)
    return response.content

#Grab the first 100 comments of a song 
def get_all_comments(url,page):
    all_comments_list = [] # 存放所有评论
    for i in range(page):  # 逐页抓取
        params = get_params(i+1)
        encSecKey = get_encSecKey()
        json_text = get_json(url,params,encSecKey)
        json_dict = json.loads(json_text)
        for item in json_dict['comments']:
            comment = item['content'] # 评论内容
            comment_info = str(comment)
            all_comments_list.append(comment_info)
        print('第%d页抓取完毕!' % (i+1))
        #time.sleep(random.choice(range(1,3)))  #爬取过快的话,设置休眠时间,跑慢点,减轻服务器负担
    return all_comments_list

 

5. Use jieba word segmentation to filter out stop words, then generate the word cloud with wordcloud:

#生成词云
def wordcloud(all_comments):
    # 对句子进行分词,加载停用词
    # 打开和保存文件时记得加encoding='utf-8'编码,不然会报错。
    def seg_sentence(sentence):
        sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # 精确模式
        stopwords = [line.strip() for line in open('stopwords.txt', 'r', encoding='utf-8').readlines()]  # 这里加载停用词的路径
        outstr = ''
        for word in sentence_seged:
            if word not in stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return outstr
    for line in all_comments:
        line_seg = seg_sentence(line)  # 这里的返回值是字符串
        with open('outputs.txt', 'a', encoding='utf-8') as f:
            f.write(line_seg + '\n')

    data = open('outputs.txt', 'r', encoding='utf-8').read()
    my_wordcloud = WordCloud(
        background_color='white',  #设置背景颜色
        max_words=200,  #设置最大实现的字数
        font_path=r'SimHei.ttf',  #设置字体格式,如不设置显示不了中文
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # 展示词云

Note that the file encoding must be 'utf-8'.

 

6. Define the main function and the program entry point:

def main():
    """Crawl the song's comments, render the word cloud, and time the run."""
    t0 = time.time()  # start of the run
    # Replace with the R_SO_4 weapi URL of the song you want to crawl.
    url = "https://music.163.com/weapi/v1/resource/comments/R_SO_4_28793052?csrf_token="
    # Number of pages to crawl (20 comments per page).
    all_comments = get_all_comments(url, page=2000)
    wordcloud(all_comments)
    print('程序耗时%f秒.' % (time.time() - t0))

if __name__ == '__main__':
    main()

 

 

The run proceeds as follows (I crawled the first 2000 pages of comments of "我以为" ("I Thought")):

 

The generated word cloud:

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324612138&siteId=291194637