爬虫-网易云评论

爬虫-网易云评论

# post请求体    
params: HMtP7KwWWgctb71g3T8v7b5SzlO1qN5JDI6WC8AqPYoakAYrpw1hm99wsn0Hp6AfP1ZNlp494Z+4XGXKiYyEXYTSoHvYTVhYpgDxUuSBdgNcZE0IXkkoA5YUEnQf2ESWO3bmt09k2ogKLOoQNWxEnXRewB0Oy2lPEdo52CVVNkUTMMd/gVPq4Zhj4LUvyjDh
encSecKey: 83e7a7f8bf53186b5c224d2732d86fb41a6366b8fb3c61b7dd4e630f6c5199e5c98732ab6fef399a8b4d08ece5a338e132c7cbc4a86a7f2d8c768431b408671acac04d05010406784afad5c36a904a784478bbc5a1fb29e46df26dc49fea70e6015d1a5409dec5a2f1bc0c997ffc3642177034138d7c2b9c872b35b81e95da7d
    
# js文件中
var bLq2x = window.asrsea(JSON.stringify(i8a), bvc9T(["流泪", "强"]), bvc9T(TQ2x.md), bvc9T(["爱心", "女孩", "惊恐", "大笑"]));
            e8e.data = k8c.cy9p({
                params: bLq2x.encText,
                encSecKey: bLq2x.encSecKey
            })
    
# 进一步找
function() {
    // Build a random string of length `a`, drawing each character uniformly
    // (via Math.random) from the 62-character alphabet a-z, A-Z, 0-9.
    // Called below as a(16) to produce the per-request key `i`.
    function a(a) {
        var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
        // d is the loop counter, e the random index into b, c the accumulated result.
        for (d = 0; a > d; d += 1)
            e = Math.random() * b.length,
            e = Math.floor(e),
            c += b.charAt(e);
        return c
    }
    // AES encryption.
    // Encrypts the text `a` with the key `b` using CryptoJS AES in CBC mode
    // and the fixed IV "0102030405060708"; key, IV and text are all parsed as
    // UTF-8. Returns the string form of the CryptoJS cipher result.
    function b(a, b) {
        var c = CryptoJS.enc.Utf8.parse(b)
          , d = CryptoJS.enc.Utf8.parse("0102030405060708")
          , e = CryptoJS.enc.Utf8.parse(a)
          , f = CryptoJS.AES.encrypt(e, c, {
            iv: d,
            mode: CryptoJS.mode.CBC
        });
        return f.toString()
    }
    // RSA encryption.
    // Encrypts the text `a` with a key pair built from `b` and `c`
    // (called below with b = "010001" and c = the long hex modulus string).
    // The comma expression evaluates to its last operand, so the function
    // returns the value assigned to e.
    function c(a, b, c) {
        var d, e;
        return setMaxDigits(131),  // original note: number of hexadecimal digits of n
        d = new RSAKeyPair(b,"",c),  // d is the key pair object
        e = encryptedString(d, a)  // e is the encryption result of a
    }
    // 得到加密后的结果
    function d(d, e, f, g) {
        var h = {}
          , i = a(16);
        return h.encText = b(d, g),  // 第一次AES加密(msg,key)
        h.encText = b(h.encText, i),  // 第二次AES加密(密钥为随机字符串 i)
        h.encSecKey = c(i, e, f),   // 对随机字符串 i 进行RSA加密
        h
    }
    
    // Alternative entry point: RSA-encrypts the concatenation a + e using the
    // key material (b, d) via c(), and returns it as { encText: ... }.
    // Unlike d() above it performs no AES step and sets no encSecKey.
    function e(a, b, d, e) {
        var f = {};
        return f.encText = c(a + e, b, d),
        f
    }
    
d = "{"csrf_token":""}", e = "010001", f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7", g = "0CoJUm6Qyw8W8jud"

python 代码

'''爬取指定歌曲的评论信息

点入歌曲详情页面,通过以下链接取出评论
/weapi/v1/resource/comments/R_SO_4_254574?csrf_token= HTTP/1.1
'''
import base64
import random
from math import floor, ceil
from multiprocessing import Pool
import jieba
from Crypto.Cipher import AES
import codecs
import requests
from wordcloud import WordCloud
class CommentSpider(object):
    """Crawl the comments of one NetEase Cloud Music song and draw a word cloud.

    The comment endpoint expects two encrypted form fields, ``params`` and
    ``encSecKey``.  They are produced the same way the site's JavaScript does:
    the request JSON is AES-CBC encrypted twice (first with a fixed key, then
    with a random 16-character key) and the random key is RSA encrypted.
    """

    # Alphabet the random AES key is drawn from.
    _ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    # Number of comments requested per page (matches limit:"20" in the payload).
    _PAGE_SIZE = 20

    def __init__(self, song_name, song_id):
        """Store the song identity and the HTTP headers sent with every request.

        Args:
            song_name: Used as the base name of the output ``.txt``/``.png`` files.
            song_id: NetEase song id, inserted into the comment URL.
        """
        self.song_name = song_name
        self.song_id = song_id
        self.headers = {'Host': 'music.163.com',
                        'Referer': 'http://music.163.com/',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/66.0.3359.181 Safari/537.36'
                        }

    def generate_random_string(self, length):
        """Return ``length`` characters picked at random from a-z, A-Z, 0-9."""
        return ''.join(random.choice(self._ALPHABET) for _ in range(length))

    def aes_encrypt(self, msg, key):
        """AES-CBC encrypt ``msg`` with ``key`` and return Base64 text.

        The IV is the fixed string ``0102030405060708``.  Padding is PKCS#7
        and is computed on the UTF-8 *bytes*, so non-ASCII plaintext is padded
        to a valid block length as well.

        Args:
            msg: Plaintext string.
            key: Key string (encoded as UTF-8 before use).

        Returns:
            The ciphertext, Base64 encoded, as ``str``.
        """
        iv = '0102030405060708'
        data = msg.encode('utf8')
        pad_len = 16 - len(data) % 16
        data += bytes([pad_len]) * pad_len
        cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, iv.encode('utf8'))
        return base64.b64encode(cipher.encrypt(data)).decode('utf8')

    def rsa_encrypt(self, random_string, key, f):
        """Textbook-RSA encrypt ``random_string`` and return 256 hex digits.

        Args:
            random_string: Text to encrypt; it is reversed before encryption.
            key: Public exponent as a hex string.
            f: Modulus as a hex string.

        Returns:
            ``m ** key mod f`` in lowercase hex, left-padded with zeros to
            256 characters.
        """
        # The text is reversed, then its UTF-8 bytes are read as one big-endian integer.
        text = random_string[::-1].encode('utf-8')
        message = int.from_bytes(text, 'big')
        # Three-argument pow performs modular exponentiation without building
        # the enormous intermediate power.
        sec_key = pow(message, int(key, 16), int(f, 16))
        return format(sec_key, 'x').zfill(256)

    def get_params(self, page):
        """Build the encrypted ``(params, encSecKey)`` pair for page ``page``.

        Args:
            page: 1-based page number; converted to an offset of 20 per page.

        Returns:
            Tuple ``(encText, encSecKey)`` to be posted as ``params`` and
            ``encSecKey``.
        """
        offset = (page - 1) * self._PAGE_SIZE
        # Payload shape and the constants below were read off the site's
        # JavaScript by breaking inside its encryption function.
        msg = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
        key = '0CoJUm6Qyw8W8jud'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a87' \
            '6aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9' \
            'd05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b' \
            '8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        # Random 16-character key used for the second AES pass.
        str_16 = self.generate_random_string(16)
        # First AES pass uses the fixed key, second pass the random key.
        first_aes = self.aes_encrypt(msg, key)
        encText = self.aes_encrypt(first_aes, str_16)
        # The random key itself travels RSA-encrypted as encSecKey.
        encSecKey = self.rsa_encrypt(str_16, e, f)
        return encText, encSecKey

    def _fetch_page(self, page):
        """POST the encrypted request for ``page`` and return the response."""
        url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_%s?csrf_token=' % self.song_id
        params, encSecKey = self.get_params(page)
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept to preserve existing behaviour; consider removing it.
        return requests.post(url, data={'params': params, 'encSecKey': encSecKey},
                             headers=self.headers, verify=False)

    @staticmethod
    def _page_count(total, page_size=20):
        """Return how many pages of ``page_size`` are needed for ``total`` items."""
        return max(0, (total + page_size - 1) // page_size)

    def get_comment(self, song_data):
        """Fetch one page of comments and append them to ``<song_name>.txt``.

        Args:
            song_data: Sequence ``(song_name, song_id, page)``; index 0 names
                the output file and index 2 is the page to fetch.
        """
        page = song_data[2]
        res = self._fetch_page(page)
        if res.status_code == 200:
            print('正在爬取第%s页的内容' % page)
            comments = res.json()['comments']
            with open(song_data[0] + '.txt', 'a', encoding='utf-8') as f:
                f.writelines(comment['content'] + '\n' for comment in comments)
        else:
            print('爬取第%s页失败' % page)

    def make_wordcloud(self, file_name):
        """Render ``<file_name>.txt`` as a word cloud saved to ``<file_name>.png``."""
        with open('%s.txt' % file_name, 'r', encoding='utf-8') as f:
            txt = f.read()
        # Segment with jieba and join with spaces: WordCloud tokenizes on word
        # boundaries, so joining with '' would hand it the unsegmented text.
        text = ' '.join(jieba.cut(txt))
        wc = WordCloud(
            font_path="simhei.ttf",  # must be a font installed on this machine, e.g. under C:\Windows\Fonts
            width=1200,
            height=800,
            max_words=100,
            max_font_size=200,
            min_font_size=10
        )
        wc.generate(text)
        wc.to_file(file_name + '.png')

    def run(self):
        """Crawl every comment page with a process pool, then build the word cloud."""
        # The first page's response carries the total comment count.
        res = self._fetch_page(1)
        page_count = self._page_count(res.json()['total'], self._PAGE_SIZE)
        song_data = [(self.song_name, self.song_id, page) for page in range(1, page_count + 1)]
        # map() blocks until every page is done; the context manager then
        # releases the worker processes.
        with Pool(processes=4) as pool:
            pool.map(self.get_comment, song_data)

        # All pages are written to the file by now.
        self.make_wordcloud(self.song_name)


if __name__ == "__main__":
    # Script entry point: crawl the comments of a fixed song (display name and
    # NetEase song id) and generate its word cloud.
    spider = CommentSpider('太多', '1339315554')
    spider.run()

猜你喜欢

转载自www.cnblogs.com/Afrafre/p/11693784.html