import json
import urllib.request
import urllib.parse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import jieba as jb
from wordcloud import WordCloud
from PIL import Image
# --- Scrape top-level comments (and their sub-replies) from the Bilibili
# --- reply API for video oid=95762636, pages [start_page, end_page].
start_page = int(input('请输入起始页码'))
end_page = int(input('请输入结束页码'))
comment = []  # flat list of every comment/sub-reply message text
for page in range(start_page, end_page + 1):
    # pn = page number, type=1 = video comments, sort=2 = sort by hotness.
    params = urllib.parse.urlencode({
        'pn': page,
        'type': 1,
        'oid': 95762636,
        'sort': 2,
    })
    url = 'https://api.bilibili.com/x/v2/reply?jsonp=jsonp&' + params
    # SECURITY NOTE(review): this cookie embeds a live session token
    # (SESSDATA / bili_jct) — it should be loaded from config, not committed.
    headers = {
        "cookie": "CURRENT_FNVAL=16; _uuid=52728A0A-2C90-F5EB-6B5D-476076B10BFD23666infoc; buvid3=8BDC790C-2F02-4EBF-9E68-1D010059DE2C53913infoc; LIVE_BUVID=AUTO5815785509367627; rpdid=|(JlRYkk||m)0J'ul~)ll|mlk; bsource=seo_baidu; sid=kgiba6jz; DedeUserID=600947534; DedeUserID__ckMd5=1a12f6d73050dbb9; SESSDATA=b25c4c4d%2C1606528657%2C56900*61; bili_jct=0df2e70724b7918dc3c5cb6d33998a75",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = json.loads(response.read().decode('utf-8'))
    # The API returns null (None) instead of [] when a page has no replies.
    replies = content['data']['replies'] or []
    for reply in replies:
        comment.append(reply['content']['message'])
        # Each top-level reply may carry a nested list of sub-replies.
        for sub in (reply['replies'] or []):
            comment.append(sub['content']['message'])
print(comment[4])
# Persist the scraped comments to an Excel workbook, one comment per row
# starting at cell A2 of the first sheet.
import xlsxwriter

workbook = xlsxwriter.Workbook("小艾大叔.xlsx")
first_sheet = workbook.add_worksheet("first_sheet")
first_sheet.write_column('A2', comment)
workbook.close()
# --- Tokenise every comment with jieba, keeping only tokens longer than
# --- one character (drops single-char particles and punctuation).
contents = []  # list of token lists, one entry per comment
for text in comment:
    # jb.lcut already returns a list; no numpy wrapper needed.
    tokens = [word for word in jb.lcut(text) if len(word) > 1]
    contents.append(tokens)
print(contents)
# --- Drop stop-words, then build a {word: count} dict of the 80 most
# --- frequent remaining words for the word cloud.
# stopwords.txt is one word per line; the default tab separator never fires
# on such lines, so each line parses as a single 'word' field. quoting=3
# (QUOTE_NONE) keeps quote characters in the file from being interpreted.
# (The original sep='/n' was a typo for a newline separator, which modern
# pandas rejects anyway.)
stopwords = pd.read_table('stopwords.txt', header=None, names=['word'],
                          encoding='utf8', quoting=3)
sw_set = set(stopwords.word.tolist())  # set: O(1) membership per token
content_clear = []  # per-comment token lists with stop-words removed
all_word = []       # every surviving token, flattened
for line in contents:
    line_clear = [word for word in line if word not in sw_set]
    content_clear.append(line_clear)
    all_word.extend(line_clear)
# value_counts() sorts descending by frequency; the dict-of-aggregators
# form agg({'count': np.size}) raises SpecificationError in pandas >= 1.0.
word_count = pd.Series(all_word, name='all_word').value_counts()
word_dict = word_count.iloc[:80].to_dict()
# --- Render the word frequencies as a word cloud shaped by bg.png and
# --- display the figure.
bg = np.array(Image.open('bg.png'))  # mask: words are drawn inside the shape
wc_obj = WordCloud(font_path='simhei.ttf',  # CJK-capable font, required for Chinese
                   max_font_size=180, width=400, height=200,
                   mask=bg, mode='RGBA', scale=5)
wc_obj.fit_words(word_dict)  # layout from precomputed {word: count} dict
plt.figure(figsize=(30, 15))
plt.imshow(wc_obj)
plt.axis('off')
# Without show() a non-interactive script exits before anything is displayed.
plt.show()