'''
Date: 20200720
Name: Jack Zhao
Descr: Sentiment analysis with NLTK
'''
# Full script
# Import the required packages
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
import string
# Load the reviews; only the 'comments' column is used below.
data = pd.read_csv('./reviews.csv')
lis = list(data['comments'])

# Stop words: NLTK's English stop-word list plus every ASCII punctuation character.
stop = stopwords.words("english") + list(string.punctuation)
# Frozen copy for O(1) membership tests; `stop` itself stays a list.
_stop_lookup = frozenset(stop)

# Number of leading reviews to process; change this to analyse more or fewer rows.
MAX_REVIEWS = 1000


def _tokenize_comment(text):
    """Return the lower-cased tokens of one comment without stop words.

    A missing comment (NaN/None) yields an empty token list; passing it
    through ``str()`` would otherwise produce the literal word "nan" and
    have it tagged and scored like a real word.
    """
    if pd.isna(text):
        return []
    return [
        token
        for token in word_tokenize(str(text).lower())
        if token not in _stop_lookup
    ]


# Tokenize each review, then replace every token list by its (word, tag) pairs.
ttt = [_tokenize_comment(text) for text in lis[:MAX_REVIEWS]]
ttt = [nltk.pos_tag(t) for t in ttt]

# Count how often each (word, tag) pair occurs within a review.
word_tag_fq = [nltk.FreqDist(t) for t in ttt]
# One list per review of ((word, tag), count) entries, as returned by most_common().
wordlist = [t.most_common() for t in word_tag_fq]
# Turn each review's ((word, tag), count) entries into a three-column table:
# 'key' holds the word, 'part' its POS tag and 'frequency' its count.
df = []
for entries in wordlist:
    tagged_words = [entry[0] for entry in entries]
    textdf = pd.DataFrame(
        {
            'key': [pair[0] for pair in tagged_words],
            'part': [pair[1] for pair in tagged_words],
            'frequency': [entry[1] for entry in entries],
        },
        columns=['key', 'part', 'frequency'],
    )
    df.append(textdf)
# POS tag groups; each group collapses to the single-letter code it is named after.
n = ['NN', 'NNP', 'NNPS', 'NNS', 'UH']
v = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
a = ['JJ', 'JJR', 'JJS']
r = ['RB', 'RBR', 'RBS', 'RP', 'WRB']

# Flat tag -> code lookup built from the groups above.
pos_code_by_tag = {}
for code, tags in (('n', n), ('v', v), ('a', a), ('r', r)):
    for tag in tags:
        pos_code_by_tag[tag] = code

# Rewrite the 'part' column (column 1) of every table in place.
# Any tag outside the four groups becomes the empty string.
for textdf in df:
    textdf['part'] = textdf['part'].map(
        lambda tag: pos_code_by_tag.get(tag, '')
    )
# Per-word sentiment scores.
# The same (word, pos) pair recurs across many reviews, so each pair is
# looked up in SentiWordNet only once and its score is reused afterwards.
_word_score_cache = {}


def _word_sentiment(word, pos):
    """Return the position-weighted sentiment score of ``word`` for ``pos``.

    Every synset returned by ``swn.senti_synsets(word, pos)`` contributes
    ``pos_score() - neg_score()`` with weight ``1 / position`` (1, 1/2,
    1/3, ...), and the weighted sum is divided by the sum of the weights.
    A pair with no synsets scores 0.
    """
    cache_key = (word, pos)
    if cache_key not in _word_score_cache:
        synsets = list(swn.senti_synsets(word, pos))
        weighted_sum = 0
        weight_total = 0
        for position, synset in enumerate(synsets, start=1):
            weighted_sum += (synset.pos_score() - synset.neg_score()) / position
            weight_total += 1 / position
        _word_score_cache[cache_key] = (
            weighted_sum / weight_total if synsets else 0
        )
    return _word_score_cache[cache_key]


last_df = []
for textdf in df:
    # Column 0 is the word, column 1 the simplified POS code.
    score = [
        _word_sentiment(word, pos)
        for word, pos in zip(textdf.iloc[:, 0], textdf.iloc[:, 1])
    ]
    # concat returns a new frame (the tables in `df` are left untouched),
    # so the scored tables are collected in a separate list.
    textdf = pd.concat([textdf, pd.DataFrame({'score': score})], axis=1)
    last_df.append(textdf)
    print(textdf)  # show the word scores of each review
# Sentence-level score: the sum of the word scores of each review.
# NOTE(review): every distinct (word, tag) pair counts once here; the
# 'frequency' column is not used as a weight. Confirm this is intended.
scorelis = [sum(scored['score']) for scored in last_df]

# Pair each total with its position in the list and export the table.
indexed_scores = list(enumerate(scorelis))
data = pd.DataFrame(indexed_scores, columns=['index', 'score_sentence'])
data.to_csv('./comments_score.csv')
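# Article: "Python project in practice: SentiWordNet/NLTK sentiment analysis in one post"
# Reposted from blog.csdn.net/weixin_40539952/article/details/107473913
# Scraped page navigation (not code): "You may also like", "Today's picks", "Weekly ranking"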