NLP data preprocessing: extracting the most frequent hot words from foreign (English-language) movie reviews

import pandas as pd
# Read the labeledTrainData.tsv file
df = pd.read_csv("./labeledTrainData.tsv", sep='\t', escapechar='\\')
df.head(10)
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w…
1 2381_9 1 “The Classic War of the Worlds” by Timothy Hin…
2 7759_3 0 The film starts with a manager (Nicholas Bell)…
3 3630_4 0 It must be assumed that those who praised this…
4 9495_8 1 Superbly trashy and wondrously unpretentious 8…
5 8196_8 1 I dont know why people think this is such a ba…
6 7166_2 0 This movie could have been very good, but come…
7 10633_1 0 I watched this video at a friend’s house. I’m …
8 319_1 0 A friend of mine bought this film for £1, and …
9 8713_10 1 <br /><br />This movie is full of references. …
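labeledTrainData.tsv is the labeled half of Kaggle's "Bag of Words Meets Bags of Popcorn" IMDB review dataset; the escapechar='\\' argument un-escapes the \" sequences inside the review text. A quick sanity check of what was loaded (the full file should hold 25,000 labeled reviews):

# Sanity check: row count and label balance
print(df.shape)                        # expect (25000, 3) for the full file
print(df['sentiment'].value_counts())  # 1 = positive, 0 = negative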
# Grab the review column and convert it to a list
commentList = df["review"].tolist()
# Take the first 100 reviews
commentSplit = commentList[0:100]
# Join the first 100 reviews into one string
commentStr = "".join(commentSplit)
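One caveat with the empty-string join: the last word of one review fuses with the first word of the next, so those boundary tokens come out garbled. Joining on a space avoids that:

commentStr = " ".join(commentSplit)  # space keeps review-boundary words separate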
# Step 1: strip the HTML tags from the reviews with BeautifulSoup
from bs4 import BeautifulSoup as bs
soup = bs(commentStr, "lxml").get_text()
# Lowercase everything so the stopword matching below is case-insensitive
soup2 = soup.lower()
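A minimal sketch of what get_text() does here, on a made-up snippet:

# BeautifulSoup drops the tags and keeps only the text nodes
print(bs("full of references.<br /><br />Great film!", "lxml").get_text())
# -> 'full of references.Great film!'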
import re  # regular expressions
# Extract the runs of English letters as tokens
result = re.findall('[a-zA-Z]+', soup2)
result
# partial output
['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',

 ...]
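Note that [a-zA-Z]+ splits on every non-letter, so contractions come apart ('i've' above became 'i' and 've'); this is also why the stopword list loaded next contains fragments such as 've' and 'll':

print(re.findall('[a-zA-Z]+', "i've watched it"))  # -> ['i', 've', 'watched', 'it']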
# Load the stopword file; header=None keeps the first word from being
# swallowed as a column name
data = pd.read_csv('stopwords.txt', header=None)
# Flatten the single-column DataFrame into a plain list
list2 = data.values
list3 = []
for i in range(len(list2)):
    list3.append(list2[i][0])
# Regex pass to strip stray whitespace from the entries
list4 = str(list3)
result2 = re.findall(r'\w+', list4)
# inspect the result
result2
# partial output
['ll',
 'm',
 're',
 's',
 't',
 've',
 'ZT',
 'ZZ',
 'a',
 'a',
 's',
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 'ain',
 't',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',

 ...]
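The str()/re.findall round-trip above does clean the entries, but it is roundabout; a simpler sketch, assuming stopwords.txt holds one word per line:

# Read the stopword file directly, one word per line
with open('stopwords.txt') as f:
    result2 = [line.strip() for line in f if line.strip()]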
# List comprehension: drop every token that appears in the stopword list
list1 = [w for w in result if w not in result2]
list1
# partial output
['stuff',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary',
 'watched',
 'wiz',
 'watched',
 'moonwalker',
 'insight',
 'guy',
 'cool',
 'eighties',
 'mind',
 'guilty',
 'innocent',
 'moonwalker',
 'biography',
 'feature',
 'film',
 'remember',
 'cinema',
 'originally',
 'released',
 'subtle',

 ...]
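Since result2 is a plain list, every `not in` test scans it end to end. Converting it to a set first gives O(1) membership checks, a common speedup once the token list grows:

stopword_set = set(result2)  # O(1) membership tests
list1 = [w for w in result if w not in stopword_set]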
# Import the counter
from collections import Counter
# Take the 500 most frequent hot words (named top500 so the built-in dict is not shadowed)
top500 = Counter(list1).most_common(500)
# Split the (word, count) pairs into the hot words (list5) and their counts (list6)
list5 = []
for i in range(len(top500)):
    list5.append(top500[i][0])

list6 = []
for j in range(len(top500)):
    list6.append(top500[j][1])
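The two loops can also be collapsed into a single transposition; zip(*...) flips the list of (word, count) pairs into a words sequence and a counts sequence (wrapped in list() since zip yields tuples):

# Equivalent one-step version of the two loops above
list5, list6 = map(list, zip(*top500))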
# Import pyecharts (the calls below follow the 0.x API)
import pyecharts as pye
# Draw the word cloud
wordcloud = pye.WordCloud(width=1500, height=1000)
wordcloud.add("", list5, list6, word_size_range=[20, 100])
wordcloud
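In a notebook the last line renders the chart inline. The calls above are pyecharts 0.x; under pyecharts v1+ the class moved to pyecharts.charts and takes (word, count) pairs, so a minimal sketch for that version would be:

# Sketch for pyecharts v1+ (different API from the 0.x calls above)
from pyecharts import options as opts
from pyecharts.charts import WordCloud

wc = WordCloud(init_opts=opts.InitOpts(width="1500px", height="1000px"))
wc.add("", list(zip(list5, list6)), word_size_range=[20, 100])
wc.render("wordcloud.html")  # writes the chart to an HTML file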

[Figure: word cloud of the top 500 hot words]


Reposted from blog.csdn.net/qq_27171347/article/details/81330382