import pandas as pd
# Load the labeled reviews: a tab-separated file in which a backslash acts
# as the escape character.
df = pd.read_csv(
    "./labeledTrainData.tsv",
    sep='\t',
    escapechar='\\',
)
# Notebook-style preview of the first ten rows.
df.head(10)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
|   | id | sentiment | review |
|---|----|-----------|--------|
| 0 | 5814_8 | 1 | With all this stuff going down at the moment w… |
| 1 | 2381_9 | 1 | “The Classic War of the Worlds” by Timothy Hin… |
| 2 | 7759_3 | 0 | The film starts with a manager (Nicholas Bell)… |
| 3 | 3630_4 | 0 | It must be assumed that those who praised this… |
| 4 | 9495_8 | 1 | Superbly trashy and wondrously unpretentious 8… |
| 5 | 8196_8 | 1 | I dont know why people think this is such a ba… |
| 6 | 7166_2 | 0 | This movie could have been very good, but come… |
| 7 | 10633_1 | 0 | I watched this video at a friend’s house. I’m … |
| 8 | 319_1 | 0 | A friend of mine bought this film for £1, and … |
| 9 | 8713_10 | 1 | <br /><br />This movie is full of references. … |
from bs4 import BeautifulSoup as bs
import re

# Tokenise the first 100 reviews into lowercase alphabetic words.
commentList = df["review"].tolist()
commentSplit = commentList[0:100]
# Join with a space rather than an empty string: otherwise the last word of
# one review and the first word of the next fuse into a single bogus token
# whenever no punctuation separates them.
commentStr = " ".join(commentSplit)
# Strip HTML markup such as <br />. The " " separator keeps words on either
# side of a removed tag from being glued together.
soup = bs(commentStr, "lxml").get_text(" ")
soup2 = soup.lower()
# Keep runs of ASCII letters only; digits, punctuation and apostrophes act as
# delimiters (so "i've" yields "i" and "ve").
result = re.findall('[a-zA-Z]+', soup2)
result
# Partial output
['with',
'all',
'this',
'stuff',
'going',
'down',
'at',
'the',
'moment',
'with',
'mj',
'i',
've',
'started',
'listening',
'to',
'his',
'music',
'watching',
'the',
'odd',
'documentary',
'here',
'and',
'there',
'watched',
'the',
...]
# Load the stop-word list and flatten it into a list of word tokens.
# header=None: without it pandas consumes the first line of the file as the
#   column header, silently dropping the first stop word.
#   NOTE(review): assumes stopwords.txt is a plain word list with no header
#   row - confirm against the file.
# dtype=str / keep_default_na=False: keep entries such as "null", "NA" or
#   "None" as literal words instead of letting pandas turn them into NaN.
data = pd.read_csv('stopwords.txt', header=None, dtype=str, keep_default_na=False)
list2 = data.values
# Only the first column of each row is used.
list3 = [row[0] for row in list2]
list4 = str(list3)
# Splitting on \w+ breaks entries like "a's" into "a" and "s", which matches
# how the review text is tokenised (apostrophes are delimiters there too).
result2 = re.findall(r'\w+', list4)
result2
# Partial output
['ll',
'm',
're',
's',
't',
've',
'ZT',
'ZZ',
'a',
'a',
's',
'able',
'about',
'above',
'abst',
'accordance',
'according',
'accordingly',
'across',
'act',
'actually',
'added',
'adj',
'adopted',
'affected',
'affecting',
'affects',
'after',
'afterwards',
'again',
'against',
'ah',
'ain',
't',
'all',
'allow',
'allows',
'almost',
'alone',
'along',
'already',
'also',
'although',
'always',
'am',
'among',
'amongst',
'an',
'and',
'announce',
'another',
'any',
'anybody',
'anyhow',
'anymore',
'anyone',
'anything',
'anyway',
'anyways',
'anywhere',
'apart',
'apparently',
'appear',
'appreciate',
'appropriate',
'approximately',
...]
# Drop stop words from the review tokens, preserving order and duplicates.
# The lookup set is built once so each membership test is O(1) instead of a
# linear scan over the whole stop-word list for every token.
stop_words = set(result2)
list1 = [w for w in result if w not in stop_words]
list1
# Partial output
['stuff',
'moment',
'mj',
'started',
'listening',
'music',
'watching',
'odd',
'documentary',
'watched',
'wiz',
'watched',
'moonwalker',
'insight',
'guy',
'cool',
'eighties',
'mind',
'guilty',
'innocent',
'moonwalker',
'biography',
'feature',
'film',
'remember',
'cinema',
'originally',
'released',
'subtle',
...]
from collections import Counter
import pyecharts as pye

# The 500 most frequent remaining words as (word, count) pairs, most common
# first. Named top_words rather than dict so the built-in dict type is not
# shadowed for the rest of the session.
top_words = Counter(list1).most_common(500)
# Split the pairs into parallel lists: list5 holds the words, list6 the counts.
list5 = [word for word, _count in top_words]
list6 = [count for _word, count in top_words]

# NOTE(review): pye.WordCloud(...).add(name, words, counts, ...) looks like the
# pre-1.0 pyecharts API - confirm against the installed pyecharts version.
wordcloud = pye.WordCloud(width=1500, height=1000)
wordcloud.add("", list5, list6, word_size_range=[20, 100])
wordcloud