NLP English data analysis
1. Complete English preprocessing code
# English sentence processing module
from nltk.corpus import stopwords as pw
import sys
import re
# English stop words from the NLTK stopwords corpus, loaded once at import
# time and consulted by English_processing on every call.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed locally beforehand - confirm in the target environment.
cacheStopWords=pw.words("english")
# Characters that English_processing replaces with a space before splitting
# the text into words: curly double quotes, ASCII punctuation, the backslash
# and all digits. The backslash is written as "\\" so the literal contains no
# invalid escape sequences.
_SYMBOLS_TO_BLANK = "“”!?.\\;',()<>{}/-1234567890$&#%~"
_SYMBOL_TABLE = str.maketrans(_SYMBOLS_TO_BLANK, " " * len(_SYMBOLS_TO_BLANK))
# Extra tokens removed in addition to the stop words
# (presumably markup residue such as "<br>" - confirm against the data).
_EXTRA_DROPPED_WORDS = frozenset(['br', 'w', 'b', 'bc'])


def English_processing(sentence, stop_words=None):
    """Lower-case an English sentence and strip symbols, digits and stop words.

    Args:
        sentence: Raw text. ``None`` and the empty string are accepted.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, the module-level ``cacheStopWords`` list is used.

    Returns:
        The surviving words, each followed by a single space (so a non-empty
        result ends with a trailing space, as before). Returns ``""`` when
        ``sentence`` is empty or ``None``, or when every word is filtered out.
    """
    if not sentence:
        # Always hand back a string so callers can chain .strip().split().
        return ""
    # A set makes each membership test O(1) instead of scanning a list.
    dropped = set(cacheStopWords if stop_words is None else stop_words)
    dropped |= _EXTRA_DROPPED_WORDS
    # One translate() pass replaces the per-character lower()/replace() loop.
    cleaned = sentence.lower().translate(_SYMBOL_TABLE)
    return "".join(word + " " for word in cleaned.split() if word not in dropped)
2. Word-frequency statistics + word-cloud analysis
Word-frequency statistics
def concat_sentence(sen_list):
    """Join all sentences into one string.

    Args:
        sen_list: Iterable of sentences; each item is converted with ``str()``.

    Returns:
        Every item prefixed with a single space and concatenated in order, so
        a non-empty result starts with a space (unchanged from the original
        behaviour). Returns ``""`` for an empty iterable.
    """
    # str.join builds the result in one pass instead of repeated "+=".
    return "".join(" " + str(item) for item in sen_list)
def compute_word_fre(sentence):
    """Preprocess an English text and count how often each word occurs.

    Args:
        sentence: Raw text. ``None`` and the empty string are accepted.

    Returns:
        A plain ``dict`` mapping each word left after ``English_processing``
        to its number of occurrences, in first-occurrence order. An empty
        dict is returned for empty input, so callers always get a dict.
    """
    # Local import keeps this function usable without touching file imports.
    from collections import Counter

    if not sentence:
        return {}
    words = English_processing(sentence).strip().split()
    # Convert back to a plain dict to keep the original return type.
    return dict(Counter(words))
def output(word_fre):
    """Return the word-frequency table sorted by descending count.

    Args:
        word_fre: Mapping of word to count; ``None`` or an empty mapping is
            accepted.

    Returns:
        A list of ``(word, count)`` tuples, highest count first. Words with
        equal counts keep the order they have in ``word_fre``. An empty list
        is returned for empty input, so the result can always be iterated.
    """
    if not word_fre:
        return []
    return sorted(word_fre.items(), key=lambda pair: pair[1], reverse=True)
Word cloud analysis
`sentences_list` is a list of sentences. Pass it to
`concat_sentence` to join the sentences into a single article, and then count the word frequencies.
If the input is already a single article, you can skip the first line of code.
# Word-cloud script: join the sentences, count word frequencies, render the
# cloud inside the shape given by 'cloud.png', and save it as 'wordcloud.png'.
all_sen = concat_sentence(sentences_list) # NOTE: only this input needs to be adjusted
word_fre = compute_word_fre(all_sen)
import matplotlib.pyplot as plt
from imageio import imread,imsave
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
# 'cloud.png' is read in RGB mode and passed as the mask; the cloud is then
# filled from the word -> count mapping in word_fre.
wordcloud = WordCloud(background_color='white',collocations=False,mask=imread('cloud.png',pilmode="RGB"),
max_words=30,random_state=2021,width=1200, height=800).fit_words(word_fre)
# Draw the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
# Write the rendered cloud to an image file.
wordcloud.to_file("wordcloud.png")
#plt.savefig("other_wordcloud.png",dpi=600) # alternative way to save the figure
3. Sentiment analysis
NLTK
NLTK (Natural Language Toolkit) is a Python library commonly used in NLP research. It was developed in Python by Steven Bird and Edward Loper of the University of Pennsylvania and has grown to more than one hundred thousand lines of code so far. It is an open-source project that includes datasets, Python modules, tutorials, and more.
Sentiment analysis in action
# Sentiment demo: score each sentence with NLTK's VADER analyzer and print
# the raw score mapping followed by the same scores as "key: value" pairs.
# NOTE(review): VADER presumably needs the "vader_lexicon" NLTK resource to
# be installed beforehand - confirm in the target environment.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
sentences = ['Hello, world. I am terrible']
for sentence in sentences:
    print(sentence)
    # polarity_scores returns a mapping of score name -> value.
    point = sia.polarity_scores(sentence)
    print(point)
    # Same scores again, in alphabetical key order, all on one line.
    for k in sorted(point):
        print('{0}: {1}, '.format(k, point[k]), end='')
    # Terminate the "key: value, " line so the next sentence starts on a
    # fresh line instead of running on after the last score.
    print()
4. Similarity analysis (LDA, LSI, TF-IDF)
Complete code
# English sentence processing module
from nltk.corpus import stopwords as pw
import sys
import re
# English stop words from the NLTK stopwords corpus, loaded once at import
# time and consulted by English_processing on every call.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# installed locally beforehand - confirm in the target environment.
cacheStopWords=pw.words("english")
# Characters that English_processing replaces with a space before splitting
# the text into words: curly double quotes, ASCII punctuation, the backslash
# and all digits. The backslash is written as "\\" so the literal contains no
# invalid escape sequences.
_SYMBOLS_TO_BLANK = "“”!?.\\;',()<>{}/-1234567890$&#%~"
_SYMBOL_TABLE = str.maketrans(_SYMBOLS_TO_BLANK, " " * len(_SYMBOLS_TO_BLANK))
# Extra tokens removed in addition to the stop words
# (presumably markup residue such as "<br>" - confirm against the data).
_EXTRA_DROPPED_WORDS = frozenset(['br', 'w', 'b', 'bc'])


def English_processing(sentence, stop_words=None):
    """Lower-case an English sentence and strip symbols, digits and stop words.

    Args:
        sentence: Raw text. ``None`` and the empty string are accepted.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, the module-level ``cacheStopWords`` list is used.

    Returns:
        The surviving words, each followed by a single space (so a non-empty
        result ends with a trailing space, as before). Returns ``""`` when
        ``sentence`` is empty or ``None``, or when every word is filtered out.
    """
    if not sentence:
        # Always hand back a string so callers can chain .strip().split().
        return ""
    # A set makes each membership test O(1) instead of scanning a list.
    dropped = set(cacheStopWords if stop_words is None else stop_words)
    dropped |= _EXTRA_DROPPED_WORDS
    # One translate() pass replaces the per-character lower()/replace() loop.
    cleaned = sentence.lower().translate(_SYMBOL_TABLE)
    return "".join(word + " " for word in cleaned.split() if word not in dropped)
def concat_sentence(sen_list):
    """Join all sentences into one string.

    Args:
        sen_list: Iterable of sentences; each item is converted with ``str()``.

    Returns:
        Every item prefixed with a single space and concatenated in order, so
        a non-empty result starts with a space (unchanged from the original
        behaviour). Returns ``""`` for an empty iterable.
    """
    # str.join builds the result in one pass instead of repeated "+=".
    return "".join(" " + str(item) for item in sen_list)
import gc
import tqdm
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
import time
class SentenceSimilarity():
    """Index a list of English sentences and find those most similar to a query.

    Usage: construct with the sentences, call one of ``TfidfModel()``,
    ``LsiModel()`` or ``LdaModel()`` to fit the model and build the similarity
    index, then query with ``similarity()`` or ``similarity_k()``.
    """

    def __init__(self, sentences, min_frequency=1):
        """Preprocess and store the sentences.

        Args:
            sentences: Iterable of raw English sentences.
            min_frequency: Tokens are kept only when their total count over
                all sentences is strictly greater than this value.
        """
        # Guard against a falsy result from English_processing so that every
        # stored entry is a string and .strip().split() always works later.
        self.sentences = [English_processing(sentence) or "" for sentence in sentences]
        self.sentences_num = len(self.sentences)
        self.min_frequency = min_frequency

    # Get the tokenised sentences
    def get_cuted_sentences(self):
        """Return every stored sentence split on whitespace into tokens."""
        return [sentence.strip().split() for sentence in self.sentences]

    # Simple model required before building the more complex models
    def simple_model(self):
        """Build the dictionary and bag-of-words corpus used by every model.

        Sets ``self.texts``, ``self.dictionary`` and ``self.corpus_simple``.
        """
        self.texts = self.get_cuted_sentences()
        # Remove low-frequency words: count each token over all sentences and
        # keep only those occurring more than ``min_frequency`` times.
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [
            [token for token in text if frequency[token] > self.min_frequency]
            for text in self.texts
        ]
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    def _build_index(self, model_class, topic_space):
        """Fit ``model_class`` on the bag-of-words corpus and index the result.

        Args:
            model_class: gensim model class called as ``model_class(corpus)``.
            topic_space: True when the model outputs (topic id, weight) pairs,
                False when it outputs (token id, weight) pairs.
        """
        self.simple_model()
        # Transform the corpus with the fitted model
        self.model = model_class(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # Build the similarity matrix. The width is passed explicitly rather
        # than inferred from the largest id present in the corpus, so a query
        # vector holding an id that no indexed sentence produced still fits.
        if topic_space:
            num_features = self.model.num_topics
        else:
            num_features = len(self.dictionary)
        self.index = similarities.MatrixSimilarity(self.corpus, num_features=num_features)

    # TF-IDF model
    def TfidfModel(self):
        """Fit a TF-IDF model and build the similarity index."""
        self._build_index(models.TfidfModel, topic_space=False)

    # LSI model
    def LsiModel(self):
        """Fit an LSI model and build the similarity index."""
        self._build_index(models.LsiModel, topic_space=True)

    # LDA model
    def LdaModel(self):
        """Fit an LDA model and build the similarity index."""
        self._build_index(models.LdaModel, topic_space=True)

    # Preprocess a newly supplied (query) sentence
    def sentence2vec(self, sentence):
        """Convert a raw sentence into the fitted model's vector space.

        One of the model-building methods must have been called first, since
        this reads ``self.dictionary`` and ``self.model``.
        """
        sentence = English_processing(sentence) or ""
        vec_bow = self.dictionary.doc2bow(sentence.strip().split())
        return self.model[vec_bow]

    def bow2vec(self):
        """Expand every sparse vector in ``self.corpus`` into a dense array.

        Returns:
            A list with one 1-D numpy array per sentence; position ``i`` holds
            the weight stored for id ``i`` and all other positions are zero.
        """
        vec = []
        # Width is the largest dictionary id plus one.
        # NOTE(review): this sizing looks intended for the TF-IDF corpus; for
        # LSI/LDA the ids in the corpus are topic ids - confirm before use.
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for co in content:
                # Store the weight (e.g. TF-IDF) of each id present in the sentence
                sentence_vectors[co[0]] = co[1]
            vec.append(sentence_vectors)
        return vec

    # Find the most similar sentence
    # input: test sentence
    def similarity(self, sentence):
        """Return ``(index, score)`` of the stored sentence most similar to ``sentence``.

        Raises:
            ValueError: if the index holds no sentences.
        """
        sims = self.index[self.sentence2vec(sentence)]
        index, score = max(enumerate(sims), key=lambda item: item[1])
        return index, score  # index of the most similar sentence and its score

    # Find the k most similar sentences
    def similarity_k(self, sentence, k):
        """Return the indexes and scores of the ``k`` most similar sentences.

        Also prints how long the index lookup took and how many sentences
        were searched.

        Returns:
            ``(indexs, scores)``: two parallel lists ordered by descending
            similarity, each holding at most ``k`` entries.
        """
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.perf_counter()
        sims = self.index[sentence_vec]
        t2 = time.perf_counter()
        # The timer reports seconds; scale to match the "ms" label in the message.
        print('特征检索耗时:{:.4f}ms, 检索样本总数:{}'.format((t2 - t1) * 1000, self.sentences_num))
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [position for position, _ in sim_k]
        scores = [score for _, score in sim_k]
        return indexs, scores
5. Hands-on practice: 2021 Meixi (MCM/ICM) Problem C
Results announced and updated