NLP English text analysis essentials: a reusable English analysis template, plus a hands-on walkthrough of 2021 MCM Problem C (data provided)

NLP English data analysis

1. Complete English preprocessing code

# English sentence preprocessing module
from nltk.corpus import stopwords as pw
import sys 
import re
# English stop words from NLTK's stopwords corpus, fetched once at import time;
# English_processing filters tokens against this list.
cacheStopWords=pw.words("english")

# Characters replaced by a space before tokenising: curly quotes, common
# punctuation, brackets, digits and a few symbols.  The backslash is written
# as "\\" so the literal contains no invalid escape sequences.
_STRIP_CHARS = "“”!?.\\;',()<>{}/-1234567890$&#%~"
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, " " * len(_STRIP_CHARS))
# Extra tokens that are always dropped in addition to the stop words
# ('br' is presumably residue of <br> tags once "<", ">" and "/" are stripped).
_EXTRA_NOISE_WORDS = frozenset({'br', 'w', 'b', 'bc'})


def English_processing(sentence):
    """Normalise an English sentence for bag-of-words analysis.

    The text is lower-cased, every character in ``_STRIP_CHARS`` is replaced
    by a space, and the remaining whitespace-separated tokens are filtered
    against ``cacheStopWords`` and ``_EXTRA_NOISE_WORDS``.

    Args:
        sentence: The raw text. Any falsy value (``""``, ``None``) is treated
            as empty input.

    Returns:
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space). An empty string is returned for
        empty input, so callers can always call ``str`` methods on the result.
    """
    if not sentence:
        # Return a string rather than None: callers call .strip().split()
        # on the result without checking it.
        return ""

    # A set gives O(1) membership tests; both filters are applied in one pass.
    ignored = set(cacheStopWords) | _EXTRA_NOISE_WORDS

    # One translate() pass replaces the chain of per-character replace() calls.
    cleaned = sentence.lower().translate(_STRIP_TABLE)

    return "".join(word + " " for word in cleaned.split() if word not in ignored)

2. Word frequency statistics + word cloud analysis

Word frequency statistics

def concat_sentence(sen_list):
    """Join all sentences into one text.

    Each item is converted with ``str()`` and prefixed with a single space,
    so a non-empty result starts with a space and an empty input gives ``""``.
    """
    return "".join(" " + str(item) for item in sen_list)

def compute_word_fre(sentence):
    """Preprocess an English text and count how often each word occurs.

    The text is cleaned with ``English_processing`` and split on whitespace.

    Returns:
        A dict mapping each remaining word to its count, in order of first
        appearance, or ``None`` when ``sentence`` is empty/falsy.
    """
    if not sentence:
        return None

    counts = {}
    for token in English_processing(sentence).split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts
    
def output(word_fre):
    """Return the word-frequency dict as a list sorted by descending count.

    Args:
        word_fre: Mapping of word -> count.

    Returns:
        A list of ``(word, count)`` pairs, highest count first (ties keep
        their original order), or ``None`` when ``word_fre`` is empty/falsy.
    """
    if not word_fre:
        return None

    pairs = list(word_fre.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

Word cloud analysis

`sentences_list` is a list of sentences. Pass it to
`concat_sentence` to join the sentences into a single article, and then count the word frequencies.
If your input is already a single article, you can skip the first line of code.

# Join the sentences into one text, then preprocess it and count word frequencies.
all_sen = concat_sentence(sentences_list) # NOTE: only the input (sentences_list) needs to be adjusted
word_fre = compute_word_fre(all_sen)

import matplotlib.pyplot as plt
from imageio import imread,imsave
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Build the word cloud from the word-frequency dict: white background, the image
# 'cloud.png' (read as RGB) used as the mask, at most 30 words, and a fixed
# random_state.
wordcloud = WordCloud(background_color='white',collocations=False,mask=imread('cloud.png',pilmode="RGB"),
    max_words=30,random_state=2021,width=1200, height=800).fit_words(word_fre)

# Draw the word cloud and save it to "wordcloud.png"
plt.imshow(wordcloud, interpolation='bilinear')
wordcloud.to_file("wordcloud.png")
#plt.savefig("other_wordcloud.png",dpi=600) # alternative way to save the figure

3. Sentiment analysis

NLTK

NLTK (Natural Language Toolkit) is a Python library commonly used in NLP research. It was developed in Python by Steven Bird and Edward Loper of the University of Pennsylvania and comprises more than one hundred thousand lines of code so far. It is an open-source project that includes datasets, Python modules, tutorials, and more.

Sentiment analysis in action

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One analyzer instance is reused for every sentence.
sia = SentimentIntensityAnalyzer()


sentences = ['Hello, world. I am terrible']
for sentence in sentences:
    print(sentence)
    # polarity_scores returns a dict of named scores for the sentence.
    point = sia.polarity_scores(sentence)
    print(point)
    # Print the scores in key order on a single line.  This loop lives inside
    # the sentence loop so every sentence gets its own summary (previously
    # only the last sentence was summarised, and an empty list raised
    # NameError because `point` was never assigned).
    for k in sorted(point):
        print('{0}: {1}, '.format(k, point[k]), end='')
    print()  # terminate the summary line

4. Similarity analysis (LDA, LSI, Tfidf)

Complete code

# English sentence preprocessing module
from nltk.corpus import stopwords as pw
import sys 
import re
# English stop words from NLTK's stopwords corpus, fetched once at import time;
# English_processing filters tokens against this list.
cacheStopWords=pw.words("english")

# Characters replaced by a space before tokenising: curly quotes, common
# punctuation, brackets, digits and a few symbols.  The backslash is written
# as "\\" so the literal contains no invalid escape sequences.
_STRIP_CHARS = "“”!?.\\;',()<>{}/-1234567890$&#%~"
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, " " * len(_STRIP_CHARS))
# Extra tokens that are always dropped in addition to the stop words
# ('br' is presumably residue of <br> tags once "<", ">" and "/" are stripped).
_EXTRA_NOISE_WORDS = frozenset({'br', 'w', 'b', 'bc'})


def English_processing(sentence):
    """Normalise an English sentence for bag-of-words analysis.

    The text is lower-cased, every character in ``_STRIP_CHARS`` is replaced
    by a space, and the remaining whitespace-separated tokens are filtered
    against ``cacheStopWords`` and ``_EXTRA_NOISE_WORDS``.

    Args:
        sentence: The raw text. Any falsy value (``""``, ``None``) is treated
            as empty input.

    Returns:
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space). An empty string is returned for
        empty input, so callers can always call ``str`` methods on the result.
    """
    if not sentence:
        # Return a string rather than None: callers call .strip().split()
        # on the result without checking it.
        return ""

    # A set gives O(1) membership tests; both filters are applied in one pass.
    ignored = set(cacheStopWords) | _EXTRA_NOISE_WORDS

    # One translate() pass replaces the chain of per-character replace() calls.
    cleaned = sentence.lower().translate(_STRIP_TABLE)

    return "".join(word + " " for word in cleaned.split() if word not in ignored)
    
def concat_sentence(sen_list):
    """Concatenate every sentence into a single string.

    Items are stringified with ``str()``; each one is preceded by one space,
    so the result begins with a space unless ``sen_list`` is empty.
    """
    pieces = [" " + text for text in map(str, sen_list)]
    return "".join(pieces)


import gc
import tqdm
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
import time


class SentenceSimilarity():
    """Similarity search over a fixed collection of English sentences.

    Typical use: construct with the sentences, call exactly one of
    ``TfidfModel``, ``LsiModel`` or ``LdaModel`` to build ``self.model`` and
    ``self.index``, then query with ``similarity`` or ``similarity_k``.
    """

    def __init__(self, sentences, min_frequency=1):
        """Preprocess and store the sentences.

        Args:
            sentences: Iterable of raw sentences; each one is cleaned with
                ``English_processing``.
            min_frequency: Tokens whose total count over all sentences is not
                strictly greater than this value are dropped by
                ``simple_model``.
        """
        # `or ""` keeps every entry a string even if preprocessing yields
        # None for an empty sentence, so later .split() calls cannot fail.
        self.sentences = [English_processing(raw) or "" for raw in sentences]
        self.sentences_num = len(self.sentences)

        self.min_frequency = min_frequency

    def get_cuted_sentences(self):
        """Return the stored sentences as lists of whitespace-separated tokens."""
        return [sentence.strip().split() for sentence in self.sentences]

    def simple_model(self):
        """Build the bag-of-words base that the other models are fitted on.

        Sets ``self.texts`` (token lists without low-frequency tokens),
        ``self.dictionary`` and ``self.corpus_simple`` (one ``doc2bow``
        result per sentence).
        """
        self.texts = self.get_cuted_sentences()

        # Drop low-frequency tokens: count occurrences across all sentences.
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [
            [token for token in text if frequency[token] > self.min_frequency]
            for text in self.texts
        ]
        self.dictionary = corpora.Dictionary(self.texts)

        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    def _fit(self, model_factory):
        """Fit ``model_factory`` on the bag-of-words corpus and index the result.

        Shared by the three public model builders; sets ``self.model``,
        ``self.corpus`` and ``self.index``.
        """
        self.simple_model()

        # Fit the model and transform the corpus with it.
        self.model = model_factory(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        # Build the similarity matrix used for queries.
        self.index = similarities.MatrixSimilarity(self.corpus)

    def TfidfModel(self):
        """Build a TF-IDF model and its similarity index."""
        self._fit(models.TfidfModel)

    def LsiModel(self):
        """Build an LSI model and its similarity index."""
        self._fit(models.LsiModel)

    def LdaModel(self):
        """Build an LDA model and its similarity index."""
        self._fit(models.LdaModel)

    def sentence2vec(self, sentence):
        """Preprocess a query sentence and transform it with the fitted model.

        One of the model builders must have been called first, because this
        reads ``self.dictionary`` and ``self.model``.
        """
        # `or ""` tolerates an empty query whose preprocessing yields None.
        tokens = (English_processing(sentence) or "").strip().split()
        vec_bow = self.dictionary.doc2bow(tokens)
        return self.model[vec_bow]

    def bow2vec(self):
        """Return one dense numpy vector per entry of ``self.corpus``.

        Each vector has length ``max(self.dictionary) + 1`` and holds the
        weight of every ``(id, weight)`` pair at position ``id``.
        """
        # NOTE(review): the length comes from the dictionary, which matches
        # the TF-IDF corpus; for LSI/LDA the ids in self.corpus are presumably
        # topic ids, so confirm the sizing before using it with those models.
        length = max(self.dictionary) + 1
        vec = []
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for feature_id, weight in content:
                # Place the weight of each feature present in the sentence.
                sentence_vectors[feature_id] = weight
            vec.append(sentence_vectors)
        return vec

    def similarity(self, sentence):
        """Find the stored sentence most similar to ``sentence``.

        Returns:
            ``(index, score)``: the position of the best match in the stored
            sentences and its similarity score.
        """
        sims = self.index[self.sentence2vec(sentence)]
        index, score = max(enumerate(sims), key=lambda item: item[1])
        return index, score

    def similarity_k(self, sentence, k):
        """Find the ``k`` stored sentences most similar to ``sentence``.

        Prints how long the index lookup took, then returns
        ``(indexs, scores)``: two parallel lists ordered from most to least
        similar.
        """
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.perf_counter()
        sims = self.index[sentence_vec]
        t2 = time.perf_counter()
        # The message is labelled "ms", so convert the elapsed seconds to
        # milliseconds before printing.
        print('特征检索耗时：{:.4f}ms, 检索样本总数：{}'.format((t2 - t1) * 1000, self.sentences_num))
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]

        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores

5. Hands-on practice: 2021 MCM Problem C

Results announced and updated