电影评论数据集
百度网盘地址
https://pan.baidu.com/s/15ReMZi0gGo0MA5pn-1h3LQ
提取码：qknb
字典参考

1、基于词袋模型的逻辑回归情感分类

# -*- coding: UTF-8 -*-
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import itertools
import jieba
import os
# Directory that contains this script; the stop-word path below is resolved
# relative to it rather than to the current working directory.
cur_dir = os.path.dirname(os.path.abspath(__file__))
print(cur_dir)

# Hyperparameters
stopwords_path = os.path.join(cur_dir, '../testdata/chineseStopWords.txt')  # stop-word dictionary path
# Load the stop words, one per line. The context manager guarantees the file
# handle is closed as soon as the list has been built.
with open(stopwords_path, encoding="utf-8") as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

###########################词袋模型特征############################################
#重组为新的句子
def clean_text(text):
    """
    Keep only the Chinese characters of a comment, segment them into words,
    drop stop words and re-join the remaining words into one string.

    Every character outside U+4E00..U+9FA5 (markup, punctuation, digits,
    Latin letters, whitespace) is discarded before segmentation.

    :param text: raw comment text; non-string values (for example the NaN
        pandas produces for an empty CSV cell) are treated as empty text
    :return: the remaining words joined by single spaces, '' when nothing is left
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on non-strings such as NaN.
        return ''
    chinese_only = ''.join(re.findall('[\u4e00-\u9fa5]', text))
    words = jieba.lcut(chinese_only, cut_all=False)
    return ' '.join(w for w in words if w not in stopwords)

#混淆矩阵
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Draw the confusion matrix ``cm`` as a colour-mapped image on the current
    matplotlib figure and write every cell's value on top of it.

    :param cm: 2-D array indexed as cm[true label, predicted label]
    :param classes: tick labels used on both axes
    :param title: title of the plot
    :param cmap: matplotlib colormap used for the image
    :return: None; the caller is responsible for ``plt.show()``
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest value get white text, the rest black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            text_color = "white" if cm[row, col] > half_max else "black"
            # x is the column (predicted), y is the row (true).
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

if __name__=='__main__':
    # Load the data. The path is anchored at the script directory, exactly
    # like the stop-word file, so the script does not depend on the current
    # working directory.
    ratings_path = os.path.join(cur_dir, '../testdata/ratings.csv')
    df = pd.read_csv(ratings_path, sep=',', escapechar='\\')
    print(df.head(5))
    # Clean every entry of the comment column.
    df['clean_comment'] = df.comment.apply(clean_text)
    print(df['clean_comment'])
    # Extract bag-of-words features with sklearn's CountVectorizer; only the
    # 50 most frequent terms are kept and the sparse result is densified.
    vectorizer = CountVectorizer(max_features=50)
    train_data_features = vectorizer.fit_transform(df.clean_comment).toarray()
    print(train_data_features)

    # Split into 80% training and 20% test data (fixed seed for repeatability).
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.rating, test_size=0.2, random_state=0)
    print(X_train, X_test, y_train, y_test)
    # Train the classifier and predict the held-out part.
    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    print(y_pred)
    print(y_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    print(cnf_matrix)

    # Recall of the class at index 1: correct predictions of that class over
    # the first two cells of its row.
    # NOTE(review): only a true recall when `rating` has exactly two classes - confirm.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))

    # Accuracy = correctly classified samples (the diagonal) over all samples.
    # Identical to the explicit four-cell formula for a 2x2 matrix and still
    # correct when there are more than two classes.
    print("accuracy metric in the testing dataset: ", np.trace(cnf_matrix) / cnf_matrix.sum())

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.show()

2、基于word2vec词向量模型的逻辑回归情感分类

# -*- coding: UTF-8 -*-
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
import warnings
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import itertools
warnings.filterwarnings("ignore")
import jieba
import os
# Directory that contains this script; the stop-word path below is resolved
# relative to it rather than to the current working directory.
cur_dir = os.path.dirname(os.path.abspath(__file__))
print(cur_dir)

# Hyperparameters
stopwords_path = os.path.join(cur_dir, '../testdata/chineseStopWords.txt')  # stop-word dictionary path
# Load the stop words, one per line. The context manager guarantees the file
# handle is closed as soon as the list has been built.
with open(stopwords_path, encoding="utf-8") as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

def clean_text(text, remove_stopwords=False):
    """
    Keep only the Chinese characters of ``text``, segment them into words
    with jieba, drop stop words and re-join the rest into one string.

    :param text: raw text; non-string values (for example the NaN pandas
        produces for an empty CSV cell) are treated as empty text
    :param remove_stopwords: accepted for call compatibility only; stop
        words are removed regardless of its value
    :return: the remaining words joined by single spaces, '' when nothing is left
    """
    if not isinstance(text, str):
        # re.findall raises TypeError on non-strings such as NaN.
        return ''
    chinese_only = ''.join(re.findall('[\u4e00-\u9fa5]', text))
    words = jieba.lcut(chinese_only, cut_all=False)
    return ' '.join(w for w in words if w not in stopwords)

def split_sentences(review):
    """
    Split a review into sentences with the module-level ``tokenizer`` and
    clean each non-empty sentence with ``clean_text``.

    :param review: the review; it is converted with ``str`` and stripped first
    :return: list with one cleaned string per non-empty sentence
    """
    stripped = str(review).strip()
    cleaned = []
    for raw_sentence in tokenizer.tokenize(stripped):
        if not raw_sentence:
            continue
        cleaned.append(clean_text(raw_sentence))
    return cleaned

def to_review_vector(review, num_features=300):
    """
    Turn one review into a fixed-length vector by averaging the word2vec
    vectors of those of its words that the module-level ``model`` knows.

    :param review: raw review text; it is cleaned with ``clean_text`` first
    :param num_features: dimensionality of the word vectors; has to match
        the size the model was trained with
    :return: pandas Series of length ``num_features``; all zeros when none
        of the review's words is in the model's vocabulary
    """
    # clean_text returns one space-separated string, so split it back into
    # words; iterating over the string itself would visit single characters
    # (and the separating spaces) instead of the segmented words.
    words = clean_text(review, remove_stopwords=True).split()
    # Look words up through the model's KeyedVectors (`model.wv`) when it has
    # them; fall back to the object itself otherwise.
    vectors = getattr(model, 'wv', model)

    total = np.zeros(num_features)
    found = 0
    for word in words:
        if word in vectors:
            total += vectors[word]
            found += 1
    if found:
        # Divide by the number of contributing words so the result is a real
        # average and does not grow with the length of the review.
        total /= found
    return pd.Series(total)

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    Render the confusion matrix ``cm`` as an image on the current matplotlib
    figure and annotate each cell with its value.

    :param cm: 2-D array indexed as cm[true label, predicted label]
    :param classes: tick labels used on both axes
    :param title: title of the plot
    :param cmap: matplotlib colormap used for the image
    :return: None; the caller is responsible for ``plt.show()``
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=0)
    plt.yticks(ticks, classes)

    # Text colour switches at half of the largest cell value.
    cutoff = cm.max() / 2.
    for r, c in np.ndindex(cm.shape[0], cm.shape[1]):
        if cm[r, c] > cutoff:
            colour = "white"
        else:
            colour = "black"
        # x is the column (predicted), y is the row (true).
        plt.text(c, r, cm[r, c], horizontalalignment="center", color=colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

if __name__ == '__main__':
    # Load the data. The path is anchored at the script directory, exactly
    # like the stop-word file, so the script does not depend on the current
    # working directory.
    ratings_path = os.path.join(cur_dir, '../testdata/ratings.csv')
    df = pd.read_csv(ratings_path, sep=',', escapechar='\\')
    # Clean every entry of the comment column.
    df['clean_review'] = df.comment.apply(clean_text)
    review_part = df['clean_review']
    # Sentence splitting with nltk's punkt tokenizer; split_sentences reads
    # this module-level `tokenizer`.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Flatten the per-review sentence lists in linear time; sum(..., [])
    # would copy the growing result list on every addition.
    sentences = list(itertools.chain.from_iterable(review_part.apply(split_sentences)))
    sentences_list = [nltk.word_tokenize(str(line).strip()) for line in sentences]

    # word2vec
    num_features = 300  # Word vector dimensionality
    min_word_count = 40  # Minimum word count
    num_workers = 4  # Number of threads to run in parallel
    context = 10  # Context window size
    model = Word2Vec(sentences_list, workers=num_workers, size=num_features, min_count=min_word_count, window=context)
    model.init_sims(replace=True)
    model.save('word2vec.models')

    # Build one feature vector per comment. The text column of ratings.csv
    # is `comment` (the same column cleaned above).
    train_data_features = df.comment.apply(to_review_vector)

    # NOTE(review): labels are taken from `rating`, the label column the
    # bag-of-words script uses for the same CSV - confirm against the data.
    X_train, X_test, y_train, y_test = train_test_split(
        train_data_features, df.rating, test_size=0.2, random_state=0)

    LR_model = LogisticRegression()
    LR_model = LR_model.fit(X_train, y_train)
    y_pred = LR_model.predict(X_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)

    # Recall of the class at index 1: correct predictions of that class over
    # the first two cells of its row.
    # NOTE(review): only a true recall when the label has exactly two classes - confirm.
    print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    # Accuracy = correctly classified samples (the diagonal) over all samples.
    # Identical to the explicit four-cell formula for a 2x2 matrix and still
    # correct when there are more than two classes.
    print("accuracy metric in the testing dataset: ", np.trace(cnf_matrix) / cnf_matrix.sum())

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
    plt.show()

NLP学习（十一）-NLP实战之电影评分数据的情感分析-Python3

1、基于词袋模型的逻辑回归情感分类

2、基于word2vec词向量模型的逻辑回归情感分类

猜你喜欢