Movie Review Sentiment Classification with a Recurrent Neural Network (LSTM)

1. Dataset:

To consolidate word embedding, the most commonly used text-vectorization method, we will work through a text sentiment classification case.

We use the classic IMDB dataset (http://ai.stanford.edu/~amaas/data/sentiment/), which contains 50,000 reviews of popular movies, split into 25,000 training reviews and 25,000 test reviews. The data format is as follows:

In the figure below, the left side is the file name, which has two parts, the review serial number and the rating (ratings 1-4 are negative, 7-10 are positive), and the right side is the review text.

[Figure: sample of the review data, file names on the left and review text on the right]

However, this experiment is deliberately kept simple and only implements binary classification, i.e., predicting positive versus negative.
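
For reference, a file name such as 7_9.txt encodes the review id and the rating, so the binary label can be derived directly from it. A minimal sketch (the example path and the helper name label_from_filename are only illustrative):

import os

def label_from_filename(path):
    # File names look like "<id>_<rating>.txt", e.g. "7_9.txt"
    name = os.path.splitext(os.path.basename(path))[0]
    review_id, rating = name.split('_')
    return 'pos' if int(rating) >= 7 else 'neg'

print(label_from_filename("aclImdb/train/pos/7_9.txt"))  # -> pos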

2. Implementation process

  1. Prepare the dataset

  2. Build the model

  3. Train the model

  4. Evaluate the model

3. Dataset preparation

  • Dataset construction (DataSet)

import pickle

import torch
from torch.utils.data import DataLoader, Dataset
import os
import re

'''
Data: movie review dataset
Dataset preparation:
use the Word2Sequence (W2S) model to serialize the text into sequences of ids
'''
data_base_path = r"./aclImdb"

# Load the vocabulary (a Word2Sequence model generated and pickled in advance) used to serialize the text
ws = pickle.load(open("./models/ws.pkl", "rb"))
Max_Len = 40
train_batch_size = 512
test_batch_size = 1024


# Define the tokenize method
def tokenize(text):
    # Punctuation / control characters to strip out
    filters = ['!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
               '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n', '\x97', '\x96', '”', '“']
    text = re.sub("<.*?>", " ", text)  # remove HTML tags such as <br />
    text = re.sub("|".join(re.escape(c) for c in filters), " ", text)  # replace filter characters with spaces
    return [i.strip().lower() for i in text.split()]


# Prepare the Dataset
class ImdbDataset(Dataset):
    def __init__(self, train=True):
        super(ImdbDataset, self).__init__()
        self.train_data_path = os.path.join(data_base_path, 'train')
        self.test_data_path = os.path.join(data_base_path, 'test')
        self.data_path = self.train_data_path if train else self.test_data_path

        # Collect the paths of all review files
        self.temp_data_path = [os.path.join(self.data_path, 'pos'), os.path.join(self.data_path, 'neg')]
        self.total_file_path_list = []  # paths of all review files
        for path in self.temp_data_path:
            self.total_file_path_list.extend([os.path.join(path, j) for j in os.listdir(path) if j.endswith('.txt')])

    def __getitem__(self, index):
        # Path of the review file
        path = self.total_file_path_list[index]
        # The label comes from the parent directory name: neg -> 0, pos -> 1
        label_str = os.path.basename(os.path.dirname(path))
        label = 0 if label_str == 'neg' else 1
        content = tokenize(open(path, encoding='utf-8').read())
        return content, label

    def __len__(self):
        return len(self.total_file_path_list)


def collate_fn(batch):
    # batch is a list of tuples, each tuple being the result of __getitem__ in the dataset
    content, labels = list(zip(*batch))
    content = torch.LongTensor([ws.transform(i, max_len=Max_Len) for i in content])
    labels = torch.LongTensor(labels)
    # print(content, labels)  # uncomment to inspect a batch
    return content, labels


def get_dataloader(train=True, batch_size=train_batch_size):
    # 2. Instantiate the dataset and prepare the dataloader
    dataset = ImdbDataset(train)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    return dataloader


if __name__ == '__main__':
    # 3. Inspect the data output
    for idx, (sentence, label) in enumerate(get_dataloader()):
        print("idx:", idx)
        print("sentence:", sentence)
        print("label:", label)
        break
  • Text serialization
import numpy as np
'''
Text serialization: a class that maps text to sequences of integer ids
'''
class Word2Sequence():
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"

    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        # Whether fit has been performed
        self.fited = False
        # Whether the vocabulary has been built
        self.build_vocabd = False
        # Word frequency counts
        self.count = {}

    def to_index(self, word):
        """word -> index"""
        assert self.fited == True, "fit must be called first"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        """index -> word"""
        assert self.fited, "fit must be called first"
        if index in self.inversed_dict:
            return self.inversed_dict[index]
        return self.UNK_TAG

    def __len__(self):
        return len(self.dict)

    def fit(self, sentence):
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1
        self.fited = True

    def build_vocab(self, min_count=5, max_count=None, max_features=None):
        '''
        Build the vocabulary
        :param min_count: minimum number of occurrences a word must have
        :param max_count: maximum number of occurrences a word may have
        :param max_features: how many words (features) to keep in total
        :return:
        '''
        # Drop words whose count is below the minimum
        if min_count is not None:
            self.count = {k: v for k, v in self.count.items() if v >= min_count}
        # Drop words whose count exceeds the maximum
        if max_count is not None:
            self.count = {k: v for k, v in self.count.items() if v <= max_count}
        if max_features is not None:
            # Sort by frequency and keep the top max_features words
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        # Assign an id to each remaining word
        for word in self.count:
            self.dict[word] = len(self.dict)

        # Build the inverse mapping (id -> word)
        self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))
        self.build_vocabd = True

    def transform(self, sentence, max_len=None):
        """
        Convert a sentence into a list of ids (a vector)
        :param sentence: [word1, word2, word3, ...]
        :param max_len: length to which the vector is padded or clipped
        :return:
        """
        assert self.fited, "fit must be called first"
        assert self.build_vocabd, "build_vocab must be called first"
        if max_len is not None:
            if max_len > len(sentence):
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))  # pad
            else:
                sentence = sentence[:max_len]  # clip

        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """
        Convert a list of ids back into words
        :param indices: [1, 2, 3, ...]
        :return: [word1, word2, ...]
        """
        assert self.fited, "fit must be called first"
        assert self.build_vocabd, "build_vocab must be called first"
        return [self.inversed_dict.get(idx) for idx in indices]


if __name__ == '__main__':
    w2s = Word2Sequence()
    w2s.fit(["你", "好", "么"])
    w2s.fit(["你", "好", "哦"])
    w2s.build_vocab(min_count=1)
    print(w2s.dict)
    print(w2s.fited)
    print(w2s.transform(["你","好","嘛"]))
    print(w2s.transform(["你好嘛"],max_len=10))
    print(w2s.inverse_transform([5,2,4]))
    print(len(w2s))
  • Generate a serialized model

Omitted in the original; a possible sketch of this step is given below.
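
The DataSet code above expects a pickled vocabulary at ./models/ws.pkl, so the generation step might look roughly like this. It assumes the Word2Sequence class above lives in Word2Sequence.py and that tokenize is importable from a module that does not itself load ws.pkl (importing it from DataSet would fail, since DataSet unpickles ws.pkl at import time); the module name lib and the thresholds below are assumptions.

import os
import pickle
from tqdm import tqdm

from Word2Sequence import Word2Sequence  # the class defined above
from lib import tokenize                 # assumed home of tokenize (not DataSet, which already needs ws.pkl)

data_base_path = r"./aclImdb"

if __name__ == '__main__':
    ws = Word2Sequence()
    # Count word frequencies over every training review (pos and neg folders)
    for folder in ('pos', 'neg'):
        dir_path = os.path.join(data_base_path, 'train', folder)
        for file_name in tqdm(os.listdir(dir_path), desc=folder):
            if file_name.endswith('.txt'):
                with open(os.path.join(dir_path, file_name), encoding='utf-8') as f:
                    ws.fit(tokenize(f.read()))
    # Keep reasonably frequent words; the thresholds here are illustrative
    ws.build_vocab(min_count=10, max_features=20000)
    os.makedirs('./models', exist_ok=True)
    with open('./models/ws.pkl', 'wb') as f:
        pickle.dump(ws, f)
    print("vocabulary size:", len(ws))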

4. Model construction

import os
import torch
import numpy as np
from torch import nn, optim
from DataSet import get_dataloader, ws, Max_Len, test_batch_size
import torch.nn.functional as F
from tqdm import tqdm

'''
IMDB movie review sentiment analysis (pos, neg) -- improved version
Uses a bidirectional LSTM and takes the features of the last time step as input to the fully connected layers,
i.e.: text -> ids -> vectors -> LSTM [last time step] -> 2 fully connected layers -> softmax
'''


class IMDBLstmmodel(nn.Module):
    def __init__(self):
        super(IMDBLstmmodel, self).__init__()
        # The following are hyperparameters and can be changed freely
        self.hidden_size = 64         # number of LSTM units per layer
        self.embedding_dim = 200      # length of each word vector
        self.num_layer = 2            # number of LSTM layers
        self.bidirectional = True     # whether to use a bidirectional LSTM
        self.bi_num = 2 if self.bidirectional else 1  # 2 if bidirectional, otherwise 1
        self.dropout = 0.5
        self.embedding = nn.Embedding(len(ws), self.embedding_dim, padding_idx=ws.PAD)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_size, num_layers=self.num_layer,
                            bidirectional=self.bidirectional, dropout=self.dropout)

        # Two fully connected layers with a relu activation in between
        self.fc = nn.Linear(self.hidden_size * self.bi_num, 20)
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):  # input x has shape [batch_size, sequence_len]
        # after the embedding, x has shape [batch_size, sequence_len, embedding_dim]
        x = self.embedding(x)
        # swap axes so that x has shape [sequence_len, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        # x: [sequence_len, batch_size, hidden_size * bi_num]; h_n: [num_layer * bi_num, batch, hidden_size]; c_n same as h_n
        x, (h_n, c_n) = self.lstm(x)
        # Keep only the last LSTM step: concatenate the last forward state and the last backward state,
        # giving shape [batch_size, hidden_size * bi_num]
        out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1)
        out = self.fc(out)
        out = F.relu(out)
        out = self.fc2(out)
        return F.log_softmax(out, dim=-1)
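
A quick sanity check of the output shape (illustrative only; the batch of 8 "reviews" below is random dummy token ids, not real data):

# Shape check with a fake batch: 8 "reviews" of Max_Len random token ids
model = IMDBLstmmodel()
dummy = torch.randint(0, len(ws), (8, Max_Len))
out = model(dummy)
print(out.shape)              # torch.Size([8, 2]) -- one log-probability pair per review
print(out.exp().sum(dim=-1))  # exp of log_softmax gives probabilities that sum to 1 per row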

5. Model training

# Instantiate the model
model = IMDBLstmmodel()
# Instantiate the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
if os.path.exists('./models/lstm_model.pkl'):  # resume from a previously trained model if one exists
    model.load_state_dict(torch.load('./models/lstm_model.pkl'))
    optimizer.load_state_dict(torch.load('./models/lstm_optimizer.pkl'))


# Define the training function
def train(epoch):
    data_loader = get_dataloader()
    for idx, (input, label) in tqdm(enumerate(data_loader), total=len(data_loader), ascii=True, desc='training epoch %d' % epoch):
        # Zero the gradients
        optimizer.zero_grad()
        # Predict with the model
        output = model(input)
        # Compute the loss
        loss = F.nll_loss(output, label)
        # Back-propagate the error
        loss.backward()
        # Update the parameters
        optimizer.step()
        if idx == len(data_loader) - 1:
            print('result: epoch %d, loss %f' % (epoch, loss.item()))
            torch.save(model.state_dict(), "./models/lstm_model.pkl")          # save the model
            torch.save(optimizer.state_dict(), './models/lstm_optimizer.pkl')  # save the optimizer
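
To actually run the training, a short driver loop is enough (the choice of 10 epochs is arbitrary and only for illustration):

# Train for a few epochs; 10 is an arbitrary illustrative choice
for epoch in range(10):
    train(epoch)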

Training output (the model here had already been trained in advance, so the loss is already very low):

[Figure: training progress output showing the per-epoch loss]

6. Model evaluation


# Model evaluation
def test():
    model.eval()
    loss_ = []
    acc_ = []
    with torch.no_grad():
        data_loader = get_dataloader(train=False, batch_size=test_batch_size)
        for idx, (input, label) in tqdm(enumerate(data_loader), total=len(data_loader), ascii=True, desc='evaluating'):
            output = model(input)
            loss = F.nll_loss(output, label, reduction="mean")
            loss_.append(loss.item())
            pred = output.max(dim=1)[1]  # [batch_size]
            acc_.append(pred.eq(label).float().mean().item())  # average accuracy of this batch
    print('model loss %f, average accuracy %f' % (np.mean(loss_), np.mean(acc_)))

The accuracy rate is above 99%
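
As a final illustration, the trained model can score a single raw review end to end with the pieces above (the review text and the helper name predict_sentiment are made up; tokenize is imported from the DataSet module shown earlier):

from DataSet import tokenize  # ws, Max_Len and get_dataloader were already imported from DataSet above

def predict_sentiment(text):
    # text -> tokens -> ids padded/clipped to Max_Len -> [1, Max_Len] tensor -> log-probabilities
    ids = ws.transform(tokenize(text), max_len=Max_Len)
    model.eval()
    with torch.no_grad():
        log_probs = model(torch.LongTensor([ids]))
    return 'pos' if log_probs.argmax(dim=-1).item() == 1 else 'neg'

print(predict_sentiment("This movie was a wonderful surprise, I loved every minute of it."))  # made-up review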


Origin blog.csdn.net/m0_49501453/article/details/124543694