HW04 - Sentence Classification Notes
1. Path and Warnings
path_prefix = './'
import warnings
warnings.filterwarnings('ignore')
2. Imports
import os
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
import argparse
# Gensim covers TF-IDF, LSA, LDA, word2vec, and more
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
3. Define Load Data and Evaluation Functions
def load_training_data(path='training_label.txt'):
    if 'training_label' in path:
        # assumes each labeled line looks like "1 +++$+++ some tweet text"
        with open(path) as f:
            lines = [line.strip('\n').split(' ') for line in f.readlines()]
        x = [line[2:] for line in lines]  # tokens after the label and separator
        y = [line[0] for line in lines]   # sentiment label
        return x, y
    else:  # training_nolabel: raw text only
        with open(path) as f:
            x = [line.strip('\n').split(' ') for line in f.readlines()]
        return x
def load_testing_data(path='testing_data'):
    # assumes a CSV-style file with a header row: id,text
    with open(path) as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(',')[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    return X
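The section header also promises an evaluation function, but the notes never show it. training() below calls evaluation(outputs, labels) and divides the result by batch_size, so here is a minimal sketch consistent with that usage (the 0.5 threshold mirrors the testing function; the exact body is an assumption):
def evaluation(outputs, labels):
    # threshold sigmoid outputs at 0.5, then count how many match the labels
    outputs[outputs >= 0.5] = 1
    outputs[outputs < 0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct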
4. Define train_word2vec for Word Embeddings
def train_word2vec(x):
    # Train a skip-gram word2vec model on the tokenized sentences.
    # vector_size=250 matches embedding_dim used when building the model below;
    # the other hyperparameters are typical choices, not fixed by these notes.
    model = Word2Vec(x, vector_size=250, window=5, min_count=5, workers=4, sg=1)
    return model
if __name__ == "__main__":
    # load training data
    train_x, y = load_training_data('training_label.txt')
    train_x_no_label = load_training_data('training_nolabel.txt')
    # load testing data
    test_x = load_testing_data('testing_data.txt')
    # train word2vec on all words (labeled + unlabeled + test)
    model = train_word2vec(train_x + train_x_no_label + test_x)
    # save the model
    model.save(os.path.join(path_prefix, 'w2v_all.model'))
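As a quick sanity check on the saved embeddings (not part of the assignment flow; 'happy' is an arbitrary probe word), the model can be reloaded and queried:
w2v = Word2Vec.load(os.path.join(path_prefix, 'w2v_all.model'))
print(w2v.wv.vector_size)            # 250, the embedding dimension chosen above
print(w2v.wv.most_similar('happy'))  # nearest neighbours of the probe word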
5. Data Preprocessing
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path=os.path.join(path_prefix, 'w2v_all.model')):
        self.w2v_path = w2v_path      # where the word2vec model is stored
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []            # list, e.g. self.idx2word[1] = 'Minovo'
        self.word2idx = {}            # dict, e.g. self.word2idx['Minovo'] = 1
        self.embedding_matrix = []    # list of embedding vectors; e.g. row
                                      # self.embedding_matrix[self.word2idx['Minovo']]
                                      # is the word2vec vector for 'Minovo'
    def get_w2v_model(self):
        # load the pretrained word2vec model
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        # add a special token (e.g. <PAD>, <UNK>) with a randomly initialized vector
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    def make_embedding(self, load=True):
        if load:
            self.get_w2v_model()  # load the trained word2vec embedding model
        else:
            raise NotImplementedError
        for i, word in enumerate(self.embedding.wv.index_to_key):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)  # convert embedding_matrix to a Tensor
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        return self.embedding_matrix
    def pad_sequence(self, sentence):
        # make every sentence the same length: truncate long ones,
        # pad short ones with the <PAD> index
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            sentence += [self.word2idx["<PAD>"]] * pad_len
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        # turn each word in every sentence into its index
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if word in self.word2idx.keys():
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # turn labels into a tensor
        y = [int(label) for label in y]
        return torch.LongTensor(y)
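Putting the class together, a small round-trip on two made-up sentences (a sketch only; section 10 runs the real pipeline on the loaded data):
preprocess = Preprocess([['i', 'love', 'this'], ['terrible']], sen_len=5)
embedding = preprocess.make_embedding(load=True)
x = preprocess.sentence_word2idx()
print(embedding.shape)  # (vocab_size + 2, 250): word2vec vocab plus <PAD> and <UNK>
print(x.shape)          # (2, 5): both sentences padded/truncated to sen_len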
6. Define Dataset
class TwitterDataset(data.Dataset):
def __init__(self, X, y):
self.data = X
self.label = y
    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx]
        return self.data[idx], self.label[idx]
def __len__(self):
return len(self.data)
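Wrapped in a DataLoader, the dataset yields (data, label) batches when labels are present and bare data batches otherwise. A one-batch sanity check (assumes train_x and y from the preprocessing step):
dataset = TwitterDataset(X=train_x, y=y)
loader = DataLoader(dataset, batch_size=4, shuffle=True)
inputs, labels = next(iter(loader))
print(inputs.shape, labels.shape)  # torch.Size([4, sen_len]) and torch.Size([4])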
7. Define LSTM
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # build the embedding layer from the pretrained word2vec matrix
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # freeze the embedding during training if fix_embedding is True
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_dim, 1), nn.Sigmoid())
def forward(self, inputs):
inputs = self.embedding(inputs)
x, _ = self.lstm(inputs, None)
        # feed the hidden state of the last time step into the classifier
x = x[:, -1, :]
x = self.classifier(x)
return x
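The tensor shapes through forward() make the x[:, -1, :] slice clearer (shapes only; batch_size is illustrative):
# inputs:                (batch_size, sen_len)                 word indices
# after self.embedding:  (batch_size, sen_len, embedding_dim)
# after self.lstm:       (batch_size, sen_len, hidden_dim)     one hidden state per time step
# x[:, -1, :]:           (batch_size, hidden_dim)              hidden state of the last time step
# after self.classifier: (batch_size, 1)                       sigmoid probability of the positive class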
8. Define Training and Validation Function
def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    # report the model's total and trainable parameter counts
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('total parameters: {}, trainable parameters: {}'.format(total, trainable))
    model.train()
    criterion = nn.BCELoss()  # binary cross-entropy loss
    t_batch = len(train)  # number of training batches
    v_batch = len(valid)  # number of validation batches
    # optimizer: Adam
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_acc = 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in enumerate(train):
inputs = inputs.to(device, dtype=torch.long)
labels = labels.to(device, dtype=torch.float)
            # zero out the gradients accumulated from the previous step
optimizer.zero_grad()
outputs = model(inputs)
outputs = outputs.squeeze()
# binary cross entropy loss
loss = criterion(outputs, labels)
# loss gradient
loss.backward()
            # update model parameters
            optimizer.step()
correct = evaluation(outputs, labels)
total_acc += (correct / batch_size)
total_loss += loss.item()
        # validation
        model.eval()
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                outputs = model(inputs)
                outputs = outputs.squeeze()
                loss = criterion(outputs, labels)
                correct = evaluation(outputs, labels)
                total_acc += (correct / batch_size)
                total_loss += loss.item()
            # keep the checkpoint with the best validation accuracy
            if total_acc > best_acc:
                best_acc = total_acc
                # checkpoint filename is an assumption; section 12 loads it back
                torch.save(model, os.path.join(model_dir, 'ckpt.model'))
        model.train()
9. Define Testing Function
def testing(batch_size, test_loader, model, device):
model.eval()
ret_output = []
with torch.no_grad():
for i, inputs in enumerate(test_loader):
inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
outputs = outputs.squeeze()
outputs[outputs >= 0.5] = 1
outputs[outputs < 0.5] = 0
ret_output += outputs.int().tolist()
    return ret_output
10. Run Preprocess() and training()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')
w2v_path = os.path.join(path_prefix, 'w2v_all.model')
...
sen_len = 30
fix_embedding = True
batch_size = 128
epoch = 5
lr = 0.001
model_dir = path_prefix
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)
preprocess = Preprocess(train_x, sen_len, w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)
11. Define Model
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device)
X_train, X_val, y_train, y_val = train_test_split(train_x, y, test_size=0.1, random_state=1, stratify=y)
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
...
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)
12. Testing and Predicting
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
# load the best checkpoint saved in training() (filename assumed in section 8)
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)
# assumed submission format: an id column plus the predicted label column
tmp = pd.DataFrame({'id': [str(i) for i in range(len(test_x))], 'label': outputs})
tmp.to_csv('predict.csv', index=False)
print('Finished Predicting')
Result: