HW04-Sentence Classification notes

1. Path and Warnings

path_prefix = './'
import warnings
warnings.filterwarnings('ignore')

2. Imports

import os
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset
import argparse
# Gensim provides TF-IDF, LSA, LDA, word2vec and more
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

3. Define Load Data and Evaluation Functions

def load_training_data(path='training_label.txt'):
	if 'training_label' in path:
		# labeled file: assumed format "label +++$+++ tokenized sentence" per line
		with open(path, 'r') as f:
			lines = [line.strip('\n').split(' ') for line in f.readlines()]
		x = [line[2:] for line in lines]
		y = [line[0] for line in lines]
		return x, y
	else: # training_nolabel: one tokenized sentence per line
		with open(path, 'r') as f:
			x = [line.strip('\n').split(' ') for line in f.readlines()]
		return x

def load_testing_data(path='testing_data.txt'):
	with open(path, 'r') as f:
		lines = f.readlines()
		# assumed csv-like format: a header line, then "id,text" per line
		X = ["".join(line.strip('\n').split(',')[1:]).strip().split(' ') for line in lines[1:]]
	return X
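
The section title also mentions an evaluation function, and training() below calls evaluation(outputs, labels); here is a minimal sketch, assuming outputs are sigmoid probabilities and labels are 0/1 float tensors:

def evaluation(outputs, labels):
	# threshold the sigmoid outputs at 0.5 and count correct predictions in the batch
	outputs[outputs >= 0.5] = 1
	outputs[outputs < 0.5] = 0
	correct = torch.sum(torch.eq(outputs, labels)).item()
	return correct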

4. Define Word2Vec Training (word embedding)

def train_word2vec(x):
	# vector_size=250 matches embedding_dim=250 used later when building the model;
	# the other hyperparameters here (window, min_count, sg, epochs) are reasonable defaults, not from the source
	model = Word2Vec(x, vector_size=250, window=5, min_count=5, workers=8, epochs=10, sg=1)
	return model
if __name__ == "__main__":
	# load training data
	train_x, y = load_training_data('training_label.txt')
	train_x_no_label = load_training_data('training_nolabel.txt')
	# load testing data
	test_x = load_testing_data('testing_data.txt')
	# train word2vec on all sentences (labeled + unlabeled + test)
	model = train_word2vec(train_x + train_x_no_label + test_x)
	# save model
	model.save(os.path.join(path_prefix, 'w2v_all.model'))
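
As a quick, optional sanity check on the saved model (the query word 'happy' is only an illustration; whether it is in the vocabulary depends on the corpus and min_count):

w2v = Word2Vec.load(os.path.join(path_prefix, 'w2v_all.model'))
print(w2v.wv.vector_size)                    # 250
print(w2v.wv.most_similar('happy', topn=3))  # nearest neighbours in the learned vector space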

5. Data Preprocess

class Preprocess():
	def __init__(self, sentences, sen_len, w2v_path=os.path.join(path_prefix, 'w2v_all.model')):
		self.w2v_path = w2v_path # word2vec model path
		self.sentences = sentences
		self.sen_len = sen_len
		self.idx2word = [] # list, e.g. self.idx2word[1] = 'Minovo'
		self.word2idx = {} # dict, e.g. self.word2idx['Minovo'] = 1
		self.embedding_matrix = [] # list of word vectors; self.embedding_matrix[self.word2idx['Minovo']] is the vector for 'Minovo'

	def get_w2v_model(self):
		self.embedding = Word2Vec.load(self.w2v_path)
		self.embedding_dim = self.embedding.vector_size
	
	def add_embedding(self, word):
		# add a randomly-initialised vector for a special token such as <PAD> or <UNK>
		vector = torch.empty(1, self.embedding_dim)
		torch.nn.init.uniform_(vector)
		self.word2idx[word] = len(self.word2idx)
		self.idx2word.append(word)
		self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
	
	def make_embedding(self, load=True):
		if load:
			self.get_w2v_model() # load the trained word2vec model
		else:
			raise NotImplementedError
		for i, word in enumerate(self.embedding.wv.index_to_key):
			self.word2idx[word] = len(self.word2idx)
			self.idx2word.append(word)
			self.embedding_matrix.append(self.embedding.wv[word])
		self.embedding_matrix = torch.tensor(self.embedding_matrix) # convert embedding_matrix to a Tensor
		self.add_embedding("<PAD>")
		self.add_embedding("<UNK>")
		return self.embedding_matrix
	
	def pad_sequence(self, sentence):
		# truncate long sentences and pad short ones with the <PAD> index so every sentence has length sen_len
		if len(sentence) > self.sen_len:
			sentence = sentence[:self.sen_len]
		else:
			pad_len = self.sen_len - len(sentence)
			for _ in range(pad_len):
				sentence.append(self.word2idx["<PAD>"])
		assert len(sentence) == self.sen_len
		return sentence
		
	def sentence_word2idx(self):
		# convert every word in every sentence to its index
		sentence_list = []
		for i, sen in enumerate(self.sentences):
			sentence_idx = []
			for word in sen:
				if word in self.word2idx.keys():
					sentence_idx.append(self.word2idx[word])
				else:
					sentence_idx.append(self.word2idx["<UNK>"])
			sentence_idx = self.pad_sequence(sentence_idx)
			sentence_list.append(sentence_idx)
		return torch.LongTensor(sentence_list)
	
	def labels_to_tensor(self, y):
		# convert labels to a LongTensor
		y = [int(label) for label in y]
		return torch.LongTensor(y)
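
A minimal usage sketch of the class (toy sentences; assumes w2v_all.model from step 4 exists under path_prefix):

sentences = [['today', 'is', 'a', 'good', 'day'], ['i', 'love', 'this', 'movie']]
preprocess = Preprocess(sentences, sen_len=30)
embedding = preprocess.make_embedding(load=True)  # (vocab_size + 2) x 250, the +2 rows are <PAD> and <UNK>
indices = preprocess.sentence_word2idx()          # LongTensor of shape (2, 30)
print(embedding.shape, indices.shape)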

6. Define Dataset

class TwitterDataset(data.Dataset):
	def __init__(self, X, y):
		self.data = X
		self.label = y
	
	def __getitem__(self, idx):
		if self.label is None:
			return self.data[idx]
		return self.data[idx], self.label[idx]

	def __len__(self):
		return len(self.data)
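
A quick check of the dataset together with a DataLoader (toy tensors, only to show the batch shapes):

ds = TwitterDataset(X=torch.zeros(4, 30, dtype=torch.long), y=torch.ones(4, dtype=torch.long))
loader = DataLoader(ds, batch_size=2, shuffle=False)
for xs, ys in loader:
	print(xs.shape, ys.shape)  # torch.Size([2, 30]) torch.Size([2])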

7. Define LSTM

class LSTM_Net(nn.Module):
	def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout, fix_embedding=True):
		super(LSTM_Net, self).__init__()
		# embedding layer initialised from the pretrained word2vec matrix
		self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
		self.embedding.weight = torch.nn.Parameter(embedding)
		self.embedding.weight.requires_grad = False if fix_embedding else True
		self.embedding_dim = embedding.size(1)
		self.hidden_dim = hidden_dim
		self.num_layers = num_layers
		self.dropout = dropout
		self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
		self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_dim, 1), nn.Sigmoid())
	
	def forward(self, inputs):
		inputs = self.embedding(inputs)
		x, _ = self.lstm(inputs, None)
		# take the LSTM output at the last time step and feed it to the classifier
		x = x[:, -1, :]
		x = self.classifier(x)
		return x 
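
A shape sanity check for the network, using a random toy embedding matrix (vocabulary of 10 words, dimension 250) rather than the real word2vec weights:

emb = torch.randn(10, 250)
net = LSTM_Net(emb, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5)
dummy = torch.randint(0, 10, (4, 30))  # a batch of 4 sentences, 30 token indices each
print(net(dummy).shape)                # torch.Size([4, 1]) before squeeze()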

8. Define Training and Validation Function

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
	# print the model's total and trainable parameter counts
	total = sum(p.numel() for p in model.parameters())
	trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
	print('total parameters: {}, trainable parameters: {}'.format(total, trainable))
	model.train()
	criterion = nn.BCELoss() # binary cross entropy loss
	t_batch = len(train) # number of training batches
	v_batch = len(valid) # number of validation batches
	# optimizer using Adam
	optimizer = optim.Adam(model.parameters(), lr=lr)
	best_acc = 0
	for epoch in range(n_epoch):
		total_loss, total_acc = 0, 0
		for i, (inputs, labels) in enumerate(train):
			inputs = inputs.to(device, dtype=torch.long)
			labels = labels.to(device, dtype=torch.float)
			# zero the gradients for every batch
			optimizer.zero_grad()
			outputs = model(inputs)
			outputs = outputs.squeeze()
			# binary cross entropy loss
			loss = criterion(outputs, labels)
			# back-propagate the loss
			loss.backward()
			# update model parameters
			optimizer.step()
			correct = evaluation(outputs, labels)
			total_acc += (correct / batch_size)
			total_loss += loss.item()
		print('Epoch {} | Train Loss: {:.5f} Acc: {:.3f}'.format(epoch + 1, total_loss / t_batch, total_acc / t_batch * 100))
		# validation
		model.eval()
		with torch.no_grad():
			total_loss, total_acc = 0, 0
			for i, (inputs, labels) in enumerate(valid):
				inputs = inputs.to(device, dtype=torch.long)
				labels = labels.to(device, dtype=torch.float)
				outputs = model(inputs)
				outputs = outputs.squeeze()
				loss = criterion(outputs, labels)
				correct = evaluation(outputs, labels)
				total_acc += (correct / batch_size)
				total_loss += loss.item()
			print('Valid | Loss: {:.5f} Acc: {:.3f}'.format(total_loss / v_batch, total_acc / v_batch * 100))
			if total_acc > best_acc:
				best_acc = total_acc
				# save the whole model whenever validation accuracy improves (checkpoint file name is an assumption)
				torch.save(model, os.path.join(model_dir, 'ckpt.model'))
		model.train()

9. Define Testing Function

def testing(batch_size, test_loader, model, device):
	model.eval()
	ret_output = []
	with torch.no_grad():
		for i, inputs in enumerate(test_loader):
			inputs = inputs.to(device, dtype=torch.long)
			outputs = model(inputs)
			outputs = outputs.squeeze()
			outputs[outputs >= 0.5] = 1
			outputs[outputs < 0.5] = 0
			ret_output += outputs.int().tolist()
	return ret_output

10. Run Preprocess() and training()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')
w2v_path = os.path.join(path_prefix, 'w2v_all.model')
...
sen_len = 30
fix_embedding = True
batch_size = 128
epoch = 5
lr = 0.001
model_dir = path_prefix
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)
preprocess = Preprocess(train_x, sen_len, w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

11. Define Model

model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device)
X_train, X_val, y_train, y_val = train_test_split(train_x, y, test_size=0.1, random_state=1, stratify=y)
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
...
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

12. Testing and Predicting

test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
model = torch.load(os.path.join(model_dir, 'ckpt.model')) # load the best checkpoint saved by training()
outputs = testing(batch_size, test_loader, model, device)
# two-column submission format (id, label) assumed
tmp = pd.DataFrame({"id": [str(i) for i in range(len(test_x))], "label": outputs})
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print('Finished Predicting')

Result: (result screenshots omitted)


Reprinted from: blog.csdn.net/minovophy/article/details/118784142