Coggle 30 Days of ML (July 23) Task Ten: Use Bert to complete pre-training in the competition data set
Task 10: Use Bert to complete pre-training in the competition data set
- Description: In this task, you will use the Bert model to complete pre-training on the competition dataset, and extract text features through the pre-trained Bert model.
- Practice steps:
- Prepare the competition dataset and corresponding pre-training parameters.
- Use the BERT model in the `transformers` library to load the pre-trained parameters.
- Use the Bert model to pre-train the competition dataset and extract text features.
Load and train the model
As introduced in Task 9, the `transformers` library already provides pre-trained parameters for BERT models, so we can load them directly.
Specific model selection can refer to: https://huggingface.co/transformers/v3.0.2/pretrained_models.html?highlight=pretrained
# Load the pre-trained BERT tokenizer and sequence-classification model.
# `device` is resolved here so that `model.to(device)` below does not hit an
# undefined name when this snippet runs before the training section.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)
define dataset
Next, use `tokenizer` to extract features from the data. The dataset class below implements the feature extraction and finally returns `input_ids`, `attention_mask`, and `label`.
# Custom dataset that tokenizes raw text on demand.
class CustomDataset(Dataset):
    """Map-style dataset yielding tokenized text together with its label.

    Args:
        texts: Texts to encode; a pandas Series or any indexable sequence.
        labels: Labels aligned with ``texts``; a pandas Series or any
            indexable sequence (e.g. a list or NumPy array).
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer here).
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the ``idx``-th element of ``values`` by position."""
        # pandas objects need `.iloc` for positional access (their `[]` is
        # label-based); plain sequences and arrays are indexed directly.
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``.

        The tensors are left on the CPU: the consumer moves whole batches to
        the target device, so the dataset does not depend on a global
        ``device``. With ``return_tensors='pt'`` the tokenizer output keeps a
        leading dimension of size 1, which callers squeeze away after
        batching.
        """
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )
        return inputs['input_ids'], inputs['attention_mask'], torch.tensor(label)
Define training and validation functions
Define the training and validation functions. The training function sets the model to training mode and uses the AdamW optimizer to update the model parameters. The validation function sets the model to evaluation mode and calculates the loss and accuracy on the validation dataset.
# Training and validation functions.
def train(model, train_loader, optimizer=None):
    """Run one training epoch over ``train_loader``.

    Args:
        model: Model to train. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` and must return an object
            exposing ``.loss``.
        train_loader: DataLoader yielding
            ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optional optimizer to reuse across calls so its state
            survives between epochs. When omitted, a fresh AdamW with
            ``lr=1e-5`` is created for this call.

    Returns:
        None
    """
    model.train()
    if optimizer is None:
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    for batch in tqdm(train_loader, desc='Training'):
        input_ids, attention_mask, labels = batch
        # Batches arrive as (batch, 1, seq_len) because each sample keeps the
        # tokenizer's leading singleton axis. Squeeze only axis 1: a bare
        # squeeze() would also drop the batch axis for a one-sample batch.
        input_ids = input_ids.squeeze(1).to(device)
        attention_mask = attention_mask.squeeze(1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
def evaluate(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        model: Model to evaluate. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` and must return an object
            exposing ``.loss`` and ``.logits``.
        val_loader: DataLoader yielding
            ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        val_loss: Sum of the per-batch losses over the validation set
            (not averaged).
        accuracy: Fraction of correctly classified samples; ``0.0`` when
            the loader yields no samples.
    """
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids, attention_mask, labels = batch
            # Squeeze only axis 1 (the tokenizer's singleton axis); a bare
            # squeeze() would also drop the batch axis for a one-sample batch.
            input_ids = input_ids.squeeze(1).to(device)
            attention_mask = attention_mask.squeeze(1).to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            val_loss += outputs.loss.item()

            # The predicted class is the index of the largest logit.
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty validation loader.
    accuracy = correct / total if total else 0.0
    return val_loss, accuracy
Train and validate the model
Use the BERT model to train on the competition dataset: extract text features, train for 10 epochs, validate the model after each epoch, and save the model with the highest accuracy.
# Training and validation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the selected device (GPU when available).
model.to(device)
num_epochs = 10
best_accuracy = 0
# Start training.
for epoch in range(num_epochs):
    # Train for one epoch.
    train(model, train_loader)
    # Validate the model.
    val_loss, accuracy = evaluate(model, val_loader)
    # Report the validation loss and accuracy. The f-string is kept on one
    # line: a single-quoted f-string split across lines is a syntax error
    # before Python 3.12.
    print(f'Epoch {epoch+1}: Validation Loss = {val_loss:.4f}, Accuracy = {accuracy:.4f}')
    # Keep the checkpoint with the highest validation accuracy so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'bert_model.pth')  # save the best model
model prediction
Load the best model for prediction and save the result as a CSV file.
# Load the best checkpoint and run prediction.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('bert_model.pth', map_location=device))
# Move the model to the selected device.
model.to(device)
# Switch to evaluation mode.
model.eval()
# Load the test data.
test_texts = test_data['content']
# Placeholder labels (unused for prediction). CustomDataset reads labels via
# `.iloc`, so they are wrapped in a pandas Series rather than a bare ndarray.
test_labels = pd.Series(np.zeros(test_data.shape[0], dtype=int))
test_dataset = CustomDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
# Collect predictions.
predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Predicting'):
        # The dataset yields (input_ids, attention_mask, label); the
        # placeholder label is ignored.
        input_ids, attention_mask, _ = batch
        # Squeeze only axis 1 so the batch axis survives with batch_size=1,
        # and make sure the inputs live on the same device as the model.
        input_ids = input_ids.squeeze(1).to(device)
        attention_mask = attention_mask.squeeze(1).to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        predicted = outputs.logits.argmax(dim=1)
        predictions.extend(predicted.tolist())
# Save the predictions.
submit = pd.read_csv('sample_submit.csv')
submit['label'] = predictions
submit.to_csv('bert_predictions.csv', index=None)