エンティティ認識(4) - BERTに基づく商品タイトルのエンティティ認識[超詳細]

BERTに基づくエンティティ認識タスクのファインチューニング

素晴らしい、ChallengeHubパブリックアカウント、WeChat:1185918903、NLP技術交換に注意

Heywhaleのホームページ: https://www.heywhale.com/home/user/profile/58f387e7a686fb29e425d133

必要なpipパッケージ

#!pip install transformers seqeval[gpu]

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda

情報処理

コンペティションデータのダウンロード先:商品タイトルのエンティティ認識 https://www.heywhale.com/home/competition/620b34ed28270b0017b823ad

pd.DataFrame([[1,2,3],
             [4,5,6]])

| | 0 | 1 | 2 |


# Parse the CoNLL-style training file: one "<char> <tag>" pair per line, with
# blank lines separating sentences. Produces rows of [sentence_id, word, tag].
with open('train_500.txt','r',encoding='utf-8') as f:
    tmp=[]
    cnt=1
    for line in tqdm(f.read().split('\n')):
        sentence_id=f'train_{cnt}'
        if not line.strip():
            # A blank line marks a sentence boundary: move on to the next id.
            cnt+=1
            continue
        word_tags=line.split(' ')
        if len(word_tags)==2:
            tmp.append([sentence_id]+word_tags)
        # NOTE(review): lines that do not split into exactly two fields (for
        # example when the character itself is a space) are skipped. The
        # original had a second `elif len(word_tags)==2` branch meant to keep
        # them, but it repeated the `if` condition and could never run.
        # Keeping such tokens would also require the later `sentence.split()`
        # based tokenization to preserve whitespace words, otherwise words and
        # labels would fall out of alignment.

100%|████████████████████████████████████████████████████████████████████████| 28307/28307 [00:00<00:00, 886249.33it/s]
data=pd.DataFrame(tmp,columns=['sentence_id','words','tags'])
data

| | sentence_id | words | tags |

26674行×3列

data[data['sentence_id']=='train_1']

| | sentence_id | words | tags |

65行×3列

data['sentence'] = data[['sentence_id','words','tags']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
data['word_labels'] = data[['sentence_id','words','tags']].groupby(['sentence_id'])['tags'].transform(lambda x: ','.join(x))
data.head()

| | sentence_id | words | tags | sentence | word_labels |

data.shape

(26674, 5)
data['sentence_id'].nunique()

501
# Two lookup tables over the distinct tags found in the data:
# tag string -> integer id, and integer id -> tag string.
labels_to_ids = {tag: idx for idx, tag in enumerate(data.tags.unique())}
ids_to_labels = dict(enumerate(data.tags.unique()))
labels_to_ids

{'B-40': 0,
 'I-40': 1,
 'B-4': 2,
 'I-4': 3,
 'B-14': 4,
 'I-14': 5,
 'B-5': 6,
 'I-5': 7,
 'B-7': 8,
 'I-7': 9,
 'B-11': 10,
 'I-11': 11,
 'B-13': 12,
 'I-13': 13,
 'B-8': 14,
 'I-8': 15,
 'O': 16,
 'B-16': 17,
 'I-16': 18,
 'B-29': 19,
 'I-29': 20,
 'B-9': 21,
 'I-9': 22,
 'B-12': 23,
 'I-12': 24,
 'B-18': 25,
 'I-18': 26,
 'B-1': 27,
 'I-1': 28,
 'B-3': 29,
 'I-3': 30,
 'B-22': 31,
 'I-22': 32,
 'B-37': 33,
 'I-37': 34,
 'B-39': 35,
 'I-39': 36,
 'B-10': 37,
 'I-10': 38,
 'B-36': 39,
 'I-36': 40,
 'B-34': 41,
 'I-34': 42,
 'B-31': 43,
 'I-31': 44,
 'B-38': 45,
 'I-38': 46,
 'B-54': 47,
 'I-54': 48,
 'B-6': 49,
 'I-6': 50,
 'B-30': 51,
 'I-30': 52,
 'B-15': 53,
 'I-15': 54,
 'B-2': 55,
 'I-2': 56,
 'B-49': 57,
 'I-49': 58,
 'B-21': 59,
 'I-21': 60,
 'B-47': 61,
 'I-47': 62,
 'B-23': 63,
 'I-23': 64,
 'B-20': 65,
 'I-20': 66,
 'B-50': 67,
 'I-50': 68,
 'B-46': 69,
 'I-46': 70,
 'B-41': 71,
 'I-41': 72,
 'B-43': 73,
 'I-43': 74,
 'B-48': 75,
 'I-48': 76,
 'B-19': 77,
 'I-19': 78,
 'B-52': 79,
 'I-52': 80}
len(labels_to_ids)

81
# Collapse the per-token rows into one row per sentence.
# (De-duplicating on sentence_id would work as well.)
data = (
    data[["sentence", "word_labels"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
data.head()

| | sentence | word_labels |

len(data)

501
data.iloc[1].sentence

'牛 皮 纸 袋 手 提 袋 定 制 l o g o 烘 焙 购 物 服 装 包 装 外 卖 打 包 袋 子 礼 品 袋 纸 质 黑 色 3 2 * 1 1 * 2 5 大 横 1 0 0 个'
data.iloc[1].word_labels

'B-4,I-4,I-4,I-4,B-4,I-4,I-4,B-29,I-29,I-29,I-29,I-29,I-29,B-9,I-9,B-5,I-5,B-40,I-40,B-4,I-4,B-40,I-40,B-5,I-5,B-4,I-4,B-4,I-4,I-4,B-12,I-12,B-16,I-16,B-18,I-18,I-18,I-18,I-18,I-18,I-18,I-18,B-13,I-13,B-18,I-18,I-18,I-18'
len(data['sentence'][0].split(' '))

65
data['sentence'].apply(lambda x:len(x.split(' '))).describe()

count    501.000000
mean      53.241517
std       12.810135
min        8.000000
25%       44.000000
50%       53.000000
75%       62.000000
max       91.000000
Name: sentence, dtype: float64

DataLoaderを構築する

# Training configuration.
# MAX_LEN is the fixed length every example is padded or truncated to. 91 is
# the longest sentence measured in characters above; [CLS] and [SEP] add two
# more tokens, so the very longest sentences are truncated (120 is the
# alternative value that was tried).
MAX_LEN = 91 # 120
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
# Maximum gradient norm passed to clip_grad_norm_ in the training loop.
MAX_GRAD_NORM = 5
# Alternative: a local copy of the same checkpoint.
# MODEL_NAME='chinese-roberta-wwm-ext'
MODEL_NAME='hfl/chinese-roberta-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME) # encode_plus(): encodes a whole text in one call

BERTでNERを行う際の厄介な点は、BERTが単語単位のトークン化ではなく、WordPiece(サブワード)単位のトークン化に依存していることです。

例:「Washington」のタグが「b-gpe」の場合、WordPieceで分割すると「Wash」「##ing」「##ton」の3つのサブワードになり、それぞれに「b-gpe」「b-gpe」「b-gpe」のタグを付けます。

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word, repeating each word's label per subword.

    WordPiece tokenization makes it hard to line word-level labels up with
    individual subwords. Tokenizing one word at a time is somewhat slower, but
    it lets every subword inherit the label of the word it came from.

    Args:
        sentence: Whitespace-separated words (surrounding whitespace is ignored).
        text_labels: Comma-separated labels, one per word.
        tokenizer: Object exposing ``tokenize(word)`` that returns the subwords.

    Returns:
        A ``(tokens, labels)`` pair of equally long lists. Words and labels are
        paired with ``zip``, so any surplus on either side is ignored.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces = []
    piece_labels = []
    for word, tag in zip(words, word_tags):
        subwords = tokenizer.tokenize(word)
        # Append the subwords and give each of them the word's original label.
        pieces += subwords
        piece_labels += [tag] * len(subwords)

    return pieces, piece_labels

data.iloc[0]

sentence       手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 ...
word_labels    B-40,I-40,B-4,I-4,I-4,B-14,I-14,B-5,I-5,B-4,I-...
Name: 0, dtype: object
# tokenize_and_preserve_labels(data.iloc[0]['sentence'],data.iloc[0]['word_labels'],tokenizer)

ここでは別の処理方法もあります。たとえば、最初のサブワードにだけ元のタグを付け、残りのサブワードには無関係なタグを付ける方法です。

# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding

# https://arxiv.org/abs/1810.04805

encoding_result=tokenizer.encode_plus('这里有其他的处理方式,比如只有第一个subword给定原始标签,其他subword给定一个无关标签')
encoding_result.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
encoding_result

{'input_ids': [101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
# tokenizer.convert_ids_to_tokens([101, 6821, 7027, 3300, 1071, 800, 4638, 1905, 4415, 3175, 2466, 8024, 3683, 1963, 1372, 3300, 5018, 671, 702, 11541, 8204, 10184, 5314, 2137, 1333, 1993, 3403, 5041, 8024, 1071, 800, 11541, 8204, 10184, 5314, 2137, 671, 702, 3187, 1068, 3403, 5041, 102])

class dataset(Dataset):
    """Dataset turning (sentence, word_labels) rows into fixed-length BERT inputs.

    Each item is a dict of ``torch.long`` tensors of length ``max_len``:
    ``ids`` (token ids), ``mask`` (1 for real tokens, 0 for ``[PAD]``) and
    ``targets`` (label ids looked up in the module-level ``labels_to_ids``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the data and settings.

        Args:
            dataframe: Table with ``sentence`` and ``word_labels`` columns.
                Rows are looked up with ``column[index]``, so this assumes a
                0..n-1 index (callers reset the index before constructing).
            tokenizer: Tokenizer providing ``tokenize`` and
                ``convert_tokens_to_ids``.
            max_len: Length every example is truncated or padded to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded example at ``index``."""
        # Step 1: tokenize the sentence word by word, keeping labels aligned.
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # Step 2: add the special tokens together with an "O" label for each.
        # The [SEP] label has to be appended: ``insert(-1, ...)`` would place
        # it *before* the last real label and shift that label onto [SEP].
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # Step 3: truncate or pad to the fixed length.
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # Truncate (over-long inputs lose their tail, including [SEP]).
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # Pad tokens with [PAD] and labels with "O".
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # Step 4: attention mask, 0 on padding positions.
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # Step 5: map tokens and labels to their integer ids.
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len

データセットを0.8:0.2の比率に従ってトレーニングセットとテストセットに分割します

from sklearn.model_selection import train_test_split
# train_dataset,test_dataset=train_test_split(data,test_size=0.2,random_state=42)

train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (501, 2)
TRAIN Dataset: (401, 2)
TEST Dataset: (100, 2)

以下は、最初のサンプルのトークンIDとラベルです。

training_set[0]

{'ids': tensor([ 101, 3345, 2533, 1164, 2137, 1169, 5011, 6381, 3315, 4851, 4665, 1947,
         6163, 7770, 3440, 4851, 1501, 1215, 1062, 6381,  752, 3315, 1555, 1218,
         1062, 1385, 6843, 4851, 3136, 2360, 5688, 4851, 4289,  143,  126, 1217,
         1331, 2339,  868,  833, 6379, 5011, 6381, 3315, 2094, 2137,  976,  143,
          126, 5273, 5682,  523, 1285, 5277, 5436, 4667,  163, 4669, 4851, 4665,
          524,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'targets': tensor([16, 27, 28, 28, 19, 20,  2,  3,  3,  2,  3, 12, 13,  4,  5,  2,  3,  6,
          7,  2,  3,  3,  4,  5,  8,  9,  6,  7,  2,  3,  3,  3,  3, 25, 26, 12,
         13,  6,  7,  6,  7,  2,  3,  3,  3, 19, 20, 25, 26, 17, 18, 16, 16, 16,
         12, 13,  2,  3,  2,  3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
         16])}
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print('{0:10}  {1}   {2}'.format(token, label,ids_to_labels[label.numpy().tolist()]))

[CLS]       16   O
杰           27   B-1
得           28   I-1
利           28   I-1
定           19   B-29
制           20   I-29
笔           2   B-4
记           3   I-4
本           3   I-4
礼           2   B-4
盒           3   I-4
套           12   B-13
装           13   I-13
高           4   B-14
档           5   I-14
礼           2   B-4
品           3   I-4
办           6   B-5
公           7   I-5
记           2   B-4
事           3   I-4
本           3   I-4
商           4   B-14
务           5   I-14
公           8   B-7
司           9   I-7
送           6   B-5
礼           7   I-5
教           2   B-4
师           3   I-4
节           3   I-4
礼           3   I-4
物           3   I-4
a           25   B-18
5           26   I-18
加           12   B-13
厚           13   I-13
工           6   B-5
作           7   I-5
会           6   B-5
议           7   I-5
笔           2   B-4
记           3   I-4
本           3   I-4
子           3   I-4
定           19   B-29
做           20   I-29
a           25   B-18
5           26   I-18
红           17   B-16
色           18   I-16
【           16   O
升           16   O
级           16   O
翻           12   B-13
盖           13   I-13
u           2   B-4
盘           3   I-4
礼           2   B-4
盒           3   I-4
】           16   O
[SEP]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O
[PAD]       16   O

PytorchのDataLoaderを作成する

# Shuffle the training batches every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps a fixed order: shuffling brings no benefit there and makes
# the averaged per-batch accuracy and the periodic loss print vary between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

ネットワークを定義する

  • モデル構造:BertForTokenClassification

  • 事前学習済みの重み:「hfl/chinese-roberta-wwm-ext」(上で定義した MODEL_NAME)

len(labels_to_ids)

81
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)

モデルをトレーニングする

# Sanity check: push one training example through the freshly loaded model.
# The tensors are built first; in the original, ``ids.shape`` was evaluated
# before ``ids`` existed.
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0) # gold labels
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
ids.shape  # -> torch.Size([1, 91])
outputs = model(input_ids=ids, attention_mask=mask, labels=targets) # two outputs: the loss, then the logits
initial_loss = outputs[0]
initial_loss

tensor(4.5096, device='cuda:0', grad_fn=<NllLossBackward>)

モデルの出力ロジットのサイズは(batch_size、sequence_length、num_labels)です。

tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 91, 81])

オプティマイザーとしてAdamを設定します

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Training function
def train(epoch):
    """Run one training epoch and print the running and final loss/accuracy.

    Uses the module-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``MAX_GRAD_NORM``.

    Args:
        epoch: Index of the current epoch. It is not used inside the function;
            the caller prints it.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # Put the model in training mode.
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 50 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 50 training steps: {loss_step}")

        # Token-level accuracy over non-padding positions only.
        flattened_targets = targets.view(-1)
        active_logits = tr_logits.detach().view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)
        active_accuracy = mask.view(-1) == 1  # True where the token is not [PAD]
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tmp_tr_accuracy = accuracy_score(active_targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Backward pass. Clipping must sit between backward() and step():
        # done before zero_grad()/backward(), it only touches the previous
        # step's gradients, which are then discarded.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

モデルをトレーニングする

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 50 training steps: 4.573911666870117
Training loss per 50 training steps: 3.5836149757983637
Training loss per 50 training steps: 3.146424697177245
Training loss epoch: 3.146424697177245
Training accuracy epoch: 0.28337175397646347
Training epoch: 2
Training loss per 50 training steps: 2.3866159915924072
Training loss per 50 training steps: 2.211251039131015
Training loss per 50 training steps: 2.0536219070453456
Training loss epoch: 2.0536219070453456
Training accuracy epoch: 0.49648706430276834
Training epoch: 3
Training loss per 50 training steps: 1.8235304355621338
Training loss per 50 training steps: 1.6210375042522656
Training loss per 50 training steps: 1.5436867876808242
Training loss epoch: 1.5436867876808242
Training accuracy epoch: 0.6369489455144468
Training epoch: 4
Training loss per 50 training steps: 1.3719302415847778
Training loss per 50 training steps: 1.254675311200759
Training loss per 50 training steps: 1.2525309105910878
Training loss epoch: 1.2525309105910878
Training accuracy epoch: 0.7013529778539404
Training epoch: 5
Training loss per 50 training steps: 1.2091379165649414
Training loss per 50 training steps: 1.0707006524590885
Training loss per 50 training steps: 1.0643499292949639
Training loss epoch: 1.0643499292949639
Training accuracy epoch: 0.7417508051186237

モデルの評価

検証セットの評価

def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and print loss and accuracy.

    Padding positions (mask == 0) are excluded from the accuracy and from the
    returned sequences. Uses the module-level ``device`` and ``ids_to_labels``.

    Args:
        model: Token-classification model returning (loss, logits) when called
            with labels.
        testing_loader: Iterable of batches with 'ids', 'mask' and 'targets'.

    Returns:
        ``(labels, predictions)``: flat lists of gold and predicted tag
        strings over all non-padding tokens, in loader order.
    """
    # Switch to evaluation mode.
    model.eval()

    running_loss = 0
    running_accuracy = 0
    step_count = 0
    gold_ids, pred_ids = [], []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            input_ids = batch['ids'].to(device, dtype=torch.long)
            attention = batch['mask'].to(device, dtype=torch.long)
            gold = batch['targets'].to(device, dtype=torch.long)

            outputs = model(input_ids=input_ids, attention_mask=attention, labels=gold)
            batch_loss = outputs[0]
            logits = outputs[1]
            running_loss += batch_loss.item()
            step_count += 1

            if step % 100 == 0:
                loss_step = running_loss / step_count
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Flatten to (batch_size * seq_len,) and keep non-padding tokens.
            keep = attention.view(-1) == 1
            best = torch.argmax(logits.view(-1, model.num_labels), axis=1)
            batch_gold = torch.masked_select(gold.view(-1), keep)
            batch_pred = torch.masked_select(best, keep)

            gold_ids.extend(batch_gold.tolist())
            pred_ids.extend(batch_pred.tolist())

            running_accuracy += accuracy_score(batch_gold.cpu().numpy(), batch_pred.cpu().numpy())

    labels = [ids_to_labels[label_id] for label_id in gold_ids]
    predictions = [ids_to_labels[label_id] for label_id in pred_ids]

    eval_loss = running_loss / step_count
    eval_accuracy = running_accuracy / step_count
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.8113014698028564
Validation Loss: 1.1529839837551117
Validation Accuracy: 0.7087672360508763
# len(predictions),len(labels)

tmp=[]
for tags in data['word_labels']:
    tmp.extend(tags.split(','))
pd.Series(tmp).value_counts()

I-4     3856
O       2970
B-4     2061
I-18    1777
I-38    1487
        ... 
I-48       1
I-23       1
B-23       1
B-52       1
B-46       1
Length: 81, dtype: int64
ids_to_labels[18]

'I-16'
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions])) # [] 避免报错TypeError: Found input variables without list of list.

precision    recall  f1-score   support

           1       0.65      0.72      0.69        68
          10       0.00      0.00      0.00        24
          11       0.67      0.71      0.69       145
          12       0.38      0.38      0.38        21
          13       0.41      0.58      0.48       137
          14       0.57      0.90      0.70        51
          15       0.00      0.00      0.00         5
          16       0.68      0.72      0.70        78
          18       0.48      0.52      0.50       157
          19       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         4
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00        13
          29       0.00      0.00      0.00        13
           3       0.13      0.20      0.16        25
          30       0.00      0.00      0.00         2
          34       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         2
          37       0.34      0.56      0.42        34
          38       0.28      0.40      0.33        82
          39       0.00      0.00      0.00        10
           4       0.68      0.79      0.73       417
          40       0.51      0.56      0.54       108
          46       0.00      0.00      0.00         1
          47       0.00      0.00      0.00         2
           5       0.49      0.68      0.57        81
          50       0.00      0.00      0.00         2
          54       0.50      0.57      0.53        14
           6       0.00      0.00      0.00        10
           7       0.69      0.90      0.78        59
           8       0.69      0.83      0.76        41
           9       0.20      0.04      0.06        27

   micro avg       0.54      0.62      0.58      1636
   macro avg       0.26      0.31      0.28      1636
weighted avg       0.53      0.62      0.57      1636

F:\ProgramData\Anaconda3\lib\site-packages\seqeval\metrics\v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

予測する

''.join(data.iloc[0]['sentence'].split())

'手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控'
# Tag a single raw sentence with the fine-tuned model.
sentence = "手机三脚架网红直播支架桌面自拍杆蓝牙遥控三脚架摄影拍摄拍照抖音看电视神器三角架便携伸缩懒人户外支撑架【女神粉】自带三脚架+蓝牙遥控"

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# Move the inputs to the selected device.
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# Forward pass: evaluation mode (no dropout) and no gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # (wordpiece, prediction) pairs

special_tokens = ('[CLS]', '[SEP]', '[PAD]')

# Keep one prediction per word: skip special tokens and WordPiece
# continuations. Continuation tokens start with "##"; the previous check for
# " ##" (with a leading space) could never match a token.
word_level_predictions = [
    label
    for token, label in wp_preds
    if not token.startswith("##") and token not in special_tokens
]

# Rebuild the text: join the pieces with spaces, then glue continuations back
# onto their word (after joining they appear as " ##").
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in special_tokens]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

手 机 三 脚 架 网 红 直 播 支 架 桌 面 自 拍 杆 蓝 牙 遥 控 三 脚 架 摄 影 拍 摄 拍 照 抖 音 看 电 视 神 器 三 角 架 便 携 伸 缩 懒 人 户 外 支 撑 架 【 女 神 粉 】 自 带 三 脚 架 + 蓝 牙 遥 控
['B-40', 'I-40', 'B-4', 'I-4', 'I-4', 'B-14', 'I-8', 'B-5', 'I-5', 'B-4', 'I-4', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-4', 'I-4', 'I-4', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'B-5', 'I-5', 'I-5', 'O', 'O', 'B-4', 'I-4', 'I-4', 'B-11', 'I-11', 'B-11', 'I-11', 'B-8', 'I-8', 'B-7', 'I-7', 'B-4', 'I-4', 'I-4', 'O', 'B-8', 'I-8', 'O', 'O', 'B-13', 'I-11', 'B-4', 'I-4', 'I-4', 'O', 'B-11', 'I-11', 'B-11', 'O']

モデルを保存

後で from_pretrained() で読み込めるように、モデルの語彙、モデルの重み、設定ファイルを保存します。

import os

directory = "./model"

# exist_ok=True creates the directory only when needed, without a separate
# "exists?" check that could race with another process.
os.makedirs(directory, exist_ok=True)

# Save the tokenizer vocabulary.
tokenizer.save_vocabulary(directory)
# Save the model weights and configuration file.
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed

他の

def prepare_sentence(sentence, tokenizer, maxlen):    
      # 步骤 1: tokenize the sentence
      tokenized_sentence = tokenizer.tokenize(sentence)

      # 步骤 2: add special tokens 
      tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] 

      # 步骤 3: truncating/padding
      if (len(tokenized_sentence) > maxlen):
        # truncate
        tokenized_sentence = tokenized_sentence[:maxlen]
      else:
        # pad
        tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]

      # 步骤 4: obtain the attention mask
      attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

      # 步骤 5: convert tokens to input ids
      ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

      return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
      }

# Bert:
- Bert CRF
- Bert BiLSTM+CRF
- Lex-Bert
- FLat-NER:FLAT: Chinese NER Using Flat-Lattice Transformer
- Unified Named Entity Recognition as Word-Word Relation Classification
  https://github.com/ljynlp/W2NER
# 数据

- 数据增强:https://github.com/425776024/nlpcda
- 语义增强:embedding 拼音 偏旁 
- 伪标签学习

おすすめ

転載: blog.csdn.net/yanqianglifei/article/details/123367285