Transformers configuration and model loading in practice

Ways to download pretrained models:

1. Download via git

git lfs install
git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext
# if you want to clone without large files – just their pointers
# prepend your git clone with the following env var:
GIT_LFS_SKIP_SMUDGE=1
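
Once the repository is cloned, the local directory can be passed to from_pretrained in place of the hub name. A minimal sketch, assuming the repo above was cloned into ./chinese-roberta-wwm-ext:

# load tokenizer and model from the local clone instead of downloading again
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("./chinese-roberta-wwm-ext")
model = AutoModelForMaskedLM.from_pretrained("./chinese-roberta-wwm-ext")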

2. Import pretrained models directly


from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")

model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")

# Alternatively, use the model-specific classes:
# from transformers import BertTokenizer, BertForMaskedLM

# tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")

# model = BertForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ckiplab/albert-tiny-chinese")

model = AutoModelForMaskedLM.from_pretrained("ckiplab/albert-tiny-chinese")
Downloading:   0%|          | 0.00/174 [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/729 [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/15.4M [00:00<?, ?B/s]
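
Downloaded files are cached locally by default. If you want to control where they end up, from_pretrained also accepts a cache_dir argument; a small sketch (the ./hf_cache path is just an illustrative choice):

# store the downloaded files in an explicit cache directory
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ckiplab/albert-tiny-chinese", cache_dir="./hf_cache")
model = AutoModelForMaskedLM.from_pretrained("ckiplab/albert-tiny-chinese", cache_dir="./hf_cache")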

3. Imports

!pip install transformers
from transformers import AutoConfig,AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup,logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset,SequentialSampler,RandomSampler,DataLoader

# from transformers import AutoConfig,AutoModel,AutoTokenizer
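
The logging module imported above is transformers' own logging utility; if the weight-initialization warnings shown later are too noisy, it can be silenced (optional):

# only show errors; hides the "Some weights ... were not used" messages
from transformers import logging
logging.set_verbosity_error()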
# pretrained model name
MODEL_NAME="bert-base-chinese"
# MODEL_NAME="roberta-large"


4. Configuration

# pretrained model configuration
config = AutoConfig.from_pretrained(MODEL_NAME)
config
BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
config.num_labels=12
# config
type(config)
transformers.models.bert.configuration_bert.BertConfig
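
The num_labels=12 set above only takes effect when the config is handed to a model with a classification head. A sketch of how it is typically consumed (this classification model is not used elsewhere in this walkthrough):

# build a 12-way classification head on top of the pretrained encoder
from transformers import AutoModelForSequenceClassification

clf_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
# clf_model.classifier is a Linear layer mapping 768 -> 12, sized by config.num_labels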

5. Tokenizer

Reference docs: huggingface.co/transformer…

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer
PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

Some special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

tokenizer.all_special_ids
[100, 102, 0, 101, 103]
tokenizer.all_special_tokens
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# tokenizer.vocab
# vocabulary size
tokenizer.vocab_size
21128

6. Convert text to vocabulary ids

  • Method 1: encode
    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.

        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

        Args:
            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
                method).
            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the ``tokenize`` method) or a list of integers (tokenized string ids using the
                ``convert_tokens_to_ids`` method).
        """
text="我在北京工作"
token_ids=tokenizer.encode(text)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
type(token_ids)
list
# convert ids back to the original tokens
tokenizer.convert_ids_to_tokens(token_ids)
['[CLS]', '我', '在', '北', '京', '工', '作', '[SEP]']
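
As the docstring above notes, encode is essentially tokenize followed by convert_tokens_to_ids, plus the special tokens. A quick check (the ids match the output above, minus [CLS]/[SEP]):

tokens = tokenizer.tokenize(text)        # ['我', '在', '北', '京', '工', '作']
tokenizer.convert_tokens_to_ids(tokens)  # [2769, 1762, 1266, 776, 2339, 868]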

Padding modes

padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
# padding=True pads to the longest sequence in the batch, so a single sequence is returned unpadded
token_ids=tokenizer.encode(text,padding=True,max_length=30,add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
# padding="max_length" pads the sequence up to max_length=30
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True,return_tensors='pt')
token_ids
tensor([[ 101, 2769, 1762, 1266,  776, 2339,  868,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]])
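
The same call can also shorten inputs that exceed max_length. A small sketch, not part of the original run, using the truncation flag:

# anything longer than max_length is cut down (the count includes [CLS] and [SEP])
long_text = text * 20
len(tokenizer.encode(long_text, truncation=True, max_length=30))  # 30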
  • Method 2: encode_plus
def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
token_ids=tokenizer.encode_plus(
    text,padding="max_length",
    max_length=30,
    add_special_tokens=True,
    return_tensors='pt',
    return_token_type_ids=True,
    return_attention_mask=True
)
token_ids
{'input_ids': tensor([[ 101, 2769, 1762, 1266,  776, 2339,  868,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])}
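
Calling the tokenizer object directly wraps the same machinery as encode/encode_plus and also handles batches of sentences. A sketch (the second sentence is just an illustrative input, not from the original post):

batch = tokenizer(
    ["我在北京工作", "我在上海工作"],
    padding=True,
    truncation=True,
    max_length=30,
    return_tensors='pt'
)
batch['input_ids'].shape  # padded to the longest sequence in the batch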

7. Model loading

model=AutoModel.from_pretrained(MODEL_NAME)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
model
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      ... (layers (1) through (11) repeat the same BertLayer structure as layer (0) and are omitted here)
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
# use keyword arguments: the second positional argument of forward() is attention_mask
outputs = model(input_ids=token_ids['input_ids'], attention_mask=token_ids['attention_mask'])

# outputs = model(input_ids=token_ids['input_ids'], attention_mask=token_ids['attention_mask'], token_type_ids=token_ids['token_type_ids'])

outputs
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2670, -0.0858,  0.2122,  ..., -0.0070,  0.9425, -0.3466],
         [ 0.5193, -0.3700,  0.4482,  ..., -1.0237,  0.7864, -0.1775],
         [-0.1792, -0.7018,  1.0653,  ..., -0.3034,  1.0692,  0.0429],
         ...,
         [-0.0568, -0.1166,  0.2944,  ..., -0.1114,  0.0260, -0.2406],
         [-0.2842,  0.0047,  0.4074,  ..., -0.0445, -0.1530, -0.2477],
         [ 0.0038, -0.0741,  0.2955,  ..., -0.2048,  0.0951, -0.2106]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 0.9986,  0.9999,  0.9988,  ..., -0.9921, -0.9907,  0.8621]],
       grad_fn=<TanhBackward>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
last_hidden_state=outputs[0]
outputs[0].shape # last_hidden_state
torch.Size([1, 30, 768])
outputs[1].shape # pooler_output: pooled representation of the whole sentence
torch.Size([1, 768])
cls_embeddings=last_hidden_state[:,0] # embedding of the first token, [CLS]
last_hidden_state[:,0].shape
torch.Size([1, 768])
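
Besides taking the [CLS] vector, a common alternative sentence representation (not shown in the original post) is mask-aware mean pooling over last_hidden_state:

# average the token embeddings, ignoring padded positions
mask = token_ids['attention_mask'].unsqueeze(-1).float()              # [1, 30, 1]
sentence_embedding = (last_hidden_state * mask).sum(1) / mask.sum(1)  # [1, 768]
sentence_embedding.shape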

8. Transforming the BERT outputs

config
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
config.update({
            'output_hidden_states':True
            }) 
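
Equivalently, the flag can be passed straight to from_pretrained, which updates the config internally; a commented-out alternative to the cell below:

# model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)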
model = AutoModel.from_pretrained(MODEL_NAME, config=config)

# keyword arguments again, so token_type_ids is not mistaken for attention_mask
outputs = model(input_ids=token_ids['input_ids'], attention_mask=token_ids['attention_mask'], token_type_ids=token_ids['token_type_ids'])
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
outputs.keys()
odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])
outputs['last_hidden_state'].shape
torch.Size([1, 30, 768])
outputs['pooler_output'].shape
torch.Size([1, 768])
len(outputs['hidden_states'])
13
outputs['hidden_states'][-1].shape
torch.Size([1, 30, 768])
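
The 13 entries are the embedding-layer output plus one tensor per encoder layer, and hidden_states[-1] is the same tensor as last_hidden_state. A sketch (not in the original post) of a common way to combine the last few layers:

# average the last four encoder layers into one representation per token
stacked = torch.stack(outputs['hidden_states'][-4:])  # [4, 1, 30, 768]
token_repr = stacked.mean(dim=0)                       # [1, 30, 768]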

Reposted from juejin.im/post/7131004306740183076