How to download pretrained models:
1. Download with git
git lfs install
git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext
# if you want to clone without large files – just their pointers
# prepend your git clone with the following env var:
GIT_LFS_SKIP_SMUDGE=1
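After cloning, the local directory can be passed to from_pretrained instead of the Hub id. A minimal sketch; the path is an assumption about where the clone landed:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load from the local clone; "./chinese-roberta-wwm-ext" assumes git cloned
# into the current working directory
local_path = "./chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForMaskedLM.from_pretrained(local_path)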
2. Load various pretrained models in code
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
# Equivalently, with model-specific classes (note: the class is
# BertForMaskedLM, not BertModelForMaskedLM):
# from transformers import BertTokenizer, BertForMaskedLM
# tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
# model = BertForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("ckiplab/albert-tiny-chinese")
model = AutoModelForMaskedLM.from_pretrained("ckiplab/albert-tiny-chinese")
Downloading: 0%| | 0.00/174 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/729 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/107k [00:00<?, ?B/s]
Downloading: 0%| | 0.00/112 [00:00<?, ?B/s]
Downloading: 0%| | 0.00/15.4M [00:00<?, ?B/s]
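A quick hedged sanity check that the masked-LM head works, using the tokenizer and model just loaded (the sentence and mask position are illustrative, and the tiny model's prediction is not guaranteed):
import torch

# Predict the character hidden behind [MASK]
text = "我在北[MASK]工作"
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero()[0]
predicted_id = int(logits[0, mask_pos].argmax())
print(tokenizer.convert_ids_to_tokens(predicted_id))  # ideally 京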
3. Import packages
!pip install transformers
from transformers import AutoConfig,AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup,logging
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset,SequentialSampler,RandomSampler,DataLoader
# from transformers import AutoConfig,AutoModel,AutoTokenizer
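AdamW and get_linear_schedule_with_warmup are imported for later fine-tuning; a minimal sketch of how they would be wired together (the helper function, learning rate, and warmup ratio are illustrative assumptions, not code from this post):
# Illustrative fine-tuning setup; `model` is any torch.nn.Module,
# e.g. the BERT model loaded in section 7
def build_optimizer(model, num_training_steps, lr=2e-5, warmup_ratio=0.1):
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_ratio * num_training_steps),
        num_training_steps=num_training_steps,
    )
    return optimizer, scheduler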
# Pretrained model name
MODEL_NAME="bert-base-chinese"
# MODEL_NAME="roberta-large"
4. Configuration
# Pretrained model configuration
config = AutoConfig.from_pretrained(MODEL_NAME)
config
BertConfig {
"_name_or_path": "bert-base-chinese",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"transformers_version": "4.17.0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
config.num_labels=12
# config
type(config)
transformers.models.bert.configuration_bert.BertConfig
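Setting num_labels prepares the config for a downstream task head. A hypothetical sketch of a 12-way classification head sized from these config fields (classifier and dropout are illustrations, not code from this post):
import torch.nn as nn

# Hypothetical head: both sizes come from the config inspected above
classifier = nn.Linear(config.hidden_size, config.num_labels)  # 768 -> 12
dropout = nn.Dropout(config.hidden_dropout_prob)               # p = 0.1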
5. Tokenizer
Reference: huggingface.co/transformer…
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer
PreTrainedTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Some special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
tokenizer.all_special_ids
[100, 102, 0, 101, 103]
tokenizer.all_special_tokens
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
# tokenizer.vocab
# Vocabulary size
tokenizer.vocab_size
21128
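The two lists above are aligned, so they can be paired up as a quick check (a small sketch; the ids are taken from the outputs above):
# Pair each special token with its id
dict(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
# {'[UNK]': 100, '[SEP]': 102, '[PAD]': 0, '[CLS]': 101, '[MASK]': 103}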
6. Convert text to vocabulary ids
- Method 1: encode
def encode(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> List[int]:
"""
Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method).
"""
text="我在北京工作"
token_ids=tokenizer.encode(text)
token_ids
复制代码
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
复制代码
type(token_ids)
复制代码
list
复制代码
# 将id转为原始字符
tokenizer.convert_ids_to_tokens(token_ids)
复制代码
['[CLS]', '我', '在', '北', '京', '工', '作', '[SEP]']
复制代码
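The inverse also works at the string level: tokenizer.decode() undoes encode() and keeps the special tokens in the output.
# decode() reverses encode(); special tokens stay in the string
tokenizer.decode(token_ids)
# '[CLS] 我 在 北 京 工 作 [SEP]'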
Padding modes:
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
# padding=True pads to the longest sequence in the batch, so a single sentence comes back unpadded
token_ids=tokenizer.encode(text,padding=True,max_length=30,add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102]
# padding="max_length" pads the sequence up to max_length
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True)
token_ids
[101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_ids=tokenizer.encode(text,padding="max_length",max_length=30,add_special_tokens=True,return_tensors='pt')
token_ids
tensor([[ 101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]])
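truncation is the counterpart of padding; a small illustrative sketch (max_length=5 is an arbitrary choice) showing that over-long inputs are cut down while [CLS] and [SEP] are kept:
# truncation=True cuts the sequence to max_length, special tokens included
tokenizer.encode(text, truncation=True, max_length=5)
# [101, 2769, 1762, 1266, 102]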
- Method 2: encode_plus
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
token_ids=tokenizer.encode_plus(
text,padding="max_length",
max_length=30,
add_special_tokens=True,
return_tensors='pt',
return_token_type_ids=True,
return_attention_mask=True
)
token_ids
{'input_ids': tensor([[ 101, 2769, 1762, 1266, 776, 2339, 868, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]])}
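For batches, calling the tokenizer object directly (its __call__ method) wraps encode_plus; a small sketch, where the second sentence is an illustrative assumption:
# The tokenizer's __call__ API batches and pads multiple sentences at once
batch = tokenizer(["我在北京工作", "你好"], padding=True, return_tensors="pt")
batch["input_ids"].shape
# torch.Size([2, 8]): the shorter sentence is padded up to the longer one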
7. Load the model
model=AutoModel.from_pretrained(MODEL_NAME)
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
model
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(21128, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
... # BertLayer blocks (1) through (11) are identical in structure to (0) and omitted here
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
# Pass inputs by keyword so attention_mask and token_type_ids cannot be
# mixed up (positionally, forward()'s second argument is attention_mask)
outputs=model(input_ids=token_ids['input_ids'],
              attention_mask=token_ids['attention_mask'],
              token_type_ids=token_ids['token_type_ids'])
outputs
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2670, -0.0858, 0.2122, ..., -0.0070, 0.9425, -0.3466],
[ 0.5193, -0.3700, 0.4482, ..., -1.0237, 0.7864, -0.1775],
[-0.1792, -0.7018, 1.0653, ..., -0.3034, 1.0692, 0.0429],
...,
[-0.0568, -0.1166, 0.2944, ..., -0.1114, 0.0260, -0.2406],
[-0.2842, 0.0047, 0.4074, ..., -0.0445, -0.1530, -0.2477],
[ 0.0038, -0.0741, 0.2955, ..., -0.2048, 0.0951, -0.2106]]],
grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 0.9986,  0.9999,  0.9988,  ..., -0.9921, -0.9907,  0.8621]],
grad_fn=<TanhBackward>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
last_hidden_state=outputs[0]
outputs[0].shape # last_hidden_state
torch.Size([1, 30, 768])
outputs[1].shape # pooler_output: the pooled representation of the whole sentence
torch.Size([1, 768])
cls_embeddings=last_hidden_state[:,0] # embedding of the first token, [CLS]
last_hidden_state[:,0].shape
torch.Size([1, 768])
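With a classification head, this [CLS] embedding would feed the 12-way output; purely illustrative, reusing the hypothetical classifier sketched in section 4:
# `classifier` is the illustrative nn.Linear(768, 12) from section 4,
# not code from the original post
logits = classifier(cls_embeddings)
logits.shape
# torch.Size([1, 12])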
8. Transform the BERT outputs
config
BertConfig {
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"directionality": "bidi",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"transformers_version": "4.6.0",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
config.update({
'output_hidden_states':True
})
model=AutoModel.from_pretrained(MODEL_NAME,config=config)
# Keyword arguments again: forward()'s second positional parameter is
# attention_mask, so passing token_type_ids there would silently mask everything
outputs=model(input_ids=token_ids['input_ids'],
              attention_mask=token_ids['attention_mask'],
              token_type_ids=token_ids['token_type_ids'])
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
outputs.keys()
odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])
outputs['last_hidden_state'].shape
torch.Size([1, 30, 768])
outputs['pooler_output'].shape
torch.Size([1, 768])
len(outputs['hidden_states'])
13
# 13 = the embedding-layer output plus one hidden state per each of the 12 layers
outputs['hidden_states'][-1].shape
torch.Size([1, 30, 768])
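One common use of hidden_states (an illustrative pattern, not from the original post) is concatenating the last four layers into a richer token representation:
import torch

# Concatenate the last four hidden layers along the feature dimension
last_four = torch.cat(outputs['hidden_states'][-4:], dim=-1)
last_four.shape
# torch.Size([1, 30, 3072])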