Install modules
!pip install tokenizers==0.13.3 torch==2.0.1 transformers==4.30.2
!pip install accelerate -U
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from transformers import pipeline, set_seed
from transformers import GPT2TokenizerFast
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
0. Data preparation
import requests
from bs4 import BeautifulSoup
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.37'
}
page_text_res = requests.get(url=url, headers=headers)
page_text_res.encoding = 'utf-8'
page_text = page_text_res.text
soup = BeautifulSoup(page_text, 'lxml')
select_list = soup.select('.book-mulu > ul > li > a')
url_assist = 'https://www.shicimingju.com'
fp = open('./sanguoyanyi.txt', 'w', encoding='utf-8')
for select in select_list:
    title = select.string
    chapter_url = url_assist + select['href']
    res = requests.get(url=chapter_url, headers=headers)
    res.encoding = 'utf-8'
    text = res.text
    chapter_soup = BeautifulSoup(text, 'lxml')
    chapter_content = chapter_soup.find('div', class_='chapter_content').text
    fp.write(title + ':' + chapter_content + '\n')
    # print(title, "scraped successfully")
    # print(select.string, 'link:', url_assist + select['href'], 'scraped successfully')
fp.close()
print('over')
over
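Before training anything, it is worth a quick sanity check that the novel was actually written to disk (an illustrative snippet, not part of the original script):
with open('./sanguoyanyi.txt', encoding='utf-8') as f:
    corpus = f.read()
print(len(corpus))   # total number of characters scraped
print(corpus[:40])   # should start with the first chapter's title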
1. Training the tokenizer
Tokenization is the process of dividing input text into meaningful subunits (tokens). Create a new tokenizer based on our data with the following code:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
trainer = BpeTrainer(vocab_size=50000, show_progress=True, initial_alphabet=ByteLevel.alphabet(), special_tokens=special_tokens)  # vocab_size here is illustrative
tokenizer.train(files=["./sanguoyanyi.txt"], trainer=trainer)
# Wrap as a transformers-compatible tokenizer and save it
newtokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)
newtokenizer.save_pretrained('./sanguo')
('./sanguo/tokenizer_config.json',
'./sanguo/special_tokens_map.json',
'./sanguo/vocab.json',
'./sanguo/merges.txt',
'./sanguo/added_tokens.json',
'./sanguo/tokenizer.json')
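To confirm the saved tokenizer works as expected, here is a minimal round-trip check (the sample sentence is arbitrary):
check_tokenizer = GPT2TokenizerFast.from_pretrained('./sanguo')
ids = check_tokenizer.encode("话说天下大势,分久必合,合久必分。")
print(ids)                          # token ids from the byte-level BPE vocabulary
print(check_tokenizer.decode(ids))  # should reproduce the original sentence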
2. Training the model
tokenizer = GPT2Tokenizer.from_pretrained("./sanguo")
tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})
# Configure the GPT-2 model parameters
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
# Create the model
model = GPT2LMHeadModel(config)
# The training data is split line by line
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./sanguoyanyi.txt",
    block_size=128,
)
# mlm=False selects causal language modeling (the GPT-2 objective)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)
# Configure the training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    save_steps=2000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
# Save the model
model.save_pretrained('./sanguo')
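As a rough size check (illustrative, not part of the original training script), you can reload the saved weights and count the parameters; the default GPT2Config corresponds to the GPT-2 small architecture (12 layers, 768 hidden size):
trained = GPT2LMHeadModel.from_pretrained('./sanguo')
num_params = sum(p.numel() for p in trained.parameters())
print(f"{num_params / 1e6:.1f}M parameters")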
3. Testing the model
generator = pipeline('text-generation', model='./sanguo')
set_seed(13)
txt = generator("吕布", max_length=10)
print(txt)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers: pip install xformers.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
[{'generated_text': '吕布回·曹操怒�\xa0却说姜维'}]
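The default greedy settings give short, repetitive completions. A sketch with sampling enabled (the parameter values are arbitrary) usually produces more varied text:
txt = generator("话说", max_length=50, do_sample=True, top_k=50, temperature=0.9)
print(txt[0]['generated_text'])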