Chatglm2-6b model related issues
1. After P-tuning the ChatGLM2-6B model, its answers at inference time do not address the question that was asked
According to the ChatGLM-6B video on Bilibili ("[Official tutorial] ChatGLM-6B fine-tuning: P-Tuning, LoRA, full-parameter"), this most likely means the model forgets what it learned earlier once it has been trained on the new data (catastrophic forgetting).
- Solution
In essence, P-tuning only trains the weights of model.transformer.prefix_encoder.
The idea is to mix the old prefix_encoder weights with the weights produced by P-tuning, and then load the mixed weights back into model.transformer.prefix_encoder.
In my test I used a factor of 0.2 for the old weights and 0.8 for the new weights. With this weighted average the model can still say hello (hold an ordinary conversation) and also retains the ability learned during P-tuning.
However, this weight-averaging trick is rather heuristic and may not always work.
Script: Solution script
2. ChatGLM2-6b ptuning
- Multi-GPU training (bash)
```bash
# P-tuning hyperparameters; both values are also embedded in the output
# directory name below.
export PRE_SEQ_LEN=128
export LR=2e-2
# Number of worker processes torchrun starts on this node.
export NUM_GPUS=1
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true

# Launch ptuning/main.py on the AdvertiseGen data with the 4-bit
# chatglm2-6b-32k-int4 checkpoint.
torchrun --standalone --nnodes=1 --nproc-per-node="${NUM_GPUS}" ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train.json \
    --validation_file data/AdvertiseGen/dev.json \
    --preprocessing_num_workers 4 \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-32k-int4 \
    --output_dir "/home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-${PRE_SEQ_LEN}-${LR}" \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 256 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 2000 \
    --logging_steps 10 \
    --save_steps 500 \
    --learning_rate "${LR}" \
    --pre_seq_len "${PRE_SEQ_LEN}" \
    --quantization_bit 4
```
- Multi-GPU training, running in the background
```bash
# P-tuning hyperparameters passed to ptuning/main.py below.
export PRE_SEQ_LEN=128
export LR=2e-2
# Number of worker processes torchrun starts on this node.
export NUM_GPUS=4
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true

# Run detached with nohup; stdout and stderr are both written to ./log.
# NOTE(review): the output directory suffix "2e-e" looks like a typo for
# "2e-2" - confirm before renaming, existing checkpoints may live there.
nohup torchrun --standalone --nnodes=1 --nproc-per-node="${NUM_GPUS}" ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train_min.json \
    --validation_file data/AdvertiseGen/dev_min.json \
    --preprocessing_num_workers 4 \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path models/chatglm2-6b-int4 \
    --output_dir models/adgen-chatglm2-6b-int4-pt-128-2e-e \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 256 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 100 \
    --logging_steps 10 \
    --save_steps 50 \
    --learning_rate "${LR}" \
    --pre_seq_len "${PRE_SEQ_LEN}" \
    --quantization_bit 4 \
    > log 2>&1 &
```
- Single card training
```bash
export WANDB_DISABLED=true
# P-tuning hyperparameters. They feed BOTH the output directory name and the
# --learning_rate / --pre_seq_len flags, so the directory name always matches
# the values the run actually used.
export PRE_SEQ_LEN=128
export LR=2e-2

# Single-process run pinned to GPU 0.
# NOTE(review): --max_steps 1 / --save_steps 1 look like a smoke-test
# configuration - raise them for a real training run.
CUDA_VISIBLE_DEVICES=0 python ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train_min.json \
    --validation_file data/AdvertiseGen/val_min.json \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-int4 \
    --output_dir "/home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-医院1-${PRE_SEQ_LEN}-${LR}" \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 1000 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 1 \
    --logging_steps 10 \
    --save_steps 1 \
    --learning_rate "${LR}" \
    --pre_seq_len "${PRE_SEQ_LEN}" \
    --quantization_bit 4
```
3. ChatGLM2-6b deployment
This includes basic model single-card deployment, basic model multi-card deployment, ptuning model single-card deployment, and ptuning model multi-card deployment.
See the script below: web_demo2.py
from transformers import AutoModel, AutoTokenizer
from transformers import AutoConfig
import streamlit as st
import os
# Page-level Streamlit settings: browser tab title, page icon and wide layout.
st.set_page_config(page_title="ChatGLM2-6b 演示", page_icon=":robot:", layout="wide")
@st.cache_resource
def get_model_onegpu():
    """Load the base model from ``models/chatglm2-6b-int4`` on a single GPU.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved with
        ``.cuda()``.
    """
    model_dir = "models/chatglm2-6b-int4"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_dir, trust_remote_code=True)
    model = model.cuda()
    return tokenizer, model
@st.cache_resource
def get_model_mitugpu():
    """Load the base model from ``models/chatglm2-6b-int4`` across four GPUs.

    Model placement is delegated to ``utils.load_model_on_gpus`` with
    ``num_gpus=4``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_dir = "models/chatglm2-6b-int4"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    # Imported lazily, so ``utils`` is only required when this loader is used.
    from utils import load_model_on_gpus

    model = load_model_on_gpus(model_dir, num_gpus=4)
    return tokenizer, model
@st.cache_resource
def get_model_ptuning_onegpu():
    """Load the base model plus P-tuning prefix-encoder weights on one GPU.

    The base model is built with ``pre_seq_len=128``. The prefix-encoder
    weights are read from ``pytorch_model.bin`` in the checkpoint directory
    and loaded into ``model.transformer.prefix_encoder``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been passed through
        ``quantize(4)`` and moved with ``.cuda()``.
    """
    model_dir = "models/chatglm2-6b-int4"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    import torch

    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    config = AutoConfig.from_pretrained(
        model_dir, trust_remote_code=True, pre_seq_len=128
    )
    model = AutoModel.from_pretrained(
        model_dir, config=config, trust_remote_code=True
    )

    checkpoint_state = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))

    # Keep only the prefix-encoder tensors and strip the module prefix so the
    # keys match what ``prefix_encoder.load_state_dict`` is given.
    key_prefix = "transformer.prefix_encoder."
    encoder_state = {
        name[len(key_prefix):]: tensor
        for name, tensor in checkpoint_state.items()
        if name.startswith(key_prefix)
    }
    model.transformer.prefix_encoder.load_state_dict(encoder_state)

    model = model.quantize(4)
    model = model.cuda()
    return tokenizer, model
@st.cache_resource
def get_model_ptuning_mutigpu():
    """Load the base model plus P-tuning prefix-encoder weights on four GPUs.

    The prefix-encoder weights are read from ``pytorch_model.bin`` in the
    checkpoint directory. The base model is created through
    ``utils.load_model_on_gpus`` with ``num_gpus=4`` and ``pre_seq_len=128``,
    then the weights are loaded into ``model.transformer.prefix_encoder``.

    Returns:
        A ``(tokenizer, model)`` tuple; the prefix encoder has been cast with
        ``.float()`` and the model passed through ``quantize(4)``.
    """
    model_dir = "models/chatglm2-6b-int4"
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    import torch
    from utils import load_model_on_gpus

    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    # An earlier variant passed
    # ``map_location=lambda storage, loc: storage.cuda(1)`` to torch.load.
    checkpoint_state = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))

    # Keep only the prefix-encoder tensors and strip the module prefix so the
    # keys match what ``prefix_encoder.load_state_dict`` is given.
    key_prefix = "transformer.prefix_encoder."
    encoder_state = {
        name[len(key_prefix):]: tensor
        for name, tensor in checkpoint_state.items()
        if name.startswith(key_prefix)
    }

    model = load_model_on_gpus(model_dir, num_gpus=4, pre_seq_len=128)
    prefix_encoder = model.transformer.prefix_encoder
    prefix_encoder.load_state_dict(encoder_state)
    prefix_encoder.float()

    # No ``model.cuda()`` here; presumably load_model_on_gpus has already
    # placed the model on the GPUs - TODO confirm.
    model = model.quantize(4)
    return tokenizer, model
@st.cache_resource
def get_model():
    """Load tokenizer and model directly from ``models/gukai/checkpoint-500``.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved with
        ``.cuda()`` and switched to eval mode.
    """
    checkpoint_dir = "models/gukai/checkpoint-500"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, trust_remote_code=True)
    model = AutoModel.from_pretrained(checkpoint_dir, trust_remote_code=True).cuda()
    # Multi-GPU support: replace the line above with the two lines below and
    # set num_gpus to the number of GPUs you actually have.
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    return tokenizer, model.eval()
# Choose the loader for this deployment. get_model() is the alternative that
# loads tokenizer and model straight from the checkpoint directory:
# tokenizer, model = get_model()
tokenizer, model = get_model_ptuning_mutigpu()

st.title("ChatGLM2-6B")

# Generation settings, adjustable from the sidebar.
max_length = st.sidebar.slider("max_length", 0, 32768, 8192, step=1)
top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.8, step=0.01)

# Conversation state kept in st.session_state.
if "history" not in st.session_state:
    st.session_state.history = []
if "past_key_values" not in st.session_state:
    st.session_state.past_key_values = None

# Replay the stored conversation turns.
for query, response in st.session_state.history:
    with st.chat_message(name="user", avatar="user"):
        st.markdown(query)
    with st.chat_message(name="assistant", avatar="assistant"):
        st.markdown(response)

# Placeholders for the turn that is about to be generated.
with st.chat_message(name="user", avatar="user"):
    input_placeholder = st.empty()
with st.chat_message(name="assistant", avatar="assistant"):
    message_placeholder = st.empty()

prompt_text = st.text_area(
    label="用户命令输入",
    height=100,
    placeholder="请在这儿输入您的命令",
)

button = st.button("发送", key="predict")

if button:
    input_placeholder.markdown(prompt_text)
    history = st.session_state.history
    past_key_values = st.session_state.past_key_values

    reply_stream = model.stream_chat(
        tokenizer,
        prompt_text,
        history,
        past_key_values=past_key_values,
        max_length=max_length,
        top_p=top_p,
        temperature=temperature,
        return_past_key_values=True,
    )
    # Each item carries the response text so far; render it in place.
    for response, history, past_key_values in reply_stream:
        message_placeholder.markdown(response)

    # Store the final history and key/value cache for the next turn.
    st.session_state.history = history
    st.session_state.past_key_values = past_key_values