ChatGLM2-6B model-related issues

1. After P-Tuning, ChatGLM2-6B's answers no longer respond to the question asked

According to the ChatGLM-6B Bilibili tutorial "[Official tutorial] ChatGLM-6B fine-tuning: P-Tuning, LoRA, Full parameter", this is essentially catastrophic forgetting: after training on the new task, the model forgets what it knew before.

  • The fix, in essence: P-Tuning only trains the weights of model.transformer.prefix_encoder.
    The idea is therefore to blend the old prefix_encoder weights with the newly trained ones and re-inject the result into model.transformer.prefix_encoder.
    In my test, a weight of 0.2 for the old parameters and 0.8 for the new ones produced a model that can still chat normally while retaining what P-Tuning taught it.
    However, this weighted-average trick is rather ad hoc and may not always work.
    Script: Solution script (a minimal sketch of the idea follows below).
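
A minimal sketch of the blending idea, assuming the paths, pre_seq_len=128, and the 0.2/0.8 ratio mentioned elsewhere in this post (names and paths here are illustrative, not the actual solution script):

```python
import os
import torch
from transformers import AutoConfig, AutoModel

MODEL_PATH = "models/chatglm2-6b-int4"            # base model (illustrative path)
CHECKPOINT_PATH = "models/gukai/checkpoint-500/"  # P-Tuning checkpoint (illustrative path)

# Load the base model with a prefix encoder attached (pre_seq_len must match training).
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained(MODEL_PATH, config=config, trust_remote_code=True)

# "Old" prefix_encoder weights, i.e. the ones the model starts out with.
old_state = {k: v.clone().float() for k, v in model.transformer.prefix_encoder.state_dict().items()}

# "New" prefix_encoder weights produced by P-Tuning (the checkpoint only stores these).
ckpt = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"), map_location="cpu")
new_state = {k[len("transformer.prefix_encoder."):]: v.float()
             for k, v in ckpt.items() if k.startswith("transformer.prefix_encoder.")}

# Weighted average (0.2 * old + 0.8 * new) and re-injection into the prefix encoder.
mixed_state = {k: 0.2 * old_state[k] + 0.8 * new_state[k] for k in new_state}
model.transformer.prefix_encoder.load_state_dict(mixed_state)
```

After the blended weights are loaded, the model can be quantized and moved to the GPU exactly as in the deployment code in section 3.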

2. ChatGLM2-6B P-Tuning

  • Multi-GPU training (torchrun); set NUM_GPUS to the number of GPUs to use (1 in this example)
```bash
export PRE_SEQ_LEN=128
export LR=2e-2
export NUM_GPUS=1
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true

torchrun --standalone --nnodes=1 --nproc-per-node=$NUM_GPUS ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train.json \
    --validation_file data/AdvertiseGen/dev.json \
    --preprocessing_num_workers 4 \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-32k-int4 \
    --output_dir /home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-$PRE_SEQ_LEN-$LR \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 256 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 2000 \
    --logging_steps 10 \
    --save_steps 500 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
    --quantization_bit 4
```
  • Multi-GPU training, running in the background (nohup)
```bash
export PRE_SEQ_LEN=128
export LR=2e-2
export NUM_GPUS=4
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true

nohup torchrun --standalone --nnodes=1 --nproc-per-node=$NUM_GPUS ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train_min.json \
    --validation_file data/AdvertiseGen/dev_min.json \
    --preprocessing_num_workers 4 \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path models/chatglm2-6b-int4 \
    --output_dir models/adgen-chatglm2-6b-int4-pt-128-2e-e \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 256 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 100 \
    --logging_steps 10 \
    --save_steps 50 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
    --quantization_bit 4 > log 2>&1 &
```
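Because the command runs under nohup, all output is redirected to the log file; progress can be followed with `tail -f log`, and training keeps running after the terminal session is closed.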
  • Single-GPU training
```bash
export WANDB_DISABLED=true
export PRE_SEQ_LEN=128
export LR=2e-2

CUDA_VISIBLE_DEVICES=0 python ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train_min.json \
    --validation_file data/AdvertiseGen/val_min.json \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-int4 \
    --output_dir /home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-医院1-$PRE_SEQ_LEN-$LR \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 1000 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 1 \
    --logging_steps 10 \
    --save_steps 1 \
    --learning_rate 2e-2 \
    --pre_seq_len 128 \
    --quantization_bit 4
```
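Note that --max_steps 1 and --save_steps 1 in this command are only smoke-test values for verifying that the pipeline runs end to end; for a real training run, raise them (for example to the 2000/500 used in the multi-GPU script above).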

3. ChatGLM2-6B deployment

This covers four cases: single-GPU deployment of the base model, multi-GPU deployment of the base model, single-GPU deployment of the P-Tuning model, and multi-GPU deployment of the P-Tuning model.
See the web_demo2.py script below.

```python
from transformers import AutoModel, AutoTokenizer
from transformers import AutoConfig
import streamlit as st
import os


st.set_page_config(
    page_title="ChatGLM2-6b 演示",
    page_icon=":robot:",
    layout='wide'
)


@st.cache_resource
def get_model_onegpu():
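    # Base model on a single GPU (no P-Tuning weights).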
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True).cuda()
    return tokenizer, model

@st.cache_resource
def get_model_multigpu():
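    # Base model split across multiple GPUs with load_model_on_gpus.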
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    from utils import load_model_on_gpus
    model = load_model_on_gpus("models/chatglm2-6b-int4", num_gpus=4)
    return tokenizer, model

@st.cache_resource
def get_model_ptuning_onegpu():
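    # P-Tuning model on a single GPU: load the base model, then inject the trained prefix_encoder weights from the checkpoint.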
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    import torch
    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    config = AutoConfig.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True, pre_seq_len=128)
    model = AutoModel.from_pretrained("models/chatglm2-6b-int4", config=config, trust_remote_code=True)
    prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    model = model.quantize(4)
    model = model.cuda()
    return tokenizer, model

@st.cache_resource
def get_model_ptuning_multigpu():
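    # P-Tuning model on multiple GPUs: split the base model across GPUs, then inject the trained prefix_encoder weights.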
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    import torch
    from utils import load_model_on_gpus
    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
    #prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"), map_location=lambda storage, loc: storage.cuda(1))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model = load_model_on_gpus("models/chatglm2-6b-int4", num_gpus=4, pre_seq_len=128)
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    model.transformer.prefix_encoder.float()
    model = model.quantize(4)
    #model = model.cuda()
    return tokenizer, model


@st.cache_resource
def get_model():
    tokenizer = AutoTokenizer.from_pretrained("models/gukai/checkpoint-500", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/gukai/checkpoint-500", trust_remote_code=True).cuda()
    # Multi-GPU support: use the two lines below instead of the line above, and change num_gpus to your actual number of GPUs.
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    model = model.eval()
    return tokenizer, model


#tokenizer, model = get_model()

tokenizer, model = get_model_ptuning_multigpu()


st.title("ChatGLM2-6B")

max_length = st.sidebar.slider(
    'max_length', 0, 32768, 8192, step=1
)
top_p = st.sidebar.slider(
    'top_p', 0.0, 1.0, 0.8, step=0.01
)
temperature = st.sidebar.slider(
    'temperature', 0.0, 1.0, 0.8, step=0.01
)

if 'history' not in st.session_state:
    st.session_state.history = []

if 'past_key_values' not in st.session_state:
    st.session_state.past_key_values = None

for i, (query, response) in enumerate(st.session_state.history):
    with st.chat_message(name="user", avatar="user"):
        st.markdown(query)
    with st.chat_message(name="assistant", avatar="assistant"):
        st.markdown(response)
with st.chat_message(name="user", avatar="user"):
    input_placeholder = st.empty()
with st.chat_message(name="assistant", avatar="assistant"):
    message_placeholder = st.empty()

prompt_text = st.text_area(label="用户命令输入",
                           height=100,
                           placeholder="请在这儿输入您的命令")

button = st.button("发送", key="predict")

if button:
    input_placeholder.markdown(prompt_text)
    history, past_key_values = st.session_state.history, st.session_state.past_key_values
    for response, history, past_key_values in model.stream_chat(tokenizer, prompt_text, history,
                                                                past_key_values=past_key_values,
                                                                max_length=max_length, top_p=top_p,
                                                                temperature=temperature,
                                                                return_past_key_values=True):
        message_placeholder.markdown(response)

    st.session_state.history = history
    st.session_state.past_key_values = past_key_values
```
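Assuming the script is saved as web_demo2.py with the repository's utils.py next to it (so that load_model_on_gpus can be imported), it can be started with `streamlit run web_demo2.py`. Which deployment mode is used depends on which get_model_* function is called at the top level; here it is the P-Tuning multi-GPU variant.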

Source: blog.csdn.net/qq122716072/article/details/132688425