Saving a quantized model with Transformers, and loading it back

The script below loads a fine-tuned GLM checkpoint, quantizes it to 4 bits with the model's built-in quantize() method, moves it to the GPU, and saves the quantized weights together with the tokenizer; a sketch for loading the result follows at the end.

import time
import warnings

import torch
from transformers import AutoTokenizer, AutoModel
from transformers.generation.utils import logger

# Suppress generation warnings and other non-error log noise
logger.setLevel("ERROR")
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = "/home/weights/nga_lora_glm/model_best"

# trust_remote_code is required because GLM checkpoints ship custom modeling code
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

print("model loaded")

# Quantize in place using the quantize() method the GLM remote code provides
start = time.time()
quantization_bit = 4
print(f"Quantizing to {quantization_bit} bit")
model = model.quantize(quantization_bit)
print(f"model quantized, took {time.time() - start:.1f}s")

model = model.to(device)

# Save the quantized weights and the tokenizer so they can be reloaded directly
cur_save_dir = "/home/weights/nga_tmp_bit/"
model.save_pretrained(cur_save_dir)
tokenizer.save_pretrained(cur_save_dir)
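
Loading the saved quantized model is a separate script. A minimal sketch, assuming the checkpoint in cur_save_dir reloads directly through from_pretrained with trust_remote_code, and that the model exposes the chat() helper that ChatGLM-style remote code provides:

from transformers import AutoTokenizer, AutoModel
import torch

cur_save_dir = "/home/weights/nga_tmp_bit/"

# The weights on disk are already quantized, so no second quantize() call is needed
tokenizer = AutoTokenizer.from_pretrained(cur_save_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(cur_save_dir, trust_remote_code=True)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
model = model.eval()

# chat() returns the response text and the updated conversation history
response, history = model.chat(tokenizer, "Hello", history=[])
print(response)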
