OpenAI CLIP
import torch
import torch.nn as nn
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.float()  # cast to fp32 so the ONNX export does not trace fp16 weights
model.eval()

image = preprocess(Image.open("clip_dog.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a dog", "a cat"]).to(device)
print("text:", text)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # one probability per prompt, e.g. [[0.99 0.01]]
# Export to ONNX: wrap the image and text encoders so each can be
# traced as a single-input module.
class ImgModelWrapper(nn.Module):
    def __init__(self):
        super(ImgModelWrapper, self).__init__()
        self.model = model

    def forward(self, image):
        image_features = self.model.encode_image(image)
        return image_features

class TxtModelWrapper(nn.Module):
    def __init__(self):
        super(TxtModelWrapper, self).__init__()
        self.model = model

    def forward(self, text):
        text_features = self.model.encode_text(text)
        return text_features

img_model = ImgModelWrapper()
txt_model = TxtModelWrapper()
torch.onnx.export(img_model,                 # model being run
                  image,                     # model input (or a tuple for multiple inputs)
                  "openai_vit_img.onnx",     # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=False, # whether to execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'],   # the model's output names
                  dynamic_axes={'input': {0: 'batch'}})  # variable batch dimension

torch.onnx.export(txt_model,                 # model being run
                  text,                      # model input (or a tuple for multiple inputs)
                  "openai_vit_txt.onnx",     # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=False, # whether to execute constant folding for optimization
                  input_names=['input'],     # the model's input names
                  output_names=['output'],   # the model's output names
                  dynamic_axes={'input': {0: 'batch'}})  # variable batch dimension
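To sanity-check the export, the ONNX graph can be run with onnxruntime and compared against the PyTorch features. A minimal sketch, assuming onnxruntime is installed; small floating-point differences between the two backends are expected:

import numpy as np
import onnxruntime as ort

# Run the exported image encoder on the same preprocessed image.
sess = ort.InferenceSession("openai_vit_img.onnx", providers=["CPUExecutionProvider"])
onnx_features = sess.run(["output"], {"input": image.cpu().numpy()})[0]
# The ONNX and PyTorch features should agree up to numerical noise.
print("max abs diff:", np.abs(onnx_features - image_features.cpu().numpy()).max())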
Chinese-CLIP can be exported with exactly the same approach as above.
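A standalone loading sketch based on the cn_clip package's published interface (the model name, download path, and image file are illustrative); the wrapper-and-export steps above then apply unchanged:

import torch
from PIL import Image
import cn_clip.clip as clip
from cn_clip.clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = load_from_name("ViT-B-16", device=device, download_root="./")
model.eval()

image = preprocess(Image.open("clip_dog.png")).unsqueeze(0).to(device)
text = clip.tokenize(["一只狗", "一只猫"]).to(device)
# encode_image / encode_text exist here too, so the same
# ImgModelWrapper / TxtModelWrapper + torch.onnx.export code works as-is.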
One research direction for CLIP that seems promising to me: the same sentence expressed in different languages should map to similar semantic feature vectors. Today, however, each language trains its own CLIP text and image encoders against its own image data, so the intermediate feature spaces differ and every language ends up with a different model. If every language produced similar semantic vectors, you could swap in a different text encoder while reusing the same image encoder. Likewise, when combining CLIP with Stable Diffusion, there would be no need to retrain Stable Diffusion for each language; only the text encoder would need retraining.
Similar work: FlagAI/README.md at master · FlagAI-Open/FlagAI · GitHub
That work, however, only unifies the semantics of a handful of multilingual models and still requires retraining Stable Diffusion. The ideal situation would require no retraining of Stable Diffusion at all: for any language, you would only train an aligned text encoder.
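One way to make this concrete is teacher-student distillation: freeze the original CLIP text encoder as the teacher and train a new-language text encoder to match its embeddings on parallel sentence pairs. A hypothetical sketch of that idea; `student` (any text model whose output dimension matches CLIP's embedding width) and `pairs` (a loader of parallel English/target-language sentence batches) are placeholders, not real APIs:

import torch
import torch.nn.functional as F

teacher = model  # the frozen English CLIP loaded earlier
for p in teacher.parameters():
    p.requires_grad_(False)

optimizer = torch.optim.AdamW(student.parameters(), lr=1e-5)  # `student` is hypothetical
for en_batch, xx_batch in pairs:  # parallel (English, target-language) sentences
    with torch.no_grad():
        target = teacher.encode_text(clip.tokenize(en_batch).to(device)).float()
    pred = student(xx_batch)  # the student handles its own tokenization
    loss = F.mse_loss(pred, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# After training, `student` replaces the text encoder while the image
# encoder (and Stable Diffusion, in the text-to-image setting) stays fixed.

MSE against the teacher is the simplest alignment objective; a contrastive loss over the batch would serve the same purpose.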