Exporting the CLIP Model to ONNX

OpenAI CLIP

GitHub - openai/CLIP: CLIP (Contrastive Language-Image Pretraining), Predict the most relevant text snippet given an image

import torch
import torch.nn as nn
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

model.float()
model.eval()

image = preprocess(Image.open("clip_dog.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a dog", "a cat"]).to(device)

print("text:", text)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

# export to ONNX: wrap the image and text encoders separately so each exported graph takes a single input


class ImgModelWrapper(nn.Module):
    def __init__(self):
        super(ImgModelWrapper, self).__init__()
        self.model = model

    def forward(self, image):
        image_features = self.model.encode_image(image)
        return image_features


class TxtModelWrapper(nn.Module):
    def __init__(self):
        super(TxtModelWrapper, self).__init__()
        self.model = model

    def forward(self, text):
        text_features = self.model.encode_text(text)
        return text_features


img_model = ImgModelWrapper()
txt_model = TxtModelWrapper()

torch.onnx.export(img_model,               # model being run
                  image,                         # model input (or a tuple for multiple inputs)
                  "openai_vit_img.onnx",   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=False,  # whether to execute constant folding for optimization
                  input_names=['input'],   # the model's input names
                  output_names=['output'],  # the model's output names
                  dynamic_axes={'input': {0: 'batch'}})
torch.onnx.export(txt_model,               # model being run
                  text,                         # model input (or a tuple for multiple inputs)
                  "openai_vit_txt.onnx",   # where to save the model (can be a file or file-like object)
                  export_params=True,        # store the trained parameter weights inside the model file
                  opset_version=12,          # the ONNX version to export the model to
                  do_constant_folding=False,  # whether to execute constant folding for optimization
                  input_names=['input'],   # the model's input names
                  output_names=['output'],  # the model's output names
                  dynamic_axes={'input': {0: 'batch'}})
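
To sanity-check the exported graphs, here is a minimal sketch that runs both ONNX files with onnxruntime and reproduces the PyTorch probabilities. It assumes onnxruntime is installed and reuses the image, text, and model objects from the script above; the logit scale is read from the loaded PyTorch model rather than hard-coded.

import numpy as np
import onnxruntime as ort

# Load the two exported encoders.
img_session = ort.InferenceSession("openai_vit_img.onnx")
txt_session = ort.InferenceSession("openai_vit_txt.onnx")

# Run them on the same inputs used for tracing.
img_feat = img_session.run(None, {"input": image.cpu().numpy()})[0]
txt_feat = txt_session.run(None, {"input": text.cpu().numpy()})[0]

# Reproduce CLIP's similarity: L2-normalize features, scale, softmax over prompts.
img_feat = img_feat / np.linalg.norm(img_feat, axis=-1, keepdims=True)
txt_feat = txt_feat / np.linalg.norm(txt_feat, axis=-1, keepdims=True)
logits = model.logit_scale.exp().item() * img_feat @ txt_feat.T

logits = logits - logits.max(axis=-1, keepdims=True)  # numerically stable softmax
onnx_probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
print("ONNX label probs:", onnx_probs)  # should closely match the PyTorch result above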

Chinese-CLIP can be exported with the same approach as above; a loading sketch follows the link below.

GitHub - OFA-Sys/Chinese-CLIP: Chinese version of CLIP which achieves Chinese cross-modal retrieval and representation generation.
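
For reference, loading Chinese-CLIP looks roughly like the sketch below (following the Chinese-CLIP README; the checkpoint name "ViT-B-16", the download_root, and the example prompts are placeholders). After loading, the same ImgModelWrapper / TxtModelWrapper classes and torch.onnx.export calls apply unchanged.

import torch
from PIL import Image
import cn_clip.clip as clip
from cn_clip.clip import load_from_name

device = "cuda" if torch.cuda.is_available() else "cpu"
# "ViT-B-16" is one of the released Chinese-CLIP checkpoints; pick the one you need.
model, preprocess = load_from_name("ViT-B-16", device=device, download_root="./")
model.float()
model.eval()

image = preprocess(Image.open("clip_dog.png")).unsqueeze(0).to(device)
text = clip.tokenize(["一只狗", "一只猫"]).to(device)  # placeholder Chinese prompts

# model.encode_image / model.encode_text exist here as well, so the wrapper classes
# and torch.onnx.export calls above can be reused, e.g. writing chinese_clip_img.onnx
# and chinese_clip_txt.onnx.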

A research direction for CLIP that I think could be worthwhile: the same sentence expressed in different languages should ideally map to similar semantic feature vectors. At present, however, each language trains its own CLIP text and image encoders against its own image data, so the intermediate feature vectors differ across languages and each language ends up with a different set of models. If every language produced similar semantic vectors, you could simply swap in a different text encoder while reusing the same image encoder. Likewise, when combining CLIP with Stable Diffusion, you would not need to retrain Stable Diffusion for each language; retraining the text encoder alone would suffice.

Similar work:

FlagAI/README.md at master · FlagAI-Open/FlagAI · GitHub

However, that work only unifies the semantics of a handful of multilingual models and still requires retraining Stable Diffusion. My ideal scenario is that Stable Diffusion never needs retraining: for any language, only an aligned text encoder has to be retrained.


Reposted from blog.csdn.net/u013701860/article/details/129764269