Text-image similarity
Use CLIP to calculate the similarity between an image generated in a text-to-image task and its corresponding prompt. The higher the score, the better.
from tqdm import tqdm
from PIL import Image
import torch
import os
import numpy as np
from transformers import CLIPProcessor, CLIPModel
# Load the CLIP weights and the matching processor once at import time so
# that every call to get_clip_score reuses them.
# NOTE(review): the bare name "clip-vit-base-patch16" (no "openai/" prefix) is
# presumably resolved as a directory relative to the working directory, so the
# checkpoint has to be downloaded beforehand — confirm against your setup.
model = CLIPModel.from_pretrained("clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("clip-vit-base-patch16")
def get_clip_score(image_path,text):
    """Score how well one image matches each of the given text prompts.

    Args:
        image_path: Path of the image file to score.
        text: Prompt(s) to compare against the image, e.g.
            ``['dog', 'cat', 'pig']``.

    Returns:
        The ``logits_per_image`` tensor produced by the CLIP model. It is
        expected to hold one row for the image and one column per prompt;
        a larger value means a closer image-text match.
    """
    # Image.open is lazy and keeps the file handle open; convert() decodes the
    # pixels into a detached copy so the handle can be closed right away.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
    # Pure inference: skip autograd bookkeeping so the returned scores do not
    # drag a computation graph along with them.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits_per_image
Example:
image_path='test.jpg'
text = ['dog','cat','pig'] # text must be a list
The corresponding output scores are [32.3232, 52.2312, 63.1298],
which are the similarities of test.jpg to 'dog', 'cat', and 'pig', respectively.
Note that the pre-trained model clip-vit-base-patch16 needs to be downloaded in advance and placed in the project directory.
Download link: https://huggingface.co/openai/clip-vit-base-patch16/tree/main (you may need to download the files one by one)
Image-Image similarity
Use CLIP to calculate the similarity between two images. Unlike SSIM, PSNR, and MSE, which mainly compare images at the pixel level, the similarity calculated here is measured at the image-feature level.
import torch
from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer
# from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import cv2
# Load the CLIP model and its image preprocessor once at import time; both are
# created from the same identifier so the preprocessing matches the weights.
# NOTE(review): the bare name (no "openai/" prefix) is presumably resolved as
# a local directory holding a pre-downloaded checkpoint — confirm.
model_ID = "clip-vit-base-patch16"
model = CLIPModel.from_pretrained(model_ID)
preprocess = CLIPImageProcessor.from_pretrained(model_ID)
# Define a function to load an image and preprocess it for CLIP
def load_and_preprocess_image(image_path):
    """Read an image from disk and run it through the CLIP image preprocessor.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The output of ``preprocess(..., return_tensors="pt")``; callers in
        this file read its ``"pixel_values"`` entry.
    """
    # Image.open is lazy and keeps the file handle open; convert() decodes the
    # pixels into a detached copy so the handle can be closed right away.
    with Image.open(image_path) as image_file:
        rgb_image = image_file.convert("RGB")
    return preprocess(rgb_image, return_tensors="pt")
def clip_img_score (img1_path,img2_path):
    """Return the cosine similarity of the CLIP image embeddings of two images.

    Args:
        img1_path: Path of the first image file.
        img2_path: Path of the second image file.

    Returns:
        The similarity as a Python float (``.item()`` of the cosine-similarity
        tensor).
    """
    # Preprocess both files first, keeping only the tensors the model consumes.
    pixel_batches = [
        load_and_preprocess_image(path)["pixel_values"]
        for path in (img1_path, img2_path)
    ]

    # Embeddings are only compared, never trained on, so no gradients needed.
    with torch.no_grad():
        features_first, features_second = [
            model.get_image_features(pixels) for pixels in pixel_batches
        ]

    cosine = torch.nn.functional.cosine_similarity(features_first, features_second)
    return cosine.item()
Usage:
score = clip_img_score(img1_path, img2_path)  # pass the paths of the two images to compare
It is worth noting that the version of transformers matters.
The blogger tested that the code runs normally with pip install transformers==4.25.0.