from transformers import AutoProcessor, AutoModel
import torch
import gradio as gr
from PIL import Image

# Load the SigLIP 2 processor and model from the Hugging Face Hub
model_name = "google/siglip2-base-patch16-224"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
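
# Optional (a sketch, not part of the original app): run inference on a GPU
# when one is available; the `inputs` tensors built inside match_image_text
# would then need a matching .to(device) call as well.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)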


def match_image_text(image, text):
    # SigLIP models are trained with text padded to a fixed length, so use
    # padding="max_length" (64 tokens for SigLIP 2) rather than padding=True
    inputs = processor(text=text, images=image, return_tensors="pt", padding="max_length", max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)

    # Embeddings for the image and the text (one row each here)
    image_embeds = outputs.image_embeds
    text_embeds = outputs.text_embeds

    # Cosine similarity between the two embeddings, in [-1, 1]
    similarity = torch.nn.functional.cosine_similarity(image_embeds, text_embeds).item()

    # Map the similarity from [-1, 1] onto a 1-100 display scale
    score = (similarity + 1) / 2 * 99 + 1
    return f"💡 Matching Score: {score:.2f} / 100"


# Minimal Gradio UI: upload an image, enter a caption, get a match score
gr.Interface(
    fn=match_image_text,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Enter a caption")],
    outputs="text",
    title="SigLIP2 Image-Text Matching Score",
    description="Upload an image and enter a caption to see how well they match (scored 1-100)."
).launch()
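
# Quick check without the UI (a sketch; "example.jpg" and the caption are
# placeholder values, not files shipped with this script):
#   img = Image.open("example.jpg")
#   print(match_image_text(img, "a photo of a dog"))
#   print(match_probability(img, "a photo of a dog"))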