# File: uno-final/uno/utils/image_describer.py
# Web-page header residue kept for provenance:
#   Manireddy1508's picture / Upload 5 files / a4075b9 (verified)
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re
# Model identifier handed to both `from_pretrained` calls in `lazy_load_blip2`.
BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
# Device that the model and the processed inputs are moved to.
BLIP_DEVICE = "cpu"
# Passed as `max_new_tokens` to `model.generate`, i.e. the cap on newly
# generated tokens per caption.
MAX_LENGTH = 120
# Module-level singletons; both stay None until `lazy_load_blip2()` fills them.
processor = None
model = None
def lazy_load_blip2():
    """Populate the module-level ``processor`` and ``model`` on first use.

    Does nothing when both globals are already set. Otherwise it prints a
    status line, loads the processor for ``BLIP2_MODEL_NAME``, then loads the
    model in float32, moves it to ``BLIP_DEVICE`` and switches it to eval mode.
    """
    global processor, model

    # Fast path: everything is already in memory.
    if processor is not None and model is not None:
        return

    print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
    processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)

    # Build the model in a local first; the global is only bound once the
    # network has been loaded, moved to the device and put into eval mode.
    network = Blip2ForConditionalGeneration.from_pretrained(
        BLIP2_MODEL_NAME,
        torch_dtype=torch.float32,
    )
    network = network.to(BLIP_DEVICE)
    model = network.eval()
def clean_caption(text: str) -> str:
    """Normalize a generated caption for display.

    Surrounding whitespace is removed, every run of whitespace is collapsed
    to a single space, leading/trailing spaces, double quotes and newlines
    are stripped, and the first character is upper-cased.

    Args:
        text: Raw caption text.

    Returns:
        The cleaned caption, or an empty string if nothing is left.
    """
    collapsed = re.sub(r"\s+", " ", text.strip())
    trimmed = collapsed.strip(' "\n')
    if not trimmed:
        return trimmed
    head, tail = trimmed[0], trimmed[1:]
    return head.upper() + tail
def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each image with BLIP-2 and merge the results into one string.

    The model is loaded on demand via ``lazy_load_blip2``. Every image is
    captioned with the same fixed prompt; each caption is passed through
    ``clean_caption`` and kept only if it is non-empty and not an exact
    duplicate of one already collected. An image whose processing raises is
    reported with ``print`` and skipped.

    Args:
        images: Images to describe.

    Returns:
        A dict with keys ``"style_description"`` and ``"full_caption"``, both
        holding the collected captions joined by ``"; "``. Both values are
        empty strings when ``images`` is empty.
    """
    if not images:
        return {"style_description": "", "full_caption": ""}

    lazy_load_blip2()

    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, mood or tone, lighting or rendering cues, "
        "and describe how people interact with objects if applicable."
    )

    unique_captions = []
    for picture in images:
        try:
            batch = processor(images=picture, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            token_ids = model.generate(**batch, max_new_tokens=MAX_LENGTH)
            raw_text = processor.tokenizer.decode(token_ids[0], skip_special_tokens=True)
            tidy = clean_caption(raw_text)
        except Exception as e:
            # Best effort: one bad image must not abort the whole batch.
            print(f"❌ [BLIP-2 ERROR] Failed to describe image: {e}")
            continue
        else:
            # Keep first-seen order while dropping blanks and exact repeats.
            if tidy and tidy not in unique_captions:
                unique_captions.append(tidy)

    combined = "; ".join(unique_captions)
    return {
        "style_description": combined,
        "full_caption": combined,
    }