from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re

BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
BLIP_DEVICE = "cpu"
MAX_LENGTH = 120

# Loaded lazily on first use (see lazy_load_blip2).
processor = None
model = None


def lazy_load_blip2():
    """Load the BLIP-2 processor and model once, on first use."""
    global processor, model
    if processor is None or model is None:
        print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
        processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
        model = Blip2ForConditionalGeneration.from_pretrained(
            BLIP2_MODEL_NAME,
            torch_dtype=torch.float32
        ).to(BLIP_DEVICE).eval()


def clean_caption(text: str) -> str:
    """Collapse whitespace, strip stray quotes, and capitalize the caption."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' "\n')
    return text[0].upper() + text[1:] if text else text


def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each uploaded image with BLIP-2 and return a combined description."""
    if not images:
        return {"style_description": "", "full_caption": ""}
    lazy_load_blip2()
    captions = []
    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, mood or tone, lighting or rendering cues, "
        "and describe how people interact with objects if applicable."
    )
    for img in images:
        try:
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            cleaned = clean_caption(caption)
            # Skip empty captions and exact duplicates across images.
            if cleaned and cleaned not in captions:
                captions.append(cleaned)
        except Exception as e:
            print(f"[BLIP-2 ERROR] Failed to describe image: {e}")
            continue
    joined_caption = "; ".join(captions)
    return {
        "style_description": joined_caption,
        "full_caption": joined_caption
    }
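

# A minimal usage sketch of the function above. The path "example.jpg" is a
# hypothetical placeholder, and the exact caption text depends on the BLIP-2
# checkpoint and the input image.
if __name__ == "__main__":
    sample = Image.open("example.jpg").convert("RGB")  # hypothetical local file
    result = describe_uploaded_images([sample])
    print(result["style_description"])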