Spaces:
Paused
Paused
File size: 1,940 Bytes
a4075b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re
# Model identifier passed to both `from_pretrained` calls in lazy_load_blip2().
BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
# Device the model and the processed inputs are moved to with `.to(...)`.
BLIP_DEVICE = "cpu"
# Passed as `max_new_tokens` to `model.generate`, capping each caption's length.
MAX_LENGTH = 120
# Both stay None until lazy_load_blip2() populates them on first use.
processor = None
model = None
def lazy_load_blip2():
    """Populate the module-level ``processor`` and ``model`` on first use.

    Returns immediately when both globals are already set. Otherwise the
    processor and the model are both loaded from ``BLIP2_MODEL_NAME``; the
    model is requested in float32, moved to ``BLIP_DEVICE`` and switched to
    eval mode before being stored.
    """
    global processor, model

    # Nothing to do once both pieces have been loaded.
    if processor is not None and model is not None:
        return

    print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
    processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
    loaded_model = Blip2ForConditionalGeneration.from_pretrained(
        BLIP2_MODEL_NAME,
        torch_dtype=torch.float32,
    )
    model = loaded_model.to(BLIP_DEVICE).eval()
def clean_caption(text: str) -> str:
    """Normalise a raw caption string.

    Strips surrounding whitespace, collapses every whitespace run to a single
    space, trims leading/trailing spaces, double quotes and newlines, and
    upper-cases the first character. An empty result is returned unchanged.
    """
    collapsed = re.sub(r"\s+", " ", text.strip())
    trimmed = collapsed.strip(' "\n')
    if not trimmed:
        return trimmed
    return trimmed[0].upper() + trimmed[1:]
def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each image with BLIP-2 and return the combined description.

    Args:
        images: Images to describe. An empty or falsy value short-circuits
            without loading the model.

    Returns:
        A dict with keys ``"style_description"`` and ``"full_caption"``, both
        holding the same string: the unique, cleaned captions joined by
        ``"; "`` in first-seen order (empty strings when there are no images).

    Any exception raised while captioning a single image is printed and that
    image is skipped; the remaining images are still processed.
    """
    if not images:
        return {"style_description": "", "full_caption": ""}

    lazy_load_blip2()

    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, mood or tone, lighting or rendering cues, "
        "and describe how people interact with objects if applicable."
    )

    unique_captions = []
    for picture in images:
        try:
            encoded = processor(images=picture, text=prompt, return_tensors="pt")
            encoded = encoded.to(BLIP_DEVICE)
            output_ids = model.generate(**encoded, max_new_tokens=MAX_LENGTH)
            raw_text = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            tidy = clean_caption(raw_text)
        except Exception as e:
            print(f"❌ [BLIP-2 ERROR] Failed to describe image: {e}")
        else:
            # Keep only non-empty captions that have not been seen before.
            if tidy and tidy not in unique_captions:
                unique_captions.append(tidy)

    combined = "; ".join(unique_captions)
    return {"style_description": combined, "full_caption": combined}
|