from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re

BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
BLIP_DEVICE = "cpu"
MAX_LENGTH = 120  # maximum number of new tokens generated per caption

# Lazily initialised globals so the model is only loaded on first use.
processor = None
model = None


def lazy_load_blip2():
    """Load the BLIP-2 processor and model once, keeping them in module globals."""
    global processor, model
    if processor is None or model is None:
        print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
        processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
        model = (
            Blip2ForConditionalGeneration.from_pretrained(
                BLIP2_MODEL_NAME, torch_dtype=torch.float32
            )
            .to(BLIP_DEVICE)
            .eval()
        )


def clean_caption(text: str) -> str:
    """Collapse whitespace, strip stray quotes, and capitalise the caption."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' "\n')
    return text[0].upper() + text[1:] if text else text


def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each uploaded image with BLIP-2 and join the unique results."""
    if not images:
        return {"style_description": "", "full_caption": ""}

    lazy_load_blip2()

    captions = []
    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, "
        "mood or tone, lighting or rendering cues, and describe how people "
        "interact with objects if applicable."
    )

    for img in images:
        try:
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            cleaned = clean_caption(caption)
            if cleaned and cleaned not in captions:
                captions.append(cleaned)
        except Exception as e:
            print(f"❌ [BLIP-2 ERROR] Failed to describe image: {e}")
            continue

    joined_caption = "; ".join(captions)
    return {
        "style_description": joined_caption,
        "full_caption": joined_caption,
    }
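

# Minimal usage sketch (assumption, not part of the original module): the file
# paths below are hypothetical placeholders; swap in real image files to try it.
if __name__ == "__main__":
    sample_paths = ["reference_1.png", "reference_2.png"]  # hypothetical inputs
    imgs = [Image.open(p).convert("RGB") for p in sample_paths]
    result = describe_uploaded_images(imgs)
    print(result["style_description"])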