from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re

BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
BLIP_DEVICE = "cpu"
MAX_LENGTH = 120  # maximum number of new tokens generated per caption

# Lazily initialised globals so the model is only loaded on first use.
processor = None
model = None


def lazy_load_blip2():
    """Load the BLIP-2 processor and model once, keeping them in module globals."""
    global processor, model
    if processor is None or model is None:
        print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
        processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
        model = (
            Blip2ForConditionalGeneration.from_pretrained(
                BLIP2_MODEL_NAME, torch_dtype=torch.float32
            )
            .to(BLIP_DEVICE)
            .eval()
        )


def clean_caption(text: str) -> str:
    """Collapse whitespace, strip stray quotes, and capitalise the caption."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' "\n')
    return text[0].upper() + text[1:] if text else text


def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each uploaded image with BLIP-2 and join the unique results."""
    if not images:
        return {"style_description": "", "full_caption": ""}

    lazy_load_blip2()

    captions = []
    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, "
        "mood or tone, lighting or rendering cues, and describe how people "
        "interact with objects if applicable."
    )

    for img in images:
        try:
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            cleaned = clean_caption(caption)
            if cleaned and cleaned not in captions:
                captions.append(cleaned)
        except Exception as e:
            print(f"❌ [BLIP-2 ERROR] Failed to describe image: {e}")
            continue

    joined_caption = "; ".join(captions)
    return {
        "style_description": joined_caption,
        "full_caption": joined_caption,
    }
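

# Minimal usage sketch (assumption, not part of the original module): the file
# paths below are hypothetical placeholders; swap in real image files to try it.
if __name__ == "__main__":
    sample_paths = ["reference_1.png", "reference_2.png"]  # hypothetical inputs
    imgs = [Image.open(p).convert("RGB") for p in sample_paths]
    result = describe_uploaded_images(imgs)
    print(result["style_description"])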