from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re

BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
BLIP_DEVICE = "cpu"
MAX_LENGTH = 120

# Loaded lazily on first use (see lazy_load_blip2).
processor = None
model = None


def lazy_load_blip2():
    """Load the BLIP-2 processor and model once, on first use."""
    global processor, model
    if processor is None or model is None:
        print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
        processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
        model = Blip2ForConditionalGeneration.from_pretrained(
            BLIP2_MODEL_NAME,
            torch_dtype=torch.float32
        ).to(BLIP_DEVICE).eval()


def clean_caption(text: str) -> str:
    """Collapse whitespace, strip stray quotes, and capitalize the caption."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' "\n')
    return text[0].upper() + text[1:] if text else text


def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each uploaded image with BLIP-2 and return a combined description."""
    if not images:
        return {"style_description": "", "full_caption": ""}
    lazy_load_blip2()
    captions = []
    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, mood or tone, lighting or rendering cues, "
        "and describe how people interact with objects if applicable."
    )
    for img in images:
        try:
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            cleaned = clean_caption(caption)
            # Skip empty captions and exact duplicates across images.
            if cleaned and cleaned not in captions:
                captions.append(cleaned)
        except Exception as e:
            print(f"[BLIP-2 ERROR] Failed to describe image: {e}")
            continue
    joined_caption = "; ".join(captions)
    return {
        "style_description": joined_caption,
        "full_caption": joined_caption
    }
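

# A minimal usage sketch of the function above. The path "example.jpg" is a
# hypothetical placeholder, and the exact caption text depends on the BLIP-2
# checkpoint and the input image.
if __name__ == "__main__":
    sample = Image.open("example.jpg").convert("RGB")  # hypothetical local file
    result = describe_uploaded_images([sample])
    print(result["style_description"])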