# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re
import cv2
import gradio as gr
# Add the local Depth-Anything-V2 repo clone to the import path (adjust this path for your machine)
sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2  # provided by the repo added to sys.path above
# ----------------------------
# STEP 2: Load Models
# ----------------------------
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"πŸš€ Using device: {device}")
# Load Kosmos-2
print("πŸ“¦ Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)
# Load Depth Anything V2
print("πŸ“¦ Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
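# Note: these are the ViT-L ('vitl') settings matching the checkpoint downloaded below.
# The Depth Anything V2 repo also ships smaller 'vits' and 'vitb' variants with their own
# features/out_channels values and checkpoints — if you swap encoders, copy the matching
# config from the repo's README rather than reusing these numbers.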
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
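# Optional sanity check (a minimal sketch, left commented out so nothing extra runs at startup;
# assumes infer_image accepts an HxWx3 uint8 BGR array, as in the official repo):
# _probe = np.zeros((64, 64, 3), dtype=np.uint8)
# print("Depth sanity check, output shape:", model_depth.infer_image(_probe).shape)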
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------
def generate_caption(image_array):
    try:
        import time
        print("🔍 Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)
        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)
        print("✍️ Running caption generation...")
        start = time.time()
        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # reduced for speed
        )
        end = time.time()
        print(f"⏱️ Captioning took: {end - start:.2f} seconds")
        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."
    except Exception as e:
        print(f"❌ Captioning error: {e}")
        return f"Error: {e}"
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("📥 Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))
        print("🧠 Estimating depth...")
        with torch.no_grad():
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])  # RGB -> BGR, as infer_image expects cv2-style BGR
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)
        print("🔪 Segmenting image...")
        # Split pixels into the top 30%, middle 40%, and bottom 30% of the normalized depth range
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)
        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)
        print("📝 Generating captions...")
        caption_top = generate_caption(top_image)
        caption_mid = generate_caption(mid_image)
        caption_bottom = generate_caption(bottom_image)
        print("✅ Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom
        )
    except Exception as e:
        print(f"❌ Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
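# Direct (non-Gradio) usage (a minimal sketch, commented out — "photo.jpg" is a placeholder path):
# fg_img, mid_img, bg_img, cap_fg, cap_mid, cap_bg = depth_caption_pipeline(Image.open("photo.jpg"))
# print(cap_fg, cap_mid, cap_bg, sep="\n")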
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="📤 Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
)
print("πŸš€ Launching Gradio App...")
demo.launch(debug=True, share=True)
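# Note: share=True asks Gradio to create a temporary public link in addition to the local URL;
# drop it (and debug=True) if you only need local access.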