# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import re
import time

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
import gradio as gr

# Add the Depth Anything V2 repo to the Python path
sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")

from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2  # Corrected import

# ----------------------------
# STEP 2: Load Models
# ----------------------------
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using device: {device}")

# Load Kosmos-2
print("📦 Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)

# Load Depth Anything V2 (ViT-L encoder)
print("📦 Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()

# ----------------------------
# STEP 3: Caption Generator
# ----------------------------
def generate_caption(image_array):
    try:
        print("🔁 Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)

        # Kosmos-2 expects the <grounding> prompt tag for grounded captioning
        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)

        print("✍️ Running caption generation...")
        start = time.time()
        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # reduced for speed
        )
        end = time.time()
        print(f"⏱️ Captioning took: {end - start:.2f} seconds")

        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        # Kosmos-2 wraps grounded noun phrases in <phrase>...</phrase> tags
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."
    except Exception as e:
        print(f"❌ Captioning error: {e}")
        return f"Error: {e}"
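
# Optional alternative (a sketch, not part of the original pipeline): the Kosmos-2
# processor in transformers also exposes post_process_generation(), which strips the
# location tokens and returns the full caption plus the grounded entities. The helper
# name below is our own; swap it into generate_caption() if full sentences are
# preferred over the comma-joined phrase list.
def caption_via_postprocess(raw_text):
    processed_text, entities = processor.post_process_generation(raw_text)
    # entities is a list of (phrase, (start, end), [bounding boxes]) tuples
    return processed_text, entities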
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("📥 Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))

        print("🧠 Estimating depth...")
        with torch.no_grad():
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])  # infer_image expects BGR
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)

        print("🔪 Segmenting image...")
        # Depth Anything predicts inverse depth: larger values are closer to the camera
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)

        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)

        print("📝 Generating captions...")
        caption_top = generate_caption(top_image)
        caption_mid = generate_caption(mid_image)
        caption_bottom = generate_caption(bottom_image)

        print("✅ Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom,
        )
    except Exception as e:
        print(f"❌ Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")

# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="📤 Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding.",
)

print("🚀 Launching Gradio App...")
demo.launch(debug=True, share=True)
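
# Headless usage sketch (assumed local file name; not part of the original script).
# Run this in place of demo.launch() above, since launch(debug=True) blocks until the
# Gradio server is stopped.
# test_image = Image.open("sample.jpg")  # hypothetical test image
# fg, mid, bg, cap_fg, cap_mid, cap_bg = depth_caption_pipeline(test_image)
# print("Foreground:", cap_fg)
# print("Midground: ", cap_mid)
# print("Background:", cap_bg)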