Spaces:

devleenaaaaaa
/

Depath-Aware-Captioning

Runtime error

File size: 5,618 Bytes

05fc032

# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re
import cv2
import gradio as gr

# Make the Depth-Anything-V2 checkout importable without relying on a single
# developer machine's directory layout.  Candidates, in priority order:
#   1. the directory named by the DEPTH_ANYTHING_V2_PATH environment variable,
#   2. a "Depth-Anything-V2" directory next to this script (or the current
#      working directory when __file__ is unavailable),
#   3. the original hard-coded Windows path, kept so existing local setups
#      continue to work unchanged.
_LEGACY_DEPTH_REPO = r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2"
_script_dir = (
    os.path.dirname(os.path.abspath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)
for _candidate in (
    os.environ.get("DEPTH_ANYTHING_V2_PATH"),
    os.path.join(_script_dir, "Depth-Anything-V2"),
    _LEGACY_DEPTH_REPO,
):
    # Skip an unset environment variable and avoid duplicate entries.
    if _candidate and _candidate not in sys.path:
        sys.path.append(_candidate)

from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2 # Corrected import

# ----------------------------
# STEP 2: Load Models
# ----------------------------

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Using device: {device}")

# Kosmos-2: `processor` prepares the text/image inputs and decodes outputs,
# `model_kosmos` generates the caption tokens.
print("📦 Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
)
model_kosmos = model_kosmos.to(device)

# Depth Anything V2: build the 'vitl' architecture, then fill it with the
# checkpoint downloaded from the Hugging Face Hub.
print("📦 Loading Depth Anything V2...")
model_config = dict(
    encoder='vitl',
    features=256,
    out_channels=[256, 512, 1024, 1024],
)
model_depth = DepthAnythingV2(**model_config)

checkpoint_path = hf_hub_download(
    filename="depth_anything_v2_vitl.pth",
    repo_id="depth-anything/Depth-Anything-V2-Large",
    repo_type="model",
)
# The weights are read onto the CPU first; the module is moved to `device`
# only after the state dict has been applied.
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device)
model_depth = model_depth.eval()

# ----------------------------
# STEP 3: Caption Generator
# ----------------------------

def _extract_phrases(raw_text):
    """Return the non-empty, whitespace-trimmed ``<phrase>…</phrase>`` spans in *raw_text*."""
    matches = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
    # The text between the tags may carry surrounding whitespace (the Kosmos-2
    # documentation example shows "<phrase> a snowman</phrase>"); trim it so
    # the joined caption does not contain stray or doubled spaces.
    trimmed = (match.strip() for match in matches)
    return [phrase for phrase in trimmed if phrase]


def generate_caption(image_array):
    """Describe an image with Kosmos-2 and return its grounded phrases.

    The image is resized to 224x224, passed to Kosmos-2 together with the
    prompt ``"<grounding> An image of"``, and the text found inside
    ``<phrase>`` tags of the decoded output is collected.

    Args:
        image_array: Image as a NumPy array; it is cast to ``uint8`` before
            resizing.  # assumes an H x W x 3 RGB array — confirm with callers

    Returns:
        The phrases joined with ``", "``; ``"No description found."`` when the
        output contains no phrase; or ``"Error: <message>"`` when any step
        raises.  This function never propagates an exception, so the UI
        always receives a string.
    """
    try:
        import time

        print("🔁 Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)

        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)

        print("✍️ Running caption generation...")
        # perf_counter is monotonic, so the reported duration cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()

        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # reduced for speed
        )

        elapsed = time.perf_counter() - start
        print(f"⏱️ Captioning took: {elapsed:.2f} seconds")

        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = _extract_phrases(raw_text)

        if phrases:
            return ", ".join(phrases)
        return "No description found."

    except Exception as e:
        # Deliberate catch-all at the UI boundary: report the failure as the
        # caption text instead of crashing the request.
        print(f"❌ Captioning error: {e}")
        return f"Error: {e}"


# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def _normalize_depth_to_uint8(depth_map):
    """Linearly rescale *depth_map* to the 0-255 range and return it as uint8."""
    depth = np.asarray(depth_map)
    lowest = depth.min()
    span = depth.max() - lowest
    if not np.isfinite(span) or span <= 0:
        # A constant (or non-finite) depth map would make the rescale a 0/0
        # division and produce NaNs; treat every pixel as the same depth.
        return np.zeros(depth.shape, dtype=np.uint8)
    return ((depth - lowest) / span * 255.0).astype(np.uint8)


def _pipeline_failure(message):
    """Build the six-value output tuple used when the pipeline cannot finish."""
    text = f"Error: {message}"
    return (None, None, None, text, text, text)


def depth_caption_pipeline(uploaded_image):
    """Split an image into three depth layers and caption each layer.

    A depth map is estimated for the image and normalised to 0-255.  Pixels
    are then assigned to one of three layers by the 30th and 70th percentiles
    of the normalised values: above the 70th, between the two (inclusive),
    and below the 30th.  Pixels outside a layer are set to black, and each
    layer image is captioned with ``generate_caption``.
    NOTE(review): the UI labels the highest-valued layer "Foreground", i.e. it
    assumes larger depth-model outputs mean nearer pixels — confirm.

    Args:
        uploaded_image: PIL image supplied by the Gradio input, or ``None``
            when nothing was uploaded.

    Returns:
        A 6-tuple ``(top_image, mid_image, bottom_image, caption_top,
        caption_mid, caption_bottom)``.  On failure the three images are
        ``None`` and the three captions carry the same ``"Error: ..."`` text;
        no exception is propagated.
    """
    if uploaded_image is None:
        # Submitting without an image hands None to this function; report that
        # directly instead of surfacing an AttributeError message.
        print("❌ Pipeline error: no image uploaded")
        return _pipeline_failure("no image uploaded")

    try:
        print("📥 Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))

        print("🧠 Estimating depth...")
        # The depth model is fed BGR channel order.  Reversing the channel
        # axis yields a negative-stride view, so hand over a contiguous copy.
        image_bgr = np.ascontiguousarray(image_np[:, :, ::-1])
        with torch.no_grad():
            depth_map = model_depth.infer_image(image_bgr)
        depth_gray = _normalize_depth_to_uint8(depth_map)

        print("🔪 Segmenting image...")
        bottom30, top30 = np.percentile(depth_gray, [30, 70])
        layer_masks = (
            depth_gray > top30,
            (depth_gray >= bottom30) & (depth_gray <= top30),
            depth_gray < bottom30,
        )
        # Broadcast each 2-D mask over the colour channels; the uint8 zero
        # keeps the layer dtype uint8 regardless of NumPy's scalar promotion.
        layer_images = [
            np.where(mask[..., np.newaxis], image_np, np.uint8(0))
            for mask in layer_masks
        ]

        print("📝 Generating captions...")
        captions = [generate_caption(layer) for layer in layer_images]

        print("✅ Completed successfully.")
        return (
            *(Image.fromarray(layer.astype("uint8")) for layer in layer_images),
            *captions,
        )

    except Exception as e:
        # Deliberate catch-all at the UI boundary: show the failure in the
        # caption boxes rather than failing the whole request.
        print(f"❌ Pipeline error: {e}")
        return _pipeline_failure(e)

# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
# Input widget: the upload is delivered to the pipeline as a PIL image.
_upload_widget = gr.Image(type="pil", label="📤 Upload an Image")

# Output widgets, in the order depth_caption_pipeline returns its values:
# the three depth-layer images first, then the caption for each layer.
_layer_image_labels = (
    "Foreground (Top 30%)",
    "Midground (Mid 40%)",
    "Background (Bottom 30%)",
)
_layer_caption_labels = (
    "Caption - Foreground",
    "Caption - Midground",
    "Caption - Background",
)
_result_widgets = [gr.Image(label=text) for text in _layer_image_labels]
_result_widgets += [gr.Textbox(label=text) for text in _layer_caption_labels]

demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=_upload_widget,
    outputs=_result_widgets,
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding.",
)

# Start the web app as soon as the script runs, with debug output enabled and
# a public share link requested.
print("🚀 Launching Gradio App...")
demo.launch(debug=True, share=True)