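"""Depth-aware image captioning demo.

Depth Anything V2 splits an uploaded image into foreground / midground /
background layers by depth percentile; Kosmos-2 then captions each layer.
Served as a Gradio app.
"""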
# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re
import cv2
import gradio as gr

# Make the Depth Anything V2 repo importable.
# Adjust this path to wherever the repo is cloned on your machine.
sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")

from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2
# ----------------------------
# STEP 2: Load Models
# ----------------------------
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load Kosmos-2
print("Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)
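# Kosmos-2 is a grounded vision-language model: prompted with "<grounding>",
# it wraps the objects it mentions in <phrase>...</phrase> tags, which
# generate_caption() below extracts with a regex.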
# Load Depth Anything V2
print("Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
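# These hyperparameters correspond to the ViT-L ("vitl") variant and must
# match the depth_anything_v2_vitl.pth checkpoint downloaded below.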
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
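# Note: the ViT-L encoder is slow on CPU; the Depth-Anything-V2 repo also
# ships smaller "vits" and "vitb" variants that load the same way with
# correspondingly smaller features/out_channels settings.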
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------
def generate_caption(image_array):
    try:
        print("Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)
        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)

        print("Running caption generation...")
        start = time.time()
        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # reduced for speed
        )
        end = time.time()
        print(f"Captioning took: {end - start:.2f} seconds")

        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
        if phrases:
            return ", ".join(phrases)
        return "No description found."
    except Exception as e:
        print(f"Captioning error: {e}")
        return f"Error: {e}"
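# Note: the Kosmos-2 processor also provides post_process_generation(), which
# strips location tokens and returns (caption, entities); the regex above is a
# lighter-weight alternative that keeps only the grounded phrases.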
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))

        print("Estimating depth...")
        with torch.no_grad():
            # infer_image expects an OpenCV-style BGR image
            depth_map = model_depth.infer_image(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
        # Normalize to 0-255; the epsilon guards against a constant depth map
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min() + 1e-8) * 255.0
        depth_gray = depth_norm.astype(np.uint8)
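        # Depth Anything V2 predicts relative inverse depth, so larger values
        # mean nearer surfaces: the top percentile band below is the foreground.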
print("πͺ Segmenting image...") | |
top30 = np.percentile(depth_gray.flatten(), 70) | |
bottom30 = np.percentile(depth_gray.flatten(), 30) | |
top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1) | |
mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1) | |
bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1) | |
top_image = np.where(top_mask_3d, image_np, 0) | |
mid_image = np.where(mid_mask_3d, image_np, 0) | |
bottom_image = np.where(bottom_mask_3d, image_np, 0) | |
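        # Each layer keeps the original RGB pixels inside its depth band and
        # zeroes the rest, so Kosmos-2 sees each layer on a black background.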
print("π Generating captions...") | |
caption_top = generate_caption(top_image) | |
caption_mid = generate_caption(mid_image) | |
caption_bottom = generate_caption(bottom_image) | |
print("β Completed successfully.") | |
return ( | |
Image.fromarray(top_image.astype("uint8")), | |
Image.fromarray(mid_image.astype("uint8")), | |
Image.fromarray(bottom_image.astype("uint8")), | |
caption_top, | |
caption_mid, | |
caption_bottom | |
) | |
except Exception as e: | |
print(f"β Pipeline error: {e}") | |
return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}") | |
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything V2 + Kosmos-2 vision-language grounding.",
)

print("Launching Gradio App...")
demo.launch(debug=True, share=True)
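# share=True requests a temporary public gradio.live link when run locally;
# on a hosted Space the flag is unnecessary and the app is served directly.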