# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re
import cv2
import gradio as gr
# Add Depth Anything repo to path
sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")
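# NOTE: this path assumes a local clone of the Depth-Anything-V2 repository at that
# location; adjust it for your environment so `depth_anything_v2` can be imported.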
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2 # Corrected import
# ----------------------------
# STEP 2: Load Models
# ----------------------------
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"π Using device: {device}")
# Load Kosmos-2
print("π¦ Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
"microsoft/kosmos-2-patch14-224"
).to(device)
# Load Depth Anything V2
print("π¦ Loading Depth Anything V2...")
model_config = {
'encoder': 'vitl',
'features': 256,
'out_channels': [256, 512, 1024, 1024],
}
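# NOTE: this config matches the ViT-L checkpoint loaded below; the smaller
# 'vitb'/'vits' encoders (with their own feature/out_channels settings) are an
# option on lower-memory GPUs.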
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
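# Both models now live on `device`; on CPU-only hosts, Kosmos-2 generation in
# particular can be slow, which is why max_new_tokens is kept small below.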
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------
def generate_caption(image_array):
    try:
        import time
        print("Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)
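        # "<grounding>" asks Kosmos-2 to ground noun phrases in the image;
        # "An image of" is the usual captioning prefix used with this model.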
prompt = "<grounding> An image of"
inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)
print("βοΈ Running caption generation...")
start = time.time()
outputs = model_kosmos.generate(
pixel_values=inputs["pixel_values"],
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
image_embeds=None,
image_embeds_position_mask=inputs["image_embeds_position_mask"],
max_new_tokens=32, # reduced for speed
)
end = time.time()
print(f"β±οΈ Captioning took: {end - start:.2f} seconds")
        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."
    except Exception as e:
        print(f"Captioning error: {e}")
        return f"Error: {e}"
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))
        print("Estimating depth...")
        with torch.no_grad():
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])  # RGB -> BGR for infer_image
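        # Depth Anything V2 returns a relative depth map where larger values
        # generally correspond to nearer surfaces, so the high-percentile band
        # below is treated as the foreground.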
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)
        print("Segmenting image...")
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
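        # Split pixels into three depth bands: nearest 30% (foreground),
        # middle 40% (midground), and farthest 30% (background).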
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)
        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)
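        # np.where zeroes out pixels outside each band, so every segment is
        # captioned in isolation against a black background.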
print("π Generating captions...")
caption_top = generate_caption(top_image)
caption_mid = generate_caption(mid_image)
caption_bottom = generate_caption(bottom_image)
print("β
Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom
        )
    except Exception as e:
        print(f"Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
)
print("π Launching Gradio App...")
demo.launch(debug=True, share=True)