# ----------------------------
# STEP 1: Imports
# ----------------------------
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import re
import cv2
import gradio as gr
# Add the local Depth-Anything-V2 repo clone to the import path (adjust this path for your machine)
sys.path.append(r"C:\Users\Devleena\Desktop\New folder (3)\Depth-Anything-V2")
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
from depth_anything_v2.dpt import DepthAnythingV2  # provided by the repo added to sys.path above
# ----------------------------
# STEP 2: Load Models
# ----------------------------
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"πŸš€ Using device: {device}")
# Load Kosmos-2
print("πŸ“¦ Loading Kosmos-2...")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
model_kosmos = Kosmos2ForConditionalGeneration.from_pretrained(
    "microsoft/kosmos-2-patch14-224"
).to(device)
# Load Depth Anything V2
print("πŸ“¦ Loading Depth Anything V2...")
model_config = {
    'encoder': 'vitl',
    'features': 256,
    'out_channels': [256, 512, 1024, 1024],
}
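# Note: these are the ViT-L ('vitl') settings matching the checkpoint downloaded below.
# The Depth Anything V2 repo also ships smaller 'vits' and 'vitb' variants with their own
# features/out_channels values and checkpoints — if you swap encoders, copy the matching
# config from the repo's README rather than reusing these numbers.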
model_depth = DepthAnythingV2(**model_config)
checkpoint_path = hf_hub_download(
    repo_id="depth-anything/Depth-Anything-V2-Large",
    filename="depth_anything_v2_vitl.pth",
    repo_type="model"
)
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model_depth.load_state_dict(state_dict)
model_depth = model_depth.to(device).eval()
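# Optional sanity check (a minimal sketch, left commented out so nothing extra runs at startup;
# assumes infer_image accepts an HxWx3 uint8 BGR array, as in the official repo):
# _probe = np.zeros((64, 64, 3), dtype=np.uint8)
# print("Depth sanity check, output shape:", model_depth.infer_image(_probe).shape)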
# ----------------------------
# STEP 3: Caption Generator
# ----------------------------
def generate_caption(image_array):
    try:
        import time
        print("🔍 Resizing image for Kosmos-2...")
        resized = cv2.resize(image_array.astype("uint8"), (224, 224))
        pil_image = Image.fromarray(resized)
        prompt = "<grounding> An image of"
        inputs = processor(text=prompt, images=pil_image, return_tensors="pt").to(device)
        print("✍️ Running caption generation...")
        start = time.time()
        outputs = model_kosmos.generate(
            pixel_values=inputs["pixel_values"],
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            image_embeds=None,
            image_embeds_position_mask=inputs["image_embeds_position_mask"],
            max_new_tokens=32,  # reduced for speed
        )
        end = time.time()
        print(f"⏱️ Captioning took: {end - start:.2f} seconds")
        raw_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
        phrases = re.findall(r"<phrase>(.*?)</phrase>", raw_text)
        if phrases:
            return ", ".join(phrases) if len(phrases) > 1 else phrases[0]
        return "No description found."
    except Exception as e:
        print(f"❌ Captioning error: {e}")
        return f"Error: {e}"
# ----------------------------
# STEP 4: Depth Captioning Pipeline
# ----------------------------
def depth_caption_pipeline(uploaded_image):
    try:
        print("📥 Image uploaded.")
        image_np = np.array(uploaded_image.convert("RGB"))
        print("🧠 Estimating depth...")
        with torch.no_grad():
            depth_map = model_depth.infer_image(image_np[:, :, ::-1])  # RGB -> BGR, as infer_image expects cv2-style BGR
        depth_norm = (depth_map - depth_map.min()) / (depth_map.max() - depth_map.min()) * 255.0
        depth_gray = depth_norm.astype(np.uint8)
        print("🔪 Segmenting image...")
        # Split pixels into the top 30%, middle 40%, and bottom 30% of the normalized depth range
        top30 = np.percentile(depth_gray.flatten(), 70)
        bottom30 = np.percentile(depth_gray.flatten(), 30)
        top_mask_3d = np.stack([(depth_gray > top30)] * 3, axis=-1)
        mid_mask_3d = np.stack([((depth_gray >= bottom30) & (depth_gray <= top30))] * 3, axis=-1)
        bottom_mask_3d = np.stack([(depth_gray < bottom30)] * 3, axis=-1)
        top_image = np.where(top_mask_3d, image_np, 0)
        mid_image = np.where(mid_mask_3d, image_np, 0)
        bottom_image = np.where(bottom_mask_3d, image_np, 0)
        print("📝 Generating captions...")
        caption_top = generate_caption(top_image)
        caption_mid = generate_caption(mid_image)
        caption_bottom = generate_caption(bottom_image)
        print("✅ Completed successfully.")
        return (
            Image.fromarray(top_image.astype("uint8")),
            Image.fromarray(mid_image.astype("uint8")),
            Image.fromarray(bottom_image.astype("uint8")),
            caption_top,
            caption_mid,
            caption_bottom
        )
    except Exception as e:
        print(f"❌ Pipeline error: {e}")
        return (None, None, None, f"Error: {e}", f"Error: {e}", f"Error: {e}")
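# Direct (non-Gradio) usage (a minimal sketch, commented out — "photo.jpg" is a placeholder path):
# fg_img, mid_img, bg_img, cap_fg, cap_mid, cap_bg = depth_caption_pipeline(Image.open("photo.jpg"))
# print(cap_fg, cap_mid, cap_bg, sep="\n")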
# ----------------------------
# STEP 5: Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=depth_caption_pipeline,
    inputs=gr.Image(type="pil", label="📤 Upload an Image"),
    outputs=[
        gr.Image(label="Foreground (Top 30%)"),
        gr.Image(label="Midground (Mid 40%)"),
        gr.Image(label="Background (Bottom 30%)"),
        gr.Textbox(label="Caption - Foreground"),
        gr.Textbox(label="Caption - Midground"),
        gr.Textbox(label="Caption - Background"),
    ],
    title="Depth-Aware Image Captioning",
    description="Upload an image to generate layer-wise captions using Depth Anything + Kosmos-2. Powered by vision-language grounding."
)
print("πŸš€ Launching Gradio App...")
demo.launch(debug=True, share=True)
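# Note: share=True asks Gradio to create a temporary public link in addition to the local URL;
# drop it (and debug=True) if you only need local access.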