import gc
import os
from pathlib import Path

import spaces  # imported before torch so HF ZeroGPU can hook CUDA initialization
import gradio as gr
import torch
import torchaudio

from config import LOGS_DIR, OUTPUT_DIR
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio

# Ensure required directories exist
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Prepare external model dir and download checkpoint if missing
depthfm_ckpt = Path('external_models/depth-fm/checkpoints/depthfm-v1.ckpt')
if not depthfm_ckpt.exists():
    depthfm_ckpt.parent.mkdir(parents=True, exist_ok=True)
    os.system('wget https://ommer-lab.com/files/depthfm/depthfm-v1.ckpt -P external_models/depth-fm/checkpoints/')


# Free GPU memory between runs (safe no-op when CUDA is unavailable)
def clear_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def process_images(
    image_dir: str,
    output_dir: str,
    panoramic: bool,
    view: str,
    model: str,
    location: str,
    audio_duration: int,
    cpu_only: bool
) -> None:
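    """Generate audio for the requested view(s).

    Reads <view>.jpg images from image_dir and writes sound_<view>.wav files
    to output_dir; in panoramic mode it also writes panoramic_composition.wav.
    """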
    lat, lon = (part.strip() for part in location.split(","))
    os.makedirs(output_dir, exist_ok=True)
    sound_mapper = SoundMapper()
    audio_generator = GenerateAudio()

    if panoramic:
        # Panoramic: generate per-view audio then composition
        view_results = generate_caption(lat, lon, view=view, model=model,
                                        cpu_only=cpu_only, panoramic=True)
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = sorted(Path(image_dir).glob("*.jpg"))
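        # Assumes process_depth_maps() returns maps in the same sorted order
        # as the *.jpg files in image_dir.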
        audios = {}
        for vr in view_results:
            cv = vr["view"]
            img_file = Path(image_dir) / f"{cv}.jpg"
            if not img_file.exists():
                continue
            idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
            if not idx:
                continue
            depth_map = processed_maps[idx[0]]["normalization"]
            obj_depths = sound_mapper.analyze_object_depths(
                str(img_file), depth_map, lat, lon,
                caption_data=vr, all_objects=False
            )
            if not obj_depths:
                continue
            out_wav = Path(output_dir) / f"sound_{cv}.wav"
            audio, sr = audio_generator.process_and_generate_audio(
                obj_depths, duration=audio_duration
            )
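            # torchaudio.save expects a 2-D (channels, frames) tensor:
            # squeeze a batch dim or add a channel dim as needed.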
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            torchaudio.save(str(out_wav), audio, sr)
            audios[cv] = str(out_wav)
        # Mix the per-view tracks into the final panoramic composition
        if audios:
            comp = Path(output_dir) / "panoramic_composition.wav"
            compose_audio(list(audios.values()), [1.0] * len(audios), str(comp))
        clear_cuda()
        return

    # Single-view: generate one audio
    vr = generate_caption(lat, lon, view=view, model=model,
                          cpu_only=cpu_only, panoramic=False)
    img_file = Path(image_dir) / f"{view}.jpg"
    processed_maps = sound_mapper.process_depth_maps()
    image_paths = sorted(Path(image_dir).glob("*.jpg"))
    idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
    if not idx:
        raise FileNotFoundError(f"No image found for view '{view}' in {image_dir}")
    depth_map = processed_maps[idx[0]]["normalization"]
    obj_depths = sound_mapper.analyze_object_depths(
        str(img_file), depth_map, lat, lon,
        caption_data=vr, all_objects=True
    )
    out_wav = Path(output_dir) / f"sound_{view}.wav"
    audio, sr = audio_generator.process_and_generate_audio(obj_depths, duration=audio_duration)
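    # torchaudio.save expects a 2-D (channels, frames) tensor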
    if audio.dim() == 3:
        audio = audio.squeeze(0)
    elif audio.dim() == 1:
        audio = audio.unsqueeze(0)
    torchaudio.save(str(out_wav), audio, sr)
    clear_cuda()

# Gradio UI
with gr.Blocks(title="Panoramic Audio Generator") as demo:
    gr.Markdown("""
    # Panoramic Audio Generator

    Generates an audio track for the selected view and shows each view next to
    its audio; in panoramic mode, the combined mix appears at the bottom.
    """)

    with gr.Row():
        panoramic = gr.Checkbox(label="Panoramic (multi-view)", value=False)
        view = gr.Dropdown(["front", "back", "left", "right"], value="front", label="View")
        location = gr.Textbox(value="52.3436723,4.8529625", label="Location (lat,lon)")
        model = gr.Textbox(value="intern_2_5-4B", label="Vision-Language Model")
        audio_duration = gr.Slider(1, 60, value=10, step=1, label="Audio Duration (sec)")
        cpu_only = gr.Checkbox(label="CPU Only", value=False)
        btn = gr.Button("Generate")

    # Output layout: two rows of two
    with gr.Row():
        with gr.Column():
            img_front = gr.Image(label="Front View", type="filepath")
            aud_front = gr.Audio(label="Front Audio", type="filepath")
        with gr.Column():
            img_back = gr.Image(label="Back View", type="filepath")
            aud_back = gr.Audio(label="Back Audio", type="filepath")
    with gr.Row():
        with gr.Column():
            img_left = gr.Image(label="Left View", type="filepath")
            aud_left = gr.Audio(label="Left Audio", type="filepath")
        with gr.Column():
            img_right = gr.Image(label="Right View", type="filepath")
            aud_right = gr.Audio(label="Right Audio", type="filepath")
    # Panorama at bottom
    img_pan = gr.Image(label="Panorama View", type="filepath")
    aud_pan = gr.Audio(label="Panoramic Audio", type="filepath")

    # On click: run the full pipeline and refresh every image/audio preview
    @spaces.GPU
    def run_all(pan, vw, loc, mdl, dur, cpu):
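        """Run the pipeline, then return (image, audio) paths for every view.

        The return order matches the outputs list wired to btn.click below.
        """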
        # generate files
        process_images(LOGS_DIR, OUTPUT_DIR, pan, vw, mdl, loc, dur, cpu)
        # collect files
        views = ["front", "back", "left", "right", "panorama"]
        paths = {}
        for v in views:
            img = Path(LOGS_DIR) / f"{v}.jpg"
            audio = Path(OUTPUT_DIR) / ("panoramic_composition.wav" if v == "panorama" else f"sound_{v}.wav")
            paths[v] = {
                'img': str(img) if img.exists() else None,
                'aud': str(audio) if audio.exists() else None
            }
        return (
            paths['front']['img'], paths['front']['aud'],
            paths['back']['img'],  paths['back']['aud'],
            paths['left']['img'],  paths['left']['aud'],
            paths['right']['img'], paths['right']['aud'],
            paths['panorama']['img'], paths['panorama']['aud']
        )

    btn.click(
        fn=run_all,
        inputs=[panoramic, view, location, model, audio_duration, cpu_only],
        outputs=[
            img_front, aud_front,
            img_back, aud_back,
            img_left, aud_left,
            img_right, aud_right,
            img_pan, aud_pan
        ]
    )

if __name__ == "__main__":
    demo.launch(show_api=False)