# Spaces:

# FQiao
# /

# SoundingStreet

# Running on Zero

# File size: 8,007 Bytes

# 3324de2

import os
import argparse
from PIL import Image
import numpy as np
import torch
import torchaudio
import gc
from config import LOGS_DIR, OUTPUT_DIR
from DepthEstimator import DepthEstimator
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio

def _parse_location(location):
    """Split a "latitude,longitude" string into its two coordinate strings.

    Args:
        location: Text such as "40.7128,-74.0060". Whitespace around either
            coordinate is ignored.

    Returns:
        A ``(lat, lon)`` tuple of stripped strings. The coordinates are kept
        as strings because that is the form main() forwards downstream.

    Raises:
        ValueError: If the text does not hold exactly two comma-separated
            parts, or if either part is not a number.
    """
    message = (
        '--location must be "latitude,longitude" '
        f'(e.g. "40.7128,-74.0060"), got {location!r}'
    )
    parts = [part.strip() for part in location.split(",")]
    if len(parts) != 2:
        raise ValueError(message)
    try:
        for part in parts:
            float(part)  # validation only; the string form is what gets returned
    except ValueError:
        raise ValueError(message) from None
    lat, lon = parts
    return lat, lon


def _list_jpg_names(image_dir):
    """Return the ``.jpg`` file names in *image_dir*, in os.listdir() order."""
    return [name for name in os.listdir(image_dir) if name.endswith(".jpg")]


def _find_depth_map(processed_maps, jpg_names, image_name):
    """Return the "normalization" depth map matching *image_name*, or None.

    NOTE(review): this relies on ``processed_maps`` being ordered like the
    ``.jpg`` listing of the image directory. That is how the lookup has always
    worked here, but the ordering contract lives in SoundMapper - confirm.
    """
    try:
        position = jpg_names.index(image_name)
    except ValueError:
        return None
    try:
        entry = processed_maps[position]
    except IndexError:
        # Fewer processed maps than images: report "not found" instead of crashing.
        return None
    return entry["normalization"]


def _save_audio(audio, sample_rate, output_path):
    """Coerce *audio* to a 2-D tensor and write it to *output_path*.

    A 3-D tensor has its leading dimension squeezed and a 1-D tensor gains a
    leading dimension, so that a 2-D tensor is handed to torchaudio.save().

    Raises:
        ValueError: If the tensor still is not 2-D after that adjustment.
    """
    if audio.dim() == 3:
        audio = audio.squeeze(0)
    elif audio.dim() == 1:
        audio = audio.unsqueeze(0)

    if audio.dim() != 2:
        raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")

    torchaudio.save(output_path, audio, sample_rate)


def _run_panoramic(args, lat, lon):
    """Generate one audio file per captioned view, then mix them together."""
    print("-----------Processing panoramic images-----------")
    # Generate captions for all views at once with panoramic=True
    view_results = generate_caption(lat, lon, view=args.view, model=args.model,
                                    cpu_only=args.cpu_only, panoramic=True)
    if not view_results:
        print("Failed to generate captions for panoramic views")
        return

    sound_mapper = SoundMapper()
    processed_maps = sound_mapper.process_depth_maps()
    jpg_names = _list_jpg_names(args.image_dir)

    audio_generator = GenerateAudio()
    track_weights = {}  # output wav path -> mixing weight for that view

    total_views = len(view_results)
    for position, view_result in enumerate(view_results, start=1):
        current_view = view_result["view"]
        print(f"Processing {current_view} view ({position}/{total_views})")

        image_name = f"{current_view}.jpg"
        image_path = os.path.join(args.image_dir, image_name)
        if not os.path.exists(image_path):
            print(f"Warning: Image file {image_path} not found")
            continue

        depth_map = _find_depth_map(processed_maps, jpg_names, image_name)
        if depth_map is None:
            print(f"Could not find processed map for {current_view} view")
            continue

        object_depths = sound_mapper.analyze_object_depths(
            image_path, depth_map, lat, lon,
            caption_data=view_result,
            all_objects=False,
        )
        if not object_depths:
            print(f"No objects detected in the {current_view} view")
            continue

        output_path = os.path.join(args.output_dir, f"sound_{current_view}.wav")
        print(f"Generating audio for {current_view} view...")

        audio, sample_rate = audio_generator.process_and_generate_audio(
            object_depths,
            duration=args.audio_duration,
        )
        _save_audio(audio, sample_rate, output_path)

        # The first detected object's weight is used as the weight of the view.
        track_weights[output_path] = object_depths[0]['weight']

        print(f"Generated audio saved to: {output_path}")
        print("-" * 50)

    if track_weights:
        print("Composing final audio from all views...")
        composition_path = os.path.join(args.output_dir, "panoramic_composition.wav")
        compose_audio(
            list(track_weights.keys()),
            list(track_weights.values()),
            composition_path,
        )
        print(f"Final audio composition saved to: {composition_path}")

    # Drop the model-holding objects first so the collector and the CUDA
    # caching allocator can actually release what they were holding.
    del sound_mapper, audio_generator
    gc.collect()
    torch.cuda.empty_cache()


def _run_single(args, lat, lon):
    """Generate one audio file for the single view selected by ``args.view``."""
    print("Processing single image...")
    view_result = generate_caption(lat, lon, view=args.view, model=args.model,
                                   cpu_only=args.cpu_only, panoramic=False)
    if not view_result:
        print("Failed to generate caption for the view")
        return

    image_name = f"{args.view}.jpg"
    image_path = os.path.join(args.image_dir, image_name)
    if not os.path.exists(image_path):
        print(f"Error: Image file {image_path} not found")
        return
    print(f"Processing image: {image_path}")

    sound_mapper = SoundMapper()
    processed_maps = sound_mapper.process_depth_maps()
    depth_map = _find_depth_map(
        processed_maps, _list_jpg_names(args.image_dir), image_name
    )
    if depth_map is None:
        print(f"Could not find processed map for {image_name}")
        return

    print("Detecting objects and their depths...")
    object_depths = sound_mapper.analyze_object_depths(
        image_path, depth_map, lat, lon,
        caption_data=view_result,
        all_objects=True,
    )
    if not object_depths:
        print("No objects detected in the image.")
        return

    print(f"Detected {len(object_depths)} objects:")
    for obj in object_depths:
        print(f" - {obj['original_label']} (Zone: {obj['zone_description']}, Depth: {obj['mean_depth']:.4f})")

    print("Generating audio...")
    audio_generator = GenerateAudio()
    audio, sample_rate = audio_generator.process_and_generate_audio(
        object_depths,
        duration=args.audio_duration,
    )

    output_path = os.path.join(args.output_dir, f"sound_{args.view}.wav")
    _save_audio(audio, sample_rate, output_path)

    print(f"Generated audio saved to: {output_path}")


def main():
    """Command-line entry point: generate sound from street-level images.

    Parses the command-line options, creates the output directory, and then
    runs either the panoramic pipeline (one wav per captioned view plus a
    mixed "panoramic_composition.wav") or the single-view pipeline (one wav
    for ``--view``).

    A malformed ``--location`` ends the program through argparse's usual
    usage-error path instead of an unpacking traceback.
    """
    parser = argparse.ArgumentParser(description="Generate sound from panoramic images")
    parser.add_argument("--image_dir", type=str, default=LOGS_DIR, help="Directory containing input images")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help="Directory for output files")
    parser.add_argument("--audio_duration", type=int, default=10, help="Duration of generated audio in seconds")
    parser.add_argument("--location", type=str, default="52.3436723,4.8529625", help='Location in format "latitude,longitude" (e.g., "40.7128,-74.0060")')
    parser.add_argument("--view", type=str, default="front", choices=["front", "back", "left", "right"], help="Perspective view to analyze")
    parser.add_argument("--model", type=str, default="intern_2_5-4B", help="Vision-language model to use for analysis")
    parser.add_argument("--cpu_only", action="store_true", help="Force CPU usage even if CUDA is available")
    parser.add_argument("--panoramic", action="store_true", default=False,
                        help="Process panoramic images instead of a single image")
    args = parser.parse_args()

    try:
        lat, lon = _parse_location(args.location)
    except ValueError as err:
        # parser.error() prints the usage text plus the message and exits.
        parser.error(str(err))

    os.makedirs(args.output_dir, exist_ok=True)

    if args.panoramic:
        _run_panoramic(args, lat, lon)
    else:
        _run_single(args, lat, lon)
    

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
    # Usage:
    #   Single image:      python main.py --view front
    #   Panoramic images:  python main.py --panoramic