import os
import argparse
import gc

import numpy as np
import torch
import torchaudio
from PIL import Image

from config import LOGS_DIR, OUTPUT_DIR
from DepthEstimator import DepthEstimator
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio


def main():
    parser = argparse.ArgumentParser(description="Generate sound from panoramic images")
    parser.add_argument("--image_dir", type=str, default=LOGS_DIR,
                        help="Directory containing input images")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR,
                        help="Directory for output files")
    parser.add_argument("--audio_duration", type=int, default=10,
                        help="Duration of generated audio in seconds")
    parser.add_argument("--location", type=str, default="52.3436723,4.8529625",
                        help='Location in format "latitude,longitude" (e.g., "40.7128,-74.0060")')
    parser.add_argument("--view", type=str, default="front",
                        choices=["front", "back", "left", "right"],
                        help="Perspective view to analyze")
    parser.add_argument("--model", type=str, default="intern_2_5-4B",
                        help="Vision-language model to use for analysis")
    parser.add_argument("--cpu_only", action="store_true",
                        help="Force CPU usage even if CUDA is available")
    parser.add_argument("--panoramic", action="store_true",
                        help="Process panoramic images instead of a single image")
    args = parser.parse_args()

    lat, lon = args.location.split(",")
    os.makedirs(args.output_dir, exist_ok=True)

    if args.panoramic:
        print("-----------Processing panoramic images-----------")
        # Generate captions for all views at once with panoramic=True
        view_results = generate_caption(lat, lon, view=args.view, model=args.model,
                                        cpu_only=args.cpu_only, panoramic=True)
        if not view_results:
            print("Failed to generate captions for panoramic views")
            return

        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f)
                       for f in os.listdir(args.image_dir) if f.endswith(".jpg")]

        # Create audio generator
        audio_generator = GenerateAudio()
        sound_tracks_dict = {}  # keep track of sound tracks and their weight

        # Process each view
        for i, view_result in enumerate(view_results):
            current_view = view_result["view"]
            print(f"Processing {current_view} view ({i+1}/{len(view_results)})")

            # Find corresponding image path for this view
            image_path = os.path.join(args.image_dir, f"{current_view}.jpg")
            if not os.path.exists(image_path):
                print(f"Warning: Image file {image_path} not found")
                continue

            image_index = [idx for idx, path in enumerate(image_paths)
                           if os.path.basename(path) == f"{current_view}.jpg"]
            if not image_index:
                print(f"Could not find processed map for {current_view} view")
                continue

            depth_map = processed_maps[image_index[0]]["normalization"]
            object_depths = sound_mapper.analyze_object_depths(
                image_path, depth_map, lat, lon,
                caption_data=view_result, all_objects=False
            )
            if not object_depths:
                print(f"No objects detected in the {current_view} view")
                continue

            # Generate audio for this view
            output_path = os.path.join(args.output_dir, f"sound_{current_view}.wav")
            print(f"Generating audio for {current_view} view...")
            audio, sample_rate = audio_generator.process_and_generate_audio(
                object_depths, duration=args.audio_duration
            )

            # torchaudio.save expects a 2D (channels, samples) tensor
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            if audio.dim() != 2:
                raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")

            torchaudio.save(output_path, audio, sample_rate)
            # object_depths is non-empty here (checked above); record this
            # track's weight for the final mix
            sound_tracks_dict[output_path] = object_depths[0]["weight"]
            print(f"Generated audio saved to: {output_path}")
            print("-" * 50)

        if sound_tracks_dict:
            print("Composing final audio from all views...")
            compose_audio(
                list(sound_tracks_dict.keys()),
                list(sound_tracks_dict.values()),
                os.path.join(args.output_dir, "panoramic_composition.wav")
            )
            print(f"Final audio composition saved to: "
                  f"{os.path.join(args.output_dir, 'panoramic_composition.wav')}")

        # Release model memory
        torch.cuda.empty_cache()
        gc.collect()
        del sound_mapper, audio_generator
        gc.collect()
        torch.cuda.empty_cache()
    else:
        print("Processing single image...")
        view_result = generate_caption(lat, lon, view=args.view, model=args.model,
                                       cpu_only=args.cpu_only, panoramic=False)
        if not view_result:
            print("Failed to generate caption for the view")
            return

        image_path = os.path.join(args.image_dir, f"{args.view}.jpg")
        if not os.path.exists(image_path):
            print(f"Error: Image file {image_path} not found")
            return
        print(f"Processing image: {image_path}")

        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f)
                       for f in os.listdir(args.image_dir) if f.endswith(".jpg")]
        image_basename = os.path.basename(image_path)
        image_index = [i for i, path in enumerate(image_paths)
                       if os.path.basename(path) == image_basename]
        if not image_index:
            print(f"Could not find processed map for {image_basename}")
            return

        depth_map = processed_maps[image_index[0]]["normalization"]
        print("Detecting objects and their depths...")
        object_depths = sound_mapper.analyze_object_depths(
            image_path, depth_map, lat, lon,
            caption_data=view_result, all_objects=True
        )
        if not object_depths:
            print("No objects detected in the image.")
            return

        print(f"Detected {len(object_depths)} objects:")
        for obj in object_depths:
            print(f"  - {obj['original_label']} "
                  f"(Zone: {obj['zone_description']}, Depth: {obj['mean_depth']:.4f})")

        print("Generating audio...")
        audio_generator = GenerateAudio()
        audio, sample_rate = audio_generator.process_and_generate_audio(
            object_depths, duration=args.audio_duration
        )

        # torchaudio.save expects a 2D (channels, samples) tensor
        if audio.dim() == 3:
            audio = audio.squeeze(0)
        elif audio.dim() == 1:
            audio = audio.unsqueeze(0)
        if audio.dim() != 2:
            raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")

        output_path = os.path.join(args.output_dir, f"sound_{args.view}.wav")
        torchaudio.save(output_path, audio, sample_rate)
        print(f"Generated audio saved to: {output_path}")


if __name__ == "__main__":
    main()

# Usage:
# (For a single image):    python main.py --view front
# (For panoramic images):  python main.py --panoramic