# SoundingStreet/main.py
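"""Generate spatial audio from street-view images.

Pipeline, as wired below: generate captions for one or more views
(GenerateCaptions), build normalized depth maps and per-object depth
statistics (SoundMapper), synthesize a sound track per view
(GenerateAudio), and, in panoramic mode, mix the per-view tracks into a
single composition (audio_mixer.compose_audio).
"""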
import os
import argparse
import gc

import torch
import torchaudio

from config import LOGS_DIR, OUTPUT_DIR
from DepthEstimator import DepthEstimator
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio


def main():
    parser = argparse.ArgumentParser(description="Generate sound from panoramic images")
    parser.add_argument("--image_dir", type=str, default=LOGS_DIR, help="Directory containing input images")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help="Directory for output files")
    parser.add_argument("--audio_duration", type=int, default=10, help="Duration of generated audio in seconds")
    parser.add_argument("--location", type=str, default="52.3436723,4.8529625",
                        help='Location in format "latitude,longitude" (e.g., "40.7128,-74.0060")')
    parser.add_argument("--view", type=str, default="front", choices=["front", "back", "left", "right"],
                        help="Perspective view to analyze")
    parser.add_argument("--model", type=str, default="intern_2_5-4B", help="Vision-language model to use for analysis")
    parser.add_argument("--cpu_only", action="store_true", help="Force CPU usage even if CUDA is available")
    parser.add_argument("--panoramic", action="store_true",
                        help="Process panoramic images instead of a single image")
    args = parser.parse_args()
    # Split "latitude,longitude" and tolerate stray whitespace around the comma.
    lat, lon = (part.strip() for part in args.location.split(","))
    os.makedirs(args.output_dir, exist_ok=True)
    if args.panoramic:
        print("-----------Processing panoramic images-----------")
        # Generate captions for all views at once with panoramic=True
        view_results = generate_caption(lat, lon, view=args.view, model=args.model,
                                        cpu_only=args.cpu_only, panoramic=True)
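        # view_results is expected to be a list with one caption dict per view;
        # the loop below relies on each dict having at least a "view" key and
        # forwards the whole dict as caption_data to analyze_object_depths.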
        if not view_results:
            print("Failed to generate captions for panoramic views")
            return
        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")]
        # Create the audio generator once and reuse it for every view
        audio_generator = GenerateAudio()
        sound_tracks_dict = {}  # maps each per-view output .wav path to its mixing weight
        # Process each view
        for i, view_result in enumerate(view_results):
            current_view = view_result["view"]
            print(f"Processing {current_view} view ({i+1}/{len(view_results)})")
            # Find the image file that corresponds to this view
            image_path = os.path.join(args.image_dir, f"{current_view}.jpg")
            if not os.path.exists(image_path):
                print(f"Warning: Image file {image_path} not found")
                continue
            # Match the view's image to its processed depth map by filename
            image_index = [idx for idx, path in enumerate(image_paths)
                           if os.path.basename(path) == f"{current_view}.jpg"]
            if not image_index:
                print(f"Could not find processed map for {current_view} view")
                continue
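            # processed_maps is assumed to be index-aligned with image_paths,
            # each entry being a dict whose "normalization" field holds the
            # normalized depth map for that image.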
            depth_map = processed_maps[image_index[0]]["normalization"]
            object_depths = sound_mapper.analyze_object_depths(
                image_path, depth_map, lat, lon,
                caption_data=view_result,
                all_objects=False
            )
            if not object_depths:
                print(f"No objects detected in the {current_view} view")
                continue
            # Generate audio for this view
            output_path = os.path.join(args.output_dir, f"sound_{current_view}.wav")
            print(f"Generating audio for {current_view} view...")
            audio, sample_rate = audio_generator.process_and_generate_audio(
                object_depths,
                duration=args.audio_duration
            )
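            # torchaudio.save expects a 2D (channels, frames) tensor: squeeze a
            # singleton batch dimension, or promote a mono 1D waveform to one channel.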
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            if audio.dim() != 2:
                raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")
            torchaudio.save(output_path, audio, sample_rate)
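            # Record this track's mixing weight; the first entry of object_depths
            # is taken as carrying the representative "weight" for the whole view.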
            sound_tracks_dict[output_path] = object_depths[0]["weight"]
            print(f"Generated audio saved to: {output_path}")
            print("-" * 50)
        if sound_tracks_dict:
            print("Composing final audio from all views...")
            composition_path = os.path.join(args.output_dir, "panoramic_composition.wav")
            compose_audio(
                list(sound_tracks_dict.keys()),
                list(sound_tracks_dict.values()),
                composition_path
            )
            print(f"Final audio composition saved to: {composition_path}")
        del sound_mapper, audio_generator
        gc.collect()
        torch.cuda.empty_cache()
    else:
        print("Processing single image...")
        view_result = generate_caption(lat, lon, view=args.view, model=args.model,
                                       cpu_only=args.cpu_only, panoramic=False)
        if not view_result:
            print("Failed to generate caption for the view")
            return
        image_path = os.path.join(args.image_dir, f"{args.view}.jpg")
        if not os.path.exists(image_path):
            print(f"Error: Image file {image_path} not found")
            return
        print(f"Processing image: {image_path}")
        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")]
        # Match the input image to its processed depth map by filename, as in the panoramic path
        image_basename = os.path.basename(image_path)
        image_index = [i for i, path in enumerate(image_paths) if os.path.basename(path) == image_basename]
        if not image_index:
            print(f"Could not find processed map for {image_basename}")
            return
        depth_map = processed_maps[image_index[0]]["normalization"]
print("Detecting objects and their depths...")
object_depths = sound_mapper.analyze_object_depths(
image_path, depth_map, lat, lon,
caption_data=view_result,
all_objects=True
)
if not object_depths:
print("No objects detected in the image.")
return
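        # Each object dict returned by analyze_object_depths is assumed to carry
        # at least "original_label", "zone_description", and "mean_depth" (plus
        # the "weight" field used by the panoramic path).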
print(f"Detected {len(object_depths)} objects:")
for obj in object_depths:
print(f" - {obj['original_label']} (Zone: {obj['zone_description']}, Depth: {obj['mean_depth']:.4f})")
print("Generating audio...")
audio_generator = GenerateAudio()
audio, sample_rate = audio_generator.process_and_generate_audio(
object_depths,
duration=args.audio_duration
)
if audio.dim() == 3:
audio = audio.squeeze(0)
elif audio.dim() == 1:
audio = audio.unsqueeze(0)
if audio.dim() != 2:
raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")
output_path = os.path.join(args.output_dir, f"sound_{args.view}.wav")
torchaudio.save(
output_path,
audio,
sample_rate
)
print(f"Generated audio saved to: {output_path}")


if __name__ == "__main__":
    main()

# Usage:
#   Single image:      python main.py --view front
#   Panoramic images:  python main.py --panoramic
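# All other flags are optional (see the argparse definitions above); the values
# here are illustrative:
#   python main.py --view left --location "40.7128,-74.0060" --audio_duration 15
#   python main.py --panoramic --output_dir ./output --cpu_only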