import sys
import random

import torch
import torchaudio
from transformers import T5EncoderModel

from config import TANGO_FLUX_DIR

sys.path.append(TANGO_FLUX_DIR)
from tangoflux import TangoFluxInference


class GenerateAudio:
    def __init__(self):
        self.device = "cuda"
        self.model = None
        self.text_encoder = None
        # Basic keyword categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio'],
        }
        # Suffix patterns used as a fallback when the keyword scan above misses
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban',
        }

    def _load_model(self):
        """Lazily load the TangoFlux pipeline and the T5 text encoder."""
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = (
                T5EncoderModel.from_pretrained("google/flan-t5-large")
                .to(self.device)
                .eval()
            )
        else:
            self.text_encoder = self.text_encoder.to(self.device)

    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5,
                       disable_progress=True):
        """Generate a waveform for `prompt`, trimmed to `duration` seconds."""
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress,
            )
            # Decode latents to audio and take the first (only) item in the batch.
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        # Trim the decoded waveform to the requested duration.
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave

    def _categorize_object(self, object_name):
        """Categorize an object based on keywords or suffix patterns."""
        object_lower = object_name.lower()
        # Check whether the object name contains any category keyword.
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category
        # Fall back to suffix matching on individual words.
        words = object_lower.split()
        for word in words:
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category
        return "unknown"

    def _describe_object_sound(self, object_name, zone):
        """Generate a sound description based on object type and distance zone."""
        category = self._categorize_object(object_name)
        # Volume descriptors keyed by distance zone.
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"],
        }
        # Fall back to the "medium" descriptors if the zone is unrecognized.
        volume = random.choice(volume_descriptors.get(zone, volume_descriptors["medium"]))
        # Sound description templates per category.
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion",
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind",
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds",
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence",
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity",
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence",
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence",
            ],
        }
        # Select a template for this category and fill it in.
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)
        return template.format(volume=volume, object=object_name)

    def create_audio_prompt(self, object_depths):
        """Build a text prompt, preferring any precomputed sound description."""
        if not object_depths:
            return "Environmental ambient sounds."
        # Use the first non-trivial precomputed description, if one exists.
        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."

    def process_and_generate_audio(self, object_depths, output_path=None, duration=10,
                                   steps=25, guidance_scale=4.5):
        self._load_model()
        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth to prioritize closer objects.
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)
        print(f"Generated audio prompt: {prompt}")
        wave = self.generate_sound(
            prompt,
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale,
        )
        sample_rate = self.model.vae.config.sampling_rate
        if output_path:
            # `wave` is already [channels, samples], the 2-D layout torchaudio.save
            # expects; adding a batch dimension here would make it 3-D and fail.
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
        return wave, sample_rate