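"""Text-to-audio generation with TangoFlux.

Turns detected scene objects (with depth estimates) into natural-language
sound descriptions and synthesizes matching audio with the TangoFlux model.
"""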
import torchaudio
import sys
import torch
import random
from config import TANGO_FLUX_DIR

sys.path.append(TANGO_FLUX_DIR)

from tangoflux import TangoFluxInference
from transformers import AutoTokenizer, T5EncoderModel
from collections import Counter

class GenerateAudio:
    def __init__(self):
        self.device = "cuda"
        self.model = None
        self.text_encoder = None
        # Basic categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
        }
        # Suffix patterns used as a fallback when no keyword matches
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban'
        }

    def _load_model(self):
        """Lazily load the TangoFlux model and T5 text encoder on first use."""
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(self.device).eval()
        else:
            # Ensure a previously loaded encoder is back on the target device
            self.text_encoder = self.text_encoder.to(self.device)

    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Generate a waveform for a text prompt; returns a (channels, samples) tensor."""
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress
            )
            # Decode latents to audio and trim to the requested duration
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave

    def _categorize_object(self, object_name):
        """Categorize an object by keyword match, falling back to suffix patterns."""
        object_lower = object_name.lower()
        # Check if the object name contains any category keyword
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category
        # Fall back to suffix matching on each word
        words = object_lower.split()
        for word in words:
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category
        return "unknown"

    def _describe_object_sound(self, object_name, zone):
        """Generate an appropriate sound description based on object type and distance."""
        category = self._categorize_object(object_name)
        # Volume descriptor based on the distance zone
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"]
        }
        volume = random.choice(volume_descriptors[zone])
        # Sound description templates per category
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion"
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind"
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds"
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence"
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity"
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence"
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence"
            ]
        }
        # Select a random template for this category and fill it in
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)
        return template.format(volume=volume, object=object_name)

    def create_audio_prompt(self, object_depths):
        """Return the first usable sound description, or a generic fallback prompt."""
        if not object_depths:
            return "Environmental ambient sounds."
        # Prefer the first object that carries a meaningful sound description
        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."

    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        """Build a prompt from detected objects, generate audio, and optionally save it."""
        self._load_model()
        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth so the closest objects drive the prompt
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)
        print(f"Generated audio prompt: {prompt}")
        wave = self.generate_sound(
            prompt,
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale
        )
        sample_rate = self.model.vae.config.sampling_rate
        if output_path:
            # wave is a 2D (channels, samples) tensor, which is the shape
            # torchaudio.save expects
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
        return wave, sample_rate
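

# A minimal usage sketch. Assumption: upstream detection/depth estimation
# produces a list of dicts with "original_label", "mean_depth", and an
# optional "sound_description" key, matching what create_audio_prompt and
# process_and_generate_audio consume above. The values here are illustrative.
if __name__ == "__main__":
    generator = GenerateAudio()
    example_objects = [
        {"original_label": "car", "mean_depth": 2.5,
         "sound_description": generator._describe_object_sound("car", "near")},
        {"original_label": "bird", "mean_depth": 8.0, "sound_description": None},
    ]
    wave, sr = generator.process_and_generate_audio(
        example_objects,
        output_path="scene_audio.wav",
        duration=10,
    )
    print(f"Generated {wave.shape[-1] / sr:.1f}s of audio at {sr} Hz")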