# SoundingStreet/GenerateAudio.py
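"""Generate scene-matched audio from detected objects and their estimated depths.

Objects are mapped to natural-language sound prompts, which TangoFlux then
synthesizes into waveforms.
"""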
import torchaudio
import sys
import torch
import random
from config import TANGO_FLUX_DIR
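# TangoFlux lives in a separate checkout; make it importable before use.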
sys.path.append(TANGO_FLUX_DIR)
from tangoflux import TangoFluxInference
from transformers import AutoTokenizer, T5EncoderModel
class GenerateAudio:
    """Turn detected objects and their depths into TangoFlux-generated audio."""
def __init__(self):
        # Prefer the GPU when available; fall back to CPU so the class still runs.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = None
self.text_encoder = None
# Basic categories for object classification
self.categories = {
'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            # 'bird' is classified under 'animal' below, so it is not repeated here
            'nature': ['tree', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
}
        # Suffix patterns used as a fallback in _categorize_object
self.suffixes = {
'tree': 'nature',
'bird': 'animal',
'car': 'vehicle',
'truck': 'vehicle',
'signal': 'urban'
}
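
    # Load the heavy models lazily so constructing GenerateAudio stays cheap.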
def _load_model(self):
if self.model is None:
self.model = TangoFluxInference(name='declare-lab/TangoFlux')
if self.text_encoder is None:
self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(self.device).eval()
else:
self.text_encoder = self.text_encoder.to(self.device)
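
    # Prompt -> latents via the model's flow sampler, then VAE decode to audio.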
    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Synthesize audio for `prompt`; returns a (channels, samples) tensor."""
        self._load_model()
        with torch.no_grad():
latents = self.model.model.inference_flow(
prompt,
duration=duration,
num_inference_steps=steps,
guidance_scale=guidance_scale,
disable_progress=disable_progress
)
            # Decode latents to audio, move to CPU, and drop the batch dimension.
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        # Trim any padding beyond the requested duration.
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
return wave
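
    # --- Prompt-building helpers: map detected objects to sound phrases ----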
def _categorize_object(self, object_name):
"""Categorize an object based on keywords or patterns"""
object_lower = object_name.lower()
# Check if the object contains any category keywords
for category, keywords in self.categories.items():
for keyword in keywords:
if keyword in object_lower:
return category
        # Fall back to suffix matching for compound words
words = object_lower.split()
for word in words:
for suffix, category in self.suffixes.items():
if word.endswith(suffix):
return category
return "unknown"
    def _describe_object_sound(self, object_name, zone):
        """Generate a sound description from the object's category and distance zone.

        For example, ("car", "near") might yield "loud engine sounds from the
        car"; the exact wording is randomized from the templates below.
        """
category = self._categorize_object(object_name)
# Volume descriptor based on zone
volume_descriptors = {
"near": ["prominent", "clear", "loud", "distinct"],
"medium": ["moderate", "audible", "present"],
"far": ["subtle", "distant", "faint", "soft"]
}
        # Default to "medium" wording if an unexpected zone label is passed
        volume = random.choice(volume_descriptors.get(zone, volume_descriptors["medium"]))
# Sound descriptors based on category
sound_templates = {
"vehicle": [
"{volume} engine sounds from the {object}",
"{volume} mechanical noise of the {object}",
"the {object} creating {volume} road noise",
"{volume} sounds of the {object} in motion"
],
"nature": [
"{volume} rustling of the {object}",
"the {object} making {volume} natural sounds",
"{volume} environmental sounds from the {object}",
"the {object} with {volume} movement in the wind"
],
"urban": [
"{volume} urban sounds around the {object}",
"the {object} with {volume} city ambience",
"{volume} noise from the {object}",
"the {object} contributing to {volume} street sounds"
],
"animal": [
"{volume} calls from the {object}",
"the {object} making {volume} animal sounds",
"{volume} sounds of the {object}",
"the {object} with its {volume} presence"
],
"human": [
"{volume} voices from the {object}",
"the {object} creating {volume} human sounds",
"{volume} movement sounds from the {object}",
"the {object} with {volume} activity"
],
"indoor": [
"{volume} ambient sounds around the {object}",
"the {object} making {volume} indoor noises",
"{volume} mechanical sounds from the {object}",
"the {object} with its {volume} presence"
],
"unknown": [
"{volume} sounds from the {object}",
"the {object} creating {volume} audio",
"{volume} noises associated with the {object}",
"the {object} with its {volume} acoustic presence"
]
}
# Select a template for this category
templates = sound_templates.get(category, sound_templates["unknown"])
template = random.choice(templates)
# Fill in the template
description = template.format(volume=volume, object=object_name)
return description
    def create_audio_prompt(self, object_depths):
        """Choose the best available text prompt for the detected objects."""
        if not object_depths:
            return "Environmental ambient sounds."
        # Use the first non-trivial precomputed description, if any.
        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        # Otherwise just name the first (closest) object.
        return f"Sounds of {object_depths[0]['original_label']}."
    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        """Build a prompt from detected objects, synthesize audio, and optionally save it."""
        self._load_model()
        if not object_depths:
            prompt = "Environmental ambient sounds."
else:
# Sort objects by depth to prioritize closer objects
sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
prompt = self.create_audio_prompt(sorted_objects)
print(f"Generated audio prompt: {prompt}")
wave = self.generate_sound(
prompt,
steps=steps,
duration=duration,
guidance_scale=guidance_scale
)
sample_rate = self.model.vae.config.sampling_rate
        if output_path:
            # torchaudio.save expects a 2-D (channels, samples) tensor, which
            # `wave` already is, so no extra batch dimension is added.
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
return wave, sample_rate
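

if __name__ == "__main__":
    # Minimal usage sketch. The object_depths schema below (original_label,
    # mean_depth, sound_description) is inferred from the methods above; the
    # labels, depths, and output path are illustrative assumptions, not real
    # detector output.
    generator = GenerateAudio()
    detections = [
        {"original_label": "car", "mean_depth": 3.2,
         "sound_description": "loud engine sounds from a passing car"},
        {"original_label": "tree", "mean_depth": 12.7, "sound_description": ""},
    ]
    wave, sr = generator.process_and_generate_audio(
        detections, output_path="street_scene.wav", duration=10
    )
    print(f"Generated {wave.shape[-1] / sr:.1f} s of audio at {sr} Hz")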