# SoundingStreet/GenerateAudio.py
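"""Generate scene-matched audio from detected objects and their estimated depths.

Objects are mapped to natural-language sound prompts, which TangoFlux then
synthesizes into waveforms.
"""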
import torchaudio
import sys
import torch
import random
from config import TANGO_FLUX_DIR
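# TangoFlux lives in a separate checkout; make it importable before use.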
sys.path.append(TANGO_FLUX_DIR)
from tangoflux import TangoFluxInference
from transformers import AutoTokenizer, T5EncoderModel
class GenerateAudio:
    """Turn detected objects and their depths into TangoFlux-generated audio."""
def __init__(self):
        # Prefer the GPU when available; fall back to CPU so the class still runs.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = None
self.text_encoder = None
# Basic categories for object classification
self.categories = {
'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            # 'bird' is classified under 'animal' below, so it is not repeated here
            'nature': ['tree', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
}
        # Suffix patterns used as a fallback in _categorize_object
self.suffixes = {
'tree': 'nature',
'bird': 'animal',
'car': 'vehicle',
'truck': 'vehicle',
'signal': 'urban'
}
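
    # Load the heavy models lazily so constructing GenerateAudio stays cheap.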
def _load_model(self):
if self.model is None:
self.model = TangoFluxInference(name='declare-lab/TangoFlux')
if self.text_encoder is None:
self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large").to(self.device).eval()
else:
self.text_encoder = self.text_encoder.to(self.device)
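
    # Prompt -> latents via the model's flow sampler, then VAE decode to audio.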
    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Synthesize audio for `prompt`; returns a (channels, samples) tensor."""
        self._load_model()
        with torch.no_grad():
latents = self.model.model.inference_flow(
prompt,
duration=duration,
num_inference_steps=steps,
guidance_scale=guidance_scale,
disable_progress=disable_progress
)
            # Decode latents to audio, move to CPU, and drop the batch dimension.
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        # Trim any padding beyond the requested duration.
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
return wave
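
    # --- Prompt-building helpers: map detected objects to sound phrases ----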
def _categorize_object(self, object_name):
"""Categorize an object based on keywords or patterns"""
object_lower = object_name.lower()
# Check if the object contains any category keywords
for category, keywords in self.categories.items():
for keyword in keywords:
if keyword in object_lower:
return category
        # Fall back to suffix matching for compound words
words = object_lower.split()
for word in words:
for suffix, category in self.suffixes.items():
if word.endswith(suffix):
return category
return "unknown"
    def _describe_object_sound(self, object_name, zone):
        """Generate a sound description from the object's category and distance zone.

        For example, ("car", "near") might yield "loud engine sounds from the
        car"; the exact wording is randomized from the templates below.
        """
category = self._categorize_object(object_name)
# Volume descriptor based on zone
volume_descriptors = {
"near": ["prominent", "clear", "loud", "distinct"],
"medium": ["moderate", "audible", "present"],
"far": ["subtle", "distant", "faint", "soft"]
}
        # Default to "medium" wording if an unexpected zone label is passed
        volume = random.choice(volume_descriptors.get(zone, volume_descriptors["medium"]))
# Sound descriptors based on category
sound_templates = {
"vehicle": [
"{volume} engine sounds from the {object}",
"{volume} mechanical noise of the {object}",
"the {object} creating {volume} road noise",
"{volume} sounds of the {object} in motion"
],
"nature": [
"{volume} rustling of the {object}",
"the {object} making {volume} natural sounds",
"{volume} environmental sounds from the {object}",
"the {object} with {volume} movement in the wind"
],
"urban": [
"{volume} urban sounds around the {object}",
"the {object} with {volume} city ambience",
"{volume} noise from the {object}",
"the {object} contributing to {volume} street sounds"
],
"animal": [
"{volume} calls from the {object}",
"the {object} making {volume} animal sounds",
"{volume} sounds of the {object}",
"the {object} with its {volume} presence"
],
"human": [
"{volume} voices from the {object}",
"the {object} creating {volume} human sounds",
"{volume} movement sounds from the {object}",
"the {object} with {volume} activity"
],
"indoor": [
"{volume} ambient sounds around the {object}",
"the {object} making {volume} indoor noises",
"{volume} mechanical sounds from the {object}",
"the {object} with its {volume} presence"
],
"unknown": [
"{volume} sounds from the {object}",
"the {object} creating {volume} audio",
"{volume} noises associated with the {object}",
"the {object} with its {volume} acoustic presence"
]
}
# Select a template for this category
templates = sound_templates.get(category, sound_templates["unknown"])
template = random.choice(templates)
# Fill in the template
description = template.format(volume=volume, object=object_name)
return description
    def create_audio_prompt(self, object_depths):
        """Choose the best available text prompt for the detected objects."""
        if not object_depths:
            return "Environmental ambient sounds."
        # Use the first non-trivial precomputed description, if any.
        for obj in object_depths:
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        # Otherwise just name the first (closest) object.
        return f"Sounds of {object_depths[0]['original_label']}."
    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        """Build a prompt from detected objects, synthesize audio, and optionally save it."""
        self._load_model()
        if not object_depths:
            prompt = "Environmental ambient sounds."
else:
# Sort objects by depth to prioritize closer objects
sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
prompt = self.create_audio_prompt(sorted_objects)
print(f"Generated audio prompt: {prompt}")
wave = self.generate_sound(
prompt,
steps=steps,
duration=duration,
guidance_scale=guidance_scale
)
sample_rate = self.model.vae.config.sampling_rate
        if output_path:
            # torchaudio.save expects a 2-D (channels, samples) tensor, which
            # `wave` already is, so no extra batch dimension is added.
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
return wave, sample_rate
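

if __name__ == "__main__":
    # Minimal usage sketch. The object_depths schema below (original_label,
    # mean_depth, sound_description) is inferred from the methods above; the
    # labels, depths, and output path are illustrative assumptions, not real
    # detector output.
    generator = GenerateAudio()
    detections = [
        {"original_label": "car", "mean_depth": 3.2,
         "sound_description": "loud engine sounds from a passing car"},
        {"original_label": "tree", "mean_depth": 12.7, "sound_description": ""},
    ]
    wave, sr = generator.process_and_generate_audio(
        detections, output_path="street_scene.wav", duration=10
    )
    print(f"Generated {wave.shape[-1] / sr:.1f} s of audio at {sr} Hz")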