# app/voice_embeddings.py
"""Voice embeddings for consistent voice generation."""
import os
import torch
import torchaudio
import numpy as np
from typing import Dict
# Path to store voice samples
VOICE_SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "voice_samples")
os.makedirs(VOICE_SAMPLES_DIR, exist_ok=True)
# Dictionary to store voice embeddings/samples
VOICE_DICT: Dict[str, torch.Tensor] = {}
def initialize_voices(sample_rate: int = 24000):
    """Initialize the voice dictionary with consistent seed samples."""
    # Generate consistent seed audio for each voice
    voice_names = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    for voice_id, voice_name in enumerate(voice_names):
        # Create a deterministic audio sample for each voice.
        # The seed is fixed per voice for reproducibility; note that the
        # waveforms below are pure sine mixes, so no randomness is drawn.
        np.random.seed(voice_id + 42)
# Generate 1 second of "seed" audio with deterministic characteristics
# This differs per voice but remains consistent across runs
duration = 1.0 # seconds
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
# Create a distinctive waveform for each voice
if voice_id == 0: # alloy - rich mid tones
freq1, freq2 = 220, 440
audio = 0.5 * np.sin(2 * np.pi * freq1 * t) + 0.3 * np.sin(2 * np.pi * freq2 * t)
elif voice_id == 1: # echo - reverberant
freq = 330
audio = np.sin(2 * np.pi * freq * t) * np.exp(-t * 3)
elif voice_id == 2: # fable - bright, higher pitch
freq = 523
audio = 0.7 * np.sin(2 * np.pi * freq * t)
elif voice_id == 3: # onyx - deep and resonant
freq = 165
audio = 0.8 * np.sin(2 * np.pi * freq * t)
elif voice_id == 4: # nova - warm and smooth
freq1, freq2 = 392, 196
audio = 0.4 * np.sin(2 * np.pi * freq1 * t) + 0.4 * np.sin(2 * np.pi * freq2 * t)
else: # shimmer - airy and light
freq1, freq2, freq3 = 587, 880, 1174
audio = 0.3 * np.sin(2 * np.pi * freq1 * t) + 0.2 * np.sin(2 * np.pi * freq2 * t) + 0.1 * np.sin(2 * np.pi * freq3 * t)
        # Normalize to a peak amplitude of 1.0
        audio = audio / np.max(np.abs(audio))
# Convert to tensor
audio_tensor = torch.tensor(audio, dtype=torch.float32)
# Store the audio tensor
VOICE_DICT[voice_name] = audio_tensor
# Save as wav for reference
save_path = os.path.join(VOICE_SAMPLES_DIR, f"{voice_name}_seed.wav")
torchaudio.save(save_path, audio_tensor.unsqueeze(0), sample_rate)
print(f"Initialized voice seed for {voice_name}")
def get_voice_sample(voice_name: str) -> torch.Tensor:
"""Get the voice sample for a given voice name."""
if not VOICE_DICT:
initialize_voices()
if voice_name in VOICE_DICT:
return VOICE_DICT[voice_name]
# Default to alloy if voice not found
print(f"Voice {voice_name} not found, defaulting to alloy")
return VOICE_DICT["alloy"]
def update_voice_sample(voice_name: str, audio: torch.Tensor) -> None:
    """Update the voice sample with recently generated audio."""
    # Only update if the voices have already been initialized
    if VOICE_DICT:
        # Keep the trailing second (24000 samples at the default 24 kHz rate),
        # or however much audio is available; assumes mono, 1-D audio
        sample_length = min(24000, audio.shape[0])
        VOICE_DICT[voice_name] = audio[-sample_length:].detach().cpu()
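
# Minimal usage sketch (an assumption for illustration: this module is normally
# imported by the TTS generation code; running it directly just smoke-tests the
# seed voices and the update path with synthetic audio).
if __name__ == "__main__":
    initialize_voices()
    sample = get_voice_sample("onyx")
    print(f"onyx seed: {sample.shape[0]} samples, dtype={sample.dtype}")
    # Feed hypothetical freshly generated audio back in as the new voice seed
    fake_generated = torch.randn(48000)  # stands in for ~2 s of 24 kHz output
    update_voice_sample("onyx", fake_generated)
    print(f"onyx seed after update: {get_voice_sample('onyx').shape[0]} samples")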