"""Audio processing utilities for CSM-1B TTS API."""
import logging
import numpy as np
import torch
from scipy import signal
logger = logging.getLogger(__name__)

def remove_long_silences(
    audio: torch.Tensor,
    sample_rate: int,
    min_speech_energy: float = 0.015,
    max_silence_sec: float = 0.4,
    keep_silence_sec: float = 0.1,
) -> torch.Tensor:
    """Remove uncomfortably long silences from audio while preserving natural pauses.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate in Hz
        min_speech_energy: Minimum RMS energy for a frame to count as speech
        max_silence_sec: Maximum silence duration to retain, in seconds;
            longer pauses are shortened to this length
        keep_silence_sec: Amount of silence to keep at speech boundaries

    Returns:
        Audio with long silences removed
    """
    # Convert to numpy for processing
    audio_np = audio.cpu().numpy()

    # Frame the signal: 20 ms windows with a 10 ms hop
    frame_size = int(0.02 * sample_rate)
    hop_length = int(0.01 * sample_rate)
    frames = []
    for i in range(0, len(audio_np) - frame_size + 1, hop_length):
        frames.append(audio_np[i:i + frame_size])
    if len(frames) < 2:  # Audio too short for analysis
        return audio
    frames = np.array(frames)

    # Root-mean-square energy per frame
    frame_energy = np.sqrt(np.mean(frames**2, axis=1))

    # Adaptive threshold: a low percentile of the observed energies, floored by
    # min_speech_energy, so the detector adapts to different recordings
    energy_threshold = max(
        min_speech_energy,
        np.percentile(frame_energy, 10),
    )
    is_speech = frame_energy > energy_threshold

    # Group frames into contiguous speech segments, expressed in sample indices
    speech_segments = []
    in_speech = False
    speech_start = 0
    for i in range(len(is_speech)):
        if is_speech[i] and not in_speech:
            # Start of speech; back up by keep_silence_sec to avoid clipped onsets
            in_speech = True
            speech_start = max(0, i * hop_length - int(keep_silence_sec * sample_rate))
        elif not is_speech[i] and in_speech:
            # Potential end of speech: look ahead to see whether the silence
            # persists for at least max_silence_sec
            silence_length = 0
            for j in range(i, min(len(is_speech), i + int(max_silence_sec * sample_rate / hop_length))):
                if not is_speech[j]:
                    silence_length += 1
                else:
                    break
            if silence_length * hop_length >= max_silence_sec * sample_rate:
                # Silence is long enough: close the segment, padded by keep_silence_sec
                in_speech = False
                speech_end = min(len(audio_np), i * hop_length + int(keep_silence_sec * sample_rate))
                speech_segments.append((speech_start, speech_end))

    # Handle the case where the audio ends during speech
    if in_speech:
        speech_segments.append((speech_start, len(audio_np)))

    if not speech_segments:
        logger.warning("No speech segments detected, returning original audio")
        return audio

    # Stitch the speech segments back together with controlled silence durations
    result = []

    # Keep a short leading silence (up to 100 ms) before the first segment
    if speech_segments[0][0] > 0:
        silence_samples = min(int(0.1 * sample_rate), speech_segments[0][0])
        if silence_samples > 0:
            result.append(audio_np[speech_segments[0][0] - silence_samples:speech_segments[0][0]])

    for i, (start, end) in enumerate(speech_segments):
        result.append(audio_np[start:end])
        # Between segments, keep at most max_silence_sec of the original gap
        if i < len(speech_segments) - 1:
            next_start = speech_segments[i + 1][0]
            available_silence = next_start - end
            if available_silence > 0:
                silence_duration = min(available_silence, int(max_silence_sec * sample_rate))
                # Take the first portion of the gap, which is usually cleaner
                result.append(audio_np[end:end + silence_duration])

    processed_audio = np.concatenate(result)

    original_duration = len(audio_np) / sample_rate
    processed_duration = len(processed_audio) / sample_rate
    logger.info(
        f"Silence removal: {original_duration:.2f}s -> {processed_duration:.2f}s "
        f"({processed_duration / original_duration * 100:.1f}%)"
    )

    # Return as a tensor with the original device and dtype
    return torch.tensor(processed_audio, device=audio.device, dtype=audio.dtype)

def create_high_shelf_filter(audio, sample_rate, frequency=4000, gain_db=3.0):
    """Apply a high-shelf filter that boosts frequencies above the given frequency.

    Args:
        audio: Audio numpy array
        sample_rate: Sample rate in Hz
        frequency: Shelf frequency in Hz
        gain_db: Gain in dB applied above the shelf frequency

    Returns:
        Filtered audio
    """
    # Shelf amplitude per the RBJ Audio EQ Cookbook: A = 10^(dB/40), so the
    # filter's high-frequency boost works out to gain_db (10^(dB/20) here
    # would double the boost)
    gain = 10 ** (gain_db / 40.0)

    # Angular frequency of the shelf midpoint
    w0 = 2.0 * np.pi * frequency / sample_rate
    cos_w0 = np.cos(w0)
    shelf_slope = 0.5
    alpha = np.sin(w0) / 2 * np.sqrt((gain + 1 / gain) * (1 / shelf_slope - 1) + 2)

    # Second-order (biquad) high-shelf coefficients
    b0 = gain * ((gain + 1) + (gain - 1) * cos_w0 + 2 * np.sqrt(gain) * alpha)
    b1 = -2 * gain * ((gain - 1) + (gain + 1) * cos_w0)
    b2 = gain * ((gain + 1) + (gain - 1) * cos_w0 - 2 * np.sqrt(gain) * alpha)
    a0 = (gain + 1) - (gain - 1) * cos_w0 + 2 * np.sqrt(gain) * alpha
    a1 = 2 * ((gain - 1) - (gain + 1) * cos_w0)
    a2 = (gain + 1) - (gain - 1) * cos_w0 - 2 * np.sqrt(gain) * alpha

    # Normalize by a0 and apply the filter
    b = np.array([b0, b1, b2]) / a0
    a = np.array([1.0, a1 / a0, a2 / a0])
    return signal.lfilter(b, a, audio)
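

# Illustrative sketch: `_shelf_response_db` is a hypothetical helper (not part of
# the module's API) that measures the shelf filter's magnitude response by
# filtering a unit impulse. Handy for checking that low frequencies stay near
# 0 dB while frequencies well above the shelf approach gain_db.
def _shelf_response_db(sample_rate, frequency=4000, gain_db=3.0, n_points=2048):
    impulse = np.zeros(n_points)
    impulse[0] = 1.0
    response = create_high_shelf_filter(impulse, sample_rate, frequency, gain_db)
    spectrum = np.fft.rfft(response)
    freqs = np.fft.rfftfreq(n_points, d=1.0 / sample_rate)
    magnitude_db = 20 * np.log10(np.maximum(np.abs(spectrum), 1e-12))
    return freqs, magnitude_db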

def enhance_audio_quality(audio: torch.Tensor, sample_rate: int) -> torch.Tensor:
    """Enhance audio quality by applying several light processing steps.

    Args:
        audio: Audio tensor
        sample_rate: Sample rate in Hz

    Returns:
        Enhanced audio tensor
    """
    try:
        audio_np = audio.cpu().numpy()

        # Remove DC offset
        audio_np = audio_np - np.mean(audio_np)

        # Light compression to improve perceived loudness: reduce peaks above
        # the threshold while leaving quieter parts untouched
        threshold = 0.5
        ratio = 1.5
        # Convert attack/release times (seconds) to per-sample one-pole smoothing
        # coefficients, so gain reduction engages quickly (~10 ms) and recovers
        # slowly (~100 ms)
        attack_coef = 1.0 - np.exp(-1.0 / (0.01 * sample_rate))
        release_coef = 1.0 - np.exp(-1.0 / (0.1 * sample_rate))

        gain = np.ones_like(audio_np)
        for i in range(1, len(audio_np)):
            level = abs(audio_np[i])
            if level > threshold:
                # Map the overshoot through the ratio to get the target gain
                target = (threshold + (level - threshold) / ratio) / level
            else:
                target = 1.0
            # Attack while reducing gain, release while recovering
            coef = attack_coef if target < gain[i - 1] else release_coef
            gain[i] = gain[i - 1] + (target - gain[i - 1]) * coef
        audio_np = audio_np * gain

        # High-shelf filter to enhance speech clarity: boost above 4 kHz by 3 dB
        audio_np = create_high_shelf_filter(audio_np, sample_rate, frequency=4000, gain_db=3.0)

        # Normalize to prevent clipping
        max_val = np.max(np.abs(audio_np))
        if max_val > 0:
            audio_np = audio_np * 0.95 / max_val

        return torch.tensor(audio_np, device=audio.device, dtype=audio.dtype)
    except Exception as e:
        logger.warning(f"Audio quality enhancement failed: {e}")
        return audio
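

# Illustrative usage sketch (hypothetical, with an assumed 24 kHz sample rate,
# which is what CSM-1B typically produces): builds a synthetic clip of tone
# bursts separated by long silences, then runs both utilities end to end.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sr = 24000
    t = np.arange(int(0.5 * sr)) / sr
    burst = (0.5 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)
    gap = np.zeros(int(1.0 * sr), dtype=np.float32)  # 1 s gap, above max_silence_sec
    clip = torch.from_numpy(np.concatenate([burst, gap, burst, gap, burst]))
    trimmed = remove_long_silences(clip, sr)
    enhanced = enhance_audio_quality(trimmed, sr)
    print(
        f"original: {clip.shape[0] / sr:.2f}s, "
        f"trimmed: {trimmed.shape[0] / sr:.2f}s, "
        f"enhanced peak: {enhanced.abs().max().item():.3f}"
    )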