fix: all segments to check in lang detection
Files changed:
- requirements.txt (+1 -1)
- whisper_streaming_custom/backends.py (+12 -7)
requirements.txt
CHANGED
@@ -5,7 +5,7 @@ numpy>=1.21.0
 ffmpeg-python>=0.2.0
 torch>=2.0.0
 torchaudio>=2.0.0
-faster-whisper
+faster-whisper
 websockets>=10.0
 pydantic>=1.8.0
 python-dotenv>=0.19.0
whisper_streaming_custom/backends.py
CHANGED
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
         device = "cuda" if torch and torch.cuda.is_available() else "cpu"
         compute_type = "float16" if device == "cuda" else "float32"

+        logger.info(f"Loading whisper model {model_size_or_path} on {device} with compute type {compute_type}")
+
         model = WhisperModel(
             model_size_or_path,
             device=device,
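For context, the load path picks GPU half precision only when CUDA is available. A minimal standalone sketch of the same selection logic (model size and variable names here are illustrative, not taken from the repo):

import torch
from faster_whisper import WhisperModel

# float16 is only worthwhile on GPU; CPU falls back to float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"
model = WhisperModel("large-v3", device=device, compute_type=compute_type)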
@@ -152,7 +154,7 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio: np.ndarray, init_prompt: str = "") -> list:
         segments, info = self.model.transcribe(
             audio,
-            language=
+            language=None,
             initial_prompt=init_prompt,
             beam_size=5,
             word_timestamps=True,
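Passing language=None tells faster-whisper to auto-detect the language on each transcribe() call instead of forcing a fixed one. A short sketch (file name and model size are placeholders):

from faster_whisper import WhisperModel

model = WhisperModel("tiny")
# With language=None, detection runs on the opening 30 s window of the input.
segments, info = model.transcribe("sample.wav", language=None, beam_size=5)
print(info.language, info.language_probability)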
@@ -181,6 +183,8 @@ class FasterWhisperASR(ASRBase):
         self.transcribe_kargs["task"] = "translate"

     def detect_language(self, audio_file_path):
+
+        from faster_whisper.audio import decode_audio
         """
         Detect the language of the audio using faster-whisper's language detection.

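decode_audio() from faster_whisper.audio decodes any ffmpeg-readable file straight to a float32 mono waveform at the requested rate, which is why the soundfile read and manual int16 conversion removed below are no longer needed. A hedged usage sketch (file name is a placeholder; note that placing the import above the string, as this diff does, means the string is no longer the function's __doc__):

from faster_whisper.audio import decode_audio

waveform = decode_audio("sample.wav", sampling_rate=16000)
print(waveform.dtype, waveform.shape)  # float32, (n_samples,)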
@@ -194,17 +198,18 @@ class FasterWhisperASR(ASRBase):
         - probabilities (dict): Dictionary of language probabilities
         """
         try:
-
-            audio, sr = sf.read(audio_file_path)
-
-            # Convert to format expected by Whisper (16-bit PCM)
-            audio = (audio * 32768).astype(np.int16)
+            audio = decode_audio(audio_file_path, sampling_rate=self.model.feature_extractor.sampling_rate)

+            # Calculate total number of segments (each segment is 30 seconds)
+            audio_duration = len(audio) / self.model.feature_extractor.sampling_rate
+            segments_num = max(1, int(audio_duration / 30))  # At least 1 segment
+            logger.info(f"Audio duration: {audio_duration:.2f}s, using {segments_num} segments for language detection")
+
             # Use faster-whisper's detect_language method
             language, language_probability, all_language_probs = self.model.detect_language(
                 audio=audio,
                 vad_filter=False,  # Disable VAD for language detection
-                language_detection_segments=
+                language_detection_segments=segments_num,  # Use all possible segments
                 language_detection_threshold=0.5  # Default threshold
             )

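Putting the pieces together, the commit sizes language_detection_segments to cover the entire clip instead of the default single 30-second window. A self-contained sketch of the same flow (model size and file name are placeholders, not repo values):

from faster_whisper import WhisperModel
from faster_whisper.audio import decode_audio

model = WhisperModel("tiny")
sr = model.feature_extractor.sampling_rate  # 16000 for Whisper models
audio = decode_audio("sample.wav", sampling_rate=sr)

# Each detection segment spans 30 s of audio; scan all of them, at least one.
segments_num = max(1, int(len(audio) / sr / 30))

language, probability, all_probs = model.detect_language(
    audio=audio,
    vad_filter=False,
    language_detection_segments=segments_num,
    language_detection_threshold=0.5,
)
print(language, probability)

With the default of a single segment, a recording that opens in a different language than its body can be misclassified; scanning every 30-second window against the 0.5 probability threshold makes the decision reflect the whole clip.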