AnyaSchen committed · Commit 9c17134 · 1 Parent(s): 7db1cf9

fix:all segments to check in lang detection
requirements.txt CHANGED
@@ -5,7 +5,7 @@ numpy>=1.21.0
 ffmpeg-python>=0.2.0
 torch>=2.0.0
 torchaudio>=2.0.0
-faster-whisper>=0.9.0
+faster-whisper
 websockets>=10.0
 pydantic>=1.8.0
 python-dotenv>=0.19.0
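The version pin on faster-whisper is dropped, presumably so that pip resolves a release new enough to expose the public WhisperModel.detect_language method that the backends.py change below relies on. A minimal sketch (not part of this commit) of a runtime guard for that assumption:

import inspect

from faster_whisper import WhisperModel

# Assumption: older faster-whisper releases (e.g. the previously pinned
# 0.9.x line) do not ship the public detect_language method used below.
assert hasattr(WhisperModel, "detect_language"), "faster-whisper too old"

# The new code also passes language_detection_segments; confirm the
# installed version accepts that keyword.
params = inspect.signature(WhisperModel.detect_language).parameters
assert "language_detection_segments" in params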
whisper_streaming_custom/backends.py CHANGED
@@ -141,6 +141,8 @@ class FasterWhisperASR(ASRBase):
         device = "cuda" if torch and torch.cuda.is_available() else "cpu"
         compute_type = "float16" if device == "cuda" else "float32"
 
+        logger.info(f"Loading whisper model {model_size_or_path} on {device} with compute type {compute_type}")
+
         model = WhisperModel(
             model_size_or_path,
             device=device,
@@ -152,7 +154,7 @@ class FasterWhisperASR(ASRBase):
     def transcribe(self, audio: np.ndarray, init_prompt: str = "") -> list:
         segments, info = self.model.transcribe(
             audio,
-            language=self.original_language,
+            language=None,
             initial_prompt=init_prompt,
             beam_size=5,
             word_timestamps=True,
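Note on the transcribe() change above: passing language=None makes faster-whisper auto-detect the language on every call instead of forcing self.original_language. An illustrative sketch (the model size and input file are assumptions, not from this commit):

from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="float32")

# With language=None, detection runs per call and the result is reported
# on the returned TranscriptionInfo.
segments, info = model.transcribe("sample.wav", language=None, beam_size=5)
print(info.language, info.language_probability)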
@@ -181,6 +183,8 @@ class FasterWhisperASR(ASRBase):
         self.transcribe_kargs["task"] = "translate"
 
     def detect_language(self, audio_file_path):
+
+        from faster_whisper.audio import decode_audio
         """
         Detect the language of the audio using faster-whisper's language detection.
 
@@ -194,17 +198,18 @@ class FasterWhisperASR(ASRBase):
         - probabilities (dict): Dictionary of language probabilities
         """
         try:
-            # Load audio using soundfile
-            audio, sr = sf.read(audio_file_path)
-
-            # Convert to format expected by Whisper (16-bit PCM)
-            audio = (audio * 32768).astype(np.int16)
+            audio = decode_audio(audio_file_path, sampling_rate=self.model.feature_extractor.sampling_rate)
 
+            # Calculate total number of segments (each segment is 30 seconds)
+            audio_duration = len(audio) / self.model.feature_extractor.sampling_rate
+            segments_num = max(1, int(audio_duration / 30))  # At least 1 segment
+            logger.info(f"Audio duration: {audio_duration:.2f}s, using {segments_num} segments for language detection")
+
             # Use faster-whisper's detect_language method
             language, language_probability, all_language_probs = self.model.detect_language(
                 audio=audio,
                 vad_filter=False,  # Disable VAD for language detection
-                language_detection_segments=1,  # Use single segment for detection
+                language_detection_segments=segments_num,  # Use all possible segments
                 language_detection_threshold=0.5  # Default threshold
             )
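Putting the detect_language() change together: instead of inspecting only the first 30-second window, the commit decodes the whole file and scans every window, which helps when the opening seconds are silence, music, or a different language than the rest of the clip. A self-contained sketch of the same flow (model size and file name are illustrative assumptions):

from faster_whisper import WhisperModel
from faster_whisper.audio import decode_audio

model = WhisperModel("base", device="cpu", compute_type="float32")

# Whisper models consume 16 kHz mono audio; decode_audio resamples for us.
sr = model.feature_extractor.sampling_rate
audio = decode_audio("example.wav", sampling_rate=sr)  # hypothetical input

# Each language-detection segment covers 30 seconds of audio.
segments_num = max(1, int(len(audio) / sr / 30))

language, probability, all_probs = model.detect_language(
    audio=audio,
    vad_filter=False,                          # detect on the raw audio
    language_detection_segments=segments_num,  # scan the whole clip
    language_detection_threshold=0.5,          # stop early once confident
)
print(f"detected {language} with p={probability:.2f} over {segments_num} segment(s)")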