# Source: chapter-llama/tools/extract/asr_faster_whisper.py (commit 4771930, "Create asr_faster_whisper.py", by lucas-ventura)
from pathlib import Path
import torch
from faster_whisper import WhisperModel
from src.data.chapters import sec_to_hms
# Run the model on the GPU when CUDA is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
class ASRProcessor:
    """
    Automatic Speech Recognition processor backed by faster-whisper.

    Transcribes an audio file and returns the transcript as text, one line
    per segment, each line prefixed with the segment's start time as
    formatted by ``sec_to_hms``.
    """

    def __init__(self, model_name: str = "large-v2", compute_type: str = "float16"):
        """
        Load the Whisper model on the module-level ``device``.

        Args:
            model_name: Model identifier forwarded to ``WhisperModel``.
            compute_type: Compute type forwarded to ``WhisperModel``. When
                running on CPU, a ``"float16"`` request is replaced by
                ``"float32"`` (see the comment below).
        """
        # CTranslate2 (the faster-whisper backend) refuses an explicit float16
        # request on CPU, so the default arguments would fail to load on a
        # machine without CUDA. Fall back to float32 there.
        if device == "cpu" and compute_type == "float16":
            compute_type = "float32"

        self.model_name = model_name
        self.model = WhisperModel(model_name, device=device, compute_type=compute_type)

    def get_asr(self, audio_file, return_duration: bool = True):
        """
        Transcribe ``audio_file`` into timestamped lines of text.

        Args:
            audio_file: Path to the audio file (``str`` or path-like).
            return_duration: When true, also return ``info.duration`` as
                reported by the model's ``transcribe`` call.

        Returns:
            The transcript as a string with one ``"<start>: <text>"`` line per
            segment and a trailing newline; or a ``(transcript, duration)``
            tuple when ``return_duration`` is true.

        Raises:
            FileNotFoundError: If ``audio_file`` does not exist.
        """
        audio_path = Path(audio_file)
        # Explicit raise rather than ``assert``: assertions are removed under
        # ``python -O``, which would let a missing file reach the decoder.
        if not audio_path.exists():
            raise FileNotFoundError(f"File {audio_file} does not exist")

        segments, info = self.model.transcribe(
            str(audio_path), length_penalty=0.5, condition_on_previous_text=False
        )

        # ``segments`` is consumed exactly once here; per the faster-whisper
        # docs it is a lazy generator, so decoding happens during iteration.
        lines = [
            f"{sec_to_hms(segment.start)}: {segment.text.strip()}"
            for segment in segments
        ]
        transcript = "\n".join(lines) + "\n"

        if return_duration:
            return transcript, info.duration
        return transcript