File size: 1,204 Bytes
4771930
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from pathlib import Path

import torch
from faster_whisper import WhisperModel

from src.data.chapters import sec_to_hms

# Select the inference device: CUDA when torch reports one is available,
# otherwise CPU. The value is passed to WhisperModel as its `device` argument.
# NOTE(review): TF32 is not disabled anywhere in this module.
device = "cuda" if torch.cuda.is_available() else "cpu"


class ASRProcessor:
    """
    Automatic Speech Recognition processor backed by faster-whisper.

    Loads a ``WhisperModel`` once at construction time and turns audio files
    into a plain-text transcript with one line per recognized segment, each
    line prefixed with the segment's start time.
    """

    def __init__(self, model_name: str = "large-v2", compute_type: str = "float16"):
        """
        Load the Whisper model on the module-level ``device``.

        Args:
            model_name: Model identifier forwarded to ``WhisperModel``.
            compute_type: Compute type forwarded to ``WhisperModel``.
                NOTE(review): "float16" may not be supported when ``device``
                resolves to "cpu" -- confirm against the backend in use.
        """
        self.model_name = model_name
        self.model = WhisperModel(model_name, device=device, compute_type=compute_type)

    def get_asr(self, audio_file, return_duration: bool = True):
        """
        Transcribe ``audio_file`` into a timestamped text transcript.

        Args:
            audio_file: Path to the audio file to transcribe.
            return_duration: When true, also return ``info.duration`` from the
                transcription result.

        Returns:
            The transcript as a string of ``"<start>: <text>"`` lines, each
            terminated by a newline (a lone newline when there are no
            segments), where ``<start>`` is ``sec_to_hms(segment.start)``.
            When ``return_duration`` is true, a ``(transcript, duration)``
            tuple is returned instead.

        Raises:
            FileNotFoundError: If ``audio_file`` does not exist.
        """
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let a bad path reach the decoder.
        if not Path(audio_file).exists():
            raise FileNotFoundError(f"File {audio_file} does not exist")

        segments, info = self.model.transcribe(
            audio_file, length_penalty=0.5, condition_on_previous_text=False
        )

        lines = [
            f"{sec_to_hms(segment.start)}: {segment.text.strip()}"
            for segment in segments
        ]
        transcript = "\n".join(lines) + "\n"

        if return_duration:
            return transcript, info.duration
        return transcript