|
import os
import tempfile
import time

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import pipeline
|
|
|
# Probe CUDA once so the device and the dtype are always chosen consistently.
_USE_CUDA = torch.cuda.is_available()

# Speech-recognition pipeline shared by every request.
# Half precision is only requested on GPU: on CPU float16 kernels are missing
# or very slow for many operations, so fall back to float32 there.
pipe = pipeline(
    "automatic-speech-recognition",
    model="antony66/whisper-large-v3-russian",
    torch_dtype=torch.float16 if _USE_CUDA else torch.float32,
    device=0 if _USE_CUDA else -1,
)
|
|
|
def transcribe(audio_data):
    """Transcribe an audio clip with the module-level ``pipe`` pipeline.

    Args:
        audio_data: Either a path to an audio file (``str``) or a two-item
            tuple holding a sample rate and an array of samples. Gradio's
            ``type="numpy"`` audio arrives as ``(sample_rate, samples)``;
            the ``(samples, sample_rate)`` order is accepted as well.

    Returns:
        A ``(text, log)`` tuple: the recognised text (or an error message
        when the input is missing or of an unsupported type) and the
        newline-joined execution log.
    """
    log_messages = []

    # perf_counter is monotonic, so the reported durations cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    log_messages.append("Загрузка файла...")

    if audio_data is None:
        return "Ошибка: не получены аудиоданные", "\n".join(log_messages)

    if not isinstance(audio_data, (tuple, str)):
        return "Ошибка: неизвестный формат аудиоданных", "\n".join(log_messages)

    # A private temporary directory per call keeps concurrent requests from
    # overwriting each other's WAV file and guarantees cleanup on exit,
    # including when loading or inference raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_file = os.path.join(tmp_dir, "temp_audio.wav")

        if isinstance(audio_data, tuple):
            first, second = audio_data
            # A scalar first element is the sample rate (Gradio order);
            # otherwise assume the (samples, sample_rate) order.
            if np.isscalar(first):
                sample_rate, audio_array = first, second
            else:
                audio_array, sample_rate = first, second
        else:
            # Decode the file and resample it to 16 kHz.
            audio_array, sample_rate = librosa.load(audio_data, sr=16000)

        sf.write(wav_file, audio_array, int(sample_rate))

        log_messages.append(
            f"Загрузка файла завершена за {time.perf_counter() - start_time:.2f} сек"
        )

        start_time = time.perf_counter()
        log_messages.append("Работа модели... в процессе")
        result = pipe(wav_file)
        log_messages.append(
            f"Работа модели завершена за {time.perf_counter() - start_time:.2f} сек"
        )

    return result["text"], "\n".join(log_messages)
|
|
|
# Build the web UI: one audio input, two text outputs and a trigger button.
with gr.Blocks() as app:
    gr.Markdown("## Распознавание речи с Whisper")

    # type="filepath" makes the component pass transcribe() a path string.
    audio_data = gr.Audio(type="filepath")
    text_output = gr.Textbox(label="Распознанный текст")
    log_output = gr.Textbox(label="Лог выполнения", interactive=False)

    btn = gr.Button("Распознать")
    # transcribe() returns (text, log), mapped onto the two text boxes in order.
    btn.click(
        fn=transcribe,
        inputs=audio_data,
        outputs=[text_output, log_output],
    )

# Start the Gradio server with debug output enabled.
app.launch(debug=True)
|
|