import os
import tempfile
import time

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import pipeline

# Build the speech-recognition pipeline once at import time so every request
# reuses the same loaded model.
# Half precision is only used when a CUDA device is available: on CPU,
# float16 inference is unsupported for some ops or very slow, so fall back
# to float32 there.
_use_cuda = torch.cuda.is_available()

pipe = pipeline(
    "automatic-speech-recognition",
    model="antony66/whisper-large-v3-russian",
    torch_dtype=torch.float16 if _use_cuda else torch.float32,
    device=0 if _use_cuda else -1,  # 0 = first GPU, -1 = CPU
)

def transcribe(audio_data):
    """Transcribe an audio clip with the module-level ``pipe``.

    The audio is written to a uniquely named temporary WAV file, passed to
    the pipeline, and the temporary file is removed afterwards.

    Args:
        audio_data: One of

            * ``None`` - nothing was recorded/uploaded;
            * ``str`` - path to an audio file; it is loaded with
              ``librosa.load`` at a 16 kHz sample rate;
            * two-element ``tuple`` - a sample rate and a sample array.
              Gradio's ``type="numpy"`` order ``(sample_rate, samples)`` is
              the expected one; the reversed ``(samples, sample_rate)``
              order is still accepted for backward compatibility.

    Returns:
        A ``(text, log)`` tuple of strings: the recognized text (or an
        error message for missing/unsupported input) and a newline-joined
        execution log with timings.

    Raises:
        Any exception raised by ``librosa.load``, ``soundfile.write`` or the
        pipeline call propagates to the caller; the temporary file is still
        cleaned up in that case.
    """
    log_messages = []

    start_time = time.perf_counter()
    log_messages.append("Загрузка файла...")

    if audio_data is None:
        return "Ошибка: не получены аудиоданные", "\n".join(log_messages)

    if isinstance(audio_data, tuple):
        first, second = audio_data
        # Gradio delivers numpy audio as (sample_rate, samples). An integer in
        # the first slot identifies that order; otherwise assume the legacy
        # (samples, sample_rate) order this function used to expect.
        if isinstance(first, (int, np.integer)):
            sample_rate, audio_array = first, second
        else:
            audio_array, sample_rate = first, second
    elif isinstance(audio_data, str):
        audio_array, sample_rate = librosa.load(audio_data, sr=16000)
    else:
        return "Ошибка: неизвестный формат аудиоданных", "\n".join(log_messages)

    # A unique temp file per call: a fixed file name would be overwritten by
    # concurrent requests and would be left behind in the working directory.
    fd, wav_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # soundfile reopens the path itself
    try:
        sf.write(wav_file, audio_array, sample_rate)
        log_messages.append(
            f"Загрузка файла завершена за {time.perf_counter() - start_time:.2f} сек"
        )

        start_time = time.perf_counter()
        log_messages.append("Работа модели... в процессе")
        result = pipe(wav_file)
        log_messages.append(
            f"Работа модели завершена за {time.perf_counter() - start_time:.2f} сек"
        )
    finally:
        # Best-effort cleanup; a failure to delete must not mask the real
        # result or the original exception.
        try:
            os.remove(wav_file)
        except OSError:
            pass

    return result["text"], "\n".join(log_messages)

# Build the web UI: an audio input, two text outputs (recognized text and
# execution log) and a button that runs `transcribe`.
with gr.Blocks() as app:
    gr.Markdown("## Распознавание речи с Whisper")

    # type="filepath" makes Gradio hand `transcribe` a path to the audio file.
    audio_data = gr.Audio(type="filepath")
    text_output = gr.Textbox(label="Распознанный текст")
    log_output = gr.Textbox(label="Лог выполнения", interactive=False)

    btn = gr.Button("Распознать")
    # `transcribe` returns (text, log), mapped onto the two textboxes in order.
    btn.click(transcribe, inputs=audio_data, outputs=[text_output, log_output])

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse `transcribe` or `app`) does not launch a blocking web server.
if __name__ == "__main__":
    app.launch(debug=True)