|
import gradio as gr |
|
import speech_recognition as sr |
|
|
|
# Shared, module-level record of the most recent speech/text input.
#   state: False initially; the functions in this file set it to True
#          whenever they store a new value.
#   value: two-element list; both slots receive the same text when updated.
# NOTE(review): presumably read by the host application to override its
# normal text input — confirm against the consumer of this dict.
input_hijack = dict(
    state=False,
    value=["", ""],
)
|
|
|
|
|
def do_stt(audio, text_state=""):
    """Transcribe a recording with Whisper and append it to ``text_state``.

    Args:
        audio: A ``(sample_rate, frame_data)`` pair, or ``None`` when there
            is no recording to transcribe.
        text_state: The text accumulated by previous transcriptions.

    Returns:
        A 2-tuple holding the updated text twice, one entry for each output
        this function is wired to. When ``audio`` is ``None`` the text is
        returned unchanged.
    """
    # The "Transcribe" button calls this function directly, so it can fire
    # before anything has been recorded. Indexing ``None`` below would raise
    # TypeError; keep the existing text and leave ``input_hijack`` untouched.
    if audio is None:
        return text_state, text_state

    transcription = ""
    r = sr.Recognizer()

    # NOTE(review): sample_width=4 assumes 4-byte samples — confirm against
    # the dtype of the frame data actually delivered by the audio component.
    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)

    try:
        transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
    except sr.UnknownValueError:
        print("Whisper could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Whisper", e)

    # Publish the newest transcription (empty string if recognition failed).
    input_hijack.update({"state": True, "value": [transcription, transcription]})

    text_state += transcription + " "
    return text_state, text_state
|
|
|
|
|
def update_hijack(val):
    """Store ``val`` in ``input_hijack`` and hand it back unchanged.

    Sets ``input_hijack["state"]`` to ``True`` and fills both slots of
    ``input_hijack["value"]`` with ``val``.

    Args:
        val: The text to record.

    Returns:
        ``val``, unmodified.
    """
    input_hijack["state"] = True
    input_hijack["value"] = [val, val]
    return val
|
|
|
|
|
def auto_transcribe(audio, audio_auto, text_state=""):
    """Run ``do_stt`` on ``audio`` only when auto-transcription applies.

    Args:
        audio: The recording to transcribe, or ``None`` if there is none.
        audio_auto: Truthy when automatic transcription is enabled.
        text_state: The text accumulated so far; forwarded to ``do_stt``.

    Returns:
        The result of ``do_stt(audio, text_state)`` when a recording exists
        and ``audio_auto`` is truthy; otherwise a pair of empty strings.
    """
    should_transcribe = audio is not None and audio_auto
    if should_transcribe:
        return do_stt(audio, text_state)
    return "", ""
|
|
|
|
|
def ui():
    """Create the speech-to-text widgets and connect their event handlers.

    Builds a state holder for the accumulated text, an editable preview
    textbox, an "Auto-Transcribe" checkbox, and a row containing a microphone
    audio input and a "Transcribe" button.
    """
    # Holds the text accumulated across transcriptions.
    tr_state = gr.State(value="")

    output_transcription = gr.Textbox(
        label="STT-Input",
        placeholder="Speech Preview. Click \"Generate\" to send",
        interactive=True,
    )
    # Any change to the preview is recorded via update_hijack, whose return
    # value is written back into the state.
    output_transcription.change(
        fn=update_hijack,
        inputs=[output_transcription],
        outputs=[tr_state],
    )

    audio_auto = gr.Checkbox(label="Auto-Transcribe", value=True)

    with gr.Row():
        audio = gr.Audio(source="microphone")
        # A changed recording goes through auto_transcribe, which consults
        # the checkbox value before transcribing.
        audio.change(
            fn=auto_transcribe,
            inputs=[audio, audio_auto, tr_state],
            outputs=[output_transcription, tr_state],
        )

        transcribe_button = gr.Button(value="Transcribe")
        # The button calls do_stt directly, regardless of the checkbox.
        transcribe_button.click(
            fn=do_stt,
            inputs=[audio, tr_state],
            outputs=[output_transcription, tr_state],
        )
|
|