import torch
import gradio as gr
from transformers import pipeline

MODEL_NAME = "openai/whisper-small"  # this always needs to stay on line 8 :D sorry for the hackiness
lang = "en"

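# Use the first GPU if one is available; otherwise fall back to CPU.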
device = 0 if torch.cuda.is_available() else "cpu"
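# Build the ASR pipeline; chunk_length_s=30 splits long recordings into 30-second windows.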
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

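# Force the decoder to transcribe in English instead of auto-detecting the spoken language.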
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe(microphone, file_upload):
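    """Transcribe speech from a microphone recording or an uploaded audio file.

    Both arguments are file paths (or None). If both are provided, the microphone
    recording takes precedence and a warning is prepended to the transcription.
    """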
    warn_output = ""
    if microphone and file_upload:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used, and the uploaded audio will be discarded.\n"
        )

    elif not (microphone or file_upload):
        return "ERROR: You have to either use the microphone or upload an audio file."

    file = microphone if microphone else file_upload
    text = pipe(file)["text"]
    return warn_output + text

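# Example clips that populate the Examples gallery; paths are relative to the app's working directory.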
examples = [
    ["Martin Luther king - FREE AT LAST.mp3"],
    ["Winston Churchul - ARCH OF VICTOR.mp3"],
    ["Voice of Neil Armstrong.mp3"],
    ["Speeh by George Washington.mp3"],
    ["Speech by John Kennedy.mp3"],
    ["Al Gore on Inventing the Internet.mp3"],
    ["Alan Greenspan.mp3"],
    ["Neil Armstrong - ONE SMALL STEP.mp3"],
    ["General Eisenhower announcing D-Day landing.mp3"],
    ["Hey Siri.wav"],
]

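# Custom CSS: hide the Gradio footer and output markdown, and restyle the primary and example-gallery buttons.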
css = """
footer {display:none !important}
.output-markdown{display:none !important}
button.primary {
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important; 
    background: none rgb(17, 20, 45) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: none !important;
}
button.primary:hover{
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(66, 133, 244) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
button.gallery-item:hover {
    border-color: rgb(37 56 133) !important;
    background-color: rgb(229,225,255) !important;
}
"""

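# Build the UI: microphone and file-upload inputs side by side, a transcription textbox,
# clickable examples, and a button wired to transcribe().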
with gr.Blocks(css=css) as demo:
    with gr.Row():
        gr.Markdown("## Speech Recognition Demo")
    with gr.Row():
        mic_input = gr.Audio(source="microphone", label="Microphone Input", interactive=True, type="filepath")
        file_upload = gr.Audio(label="File Upload", interactive=True, type="filepath")
    with gr.Row():
        output = gr.Textbox(label="Transcription Output")
    with gr.Row():
        gr.Examples(examples=examples, inputs=[file_upload], label="Examples")

    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(transcribe, inputs=[mic_input, file_upload], outputs=[output])

demo.launch()