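# Gradio demo: automatic speech recognition with openai/whisper-small via the transformers pipeline.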
import torch
import gradio as gr
from transformers import pipeline
MODEL_NAME = "openai/whisper-small" # this always needs to stay in line 8 :D sorry for the hackiness
lang = "en"
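# Run on the first CUDA GPU if one is available; the transformers pipeline accepts a device index or "cpu".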
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
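# Pin the decoder prompt so the model always transcribes in English instead of auto-detecting the language.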
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
def transcribe(microphone, file_upload):
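    """Transcribe microphone or uploaded audio, preferring the microphone recording when both are given."""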
    warn_output = ""
    if microphone and file_upload:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used, and the uploaded audio will be discarded.\n"
        )
    elif not (microphone or file_upload):
        return "ERROR: You have to either use the microphone or upload an audio file."
    file = microphone if microphone else file_upload
    text = pipe(file)["text"]
    return warn_output + text
examples = [
    ['Martin Luther king - FREE AT LAST.mp3'],
    ['Winston Churchul - ARCH OF VICTOR.mp3'],
    ['Voice of Neil Armstrong.mp3'],
    ['Speeh by George Washington.mp3'],
    ['Speech by John Kennedy.mp3'],
    ['Al Gore on Inventing the Internet.mp3'],
    ['Alan Greenspan.mp3'],
    ['Neil Armstrong - ONE SMALL STEP.mp3'],
    ['General Eisenhower announcing D-Day landing.mp3'],
    ['Hey Siri.wav'],
]
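# Custom CSS: hide the Gradio footer and restyle the primary button and example gallery.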
css = """
footer {display: none !important}
.output-markdown {display: none !important}
button.primary {
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(17, 20, 45) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: none !important;
}
button.primary:hover {
    z-index: 14;
    left: 0px;
    top: 0px;
    cursor: pointer !important;
    background: none rgb(66, 133, 244) !important;
    border: none !important;
    color: rgb(255, 255, 255) !important;
    line-height: 1 !important;
    border-radius: 6px !important;
    transition: box-shadow 200ms ease 0s, background 200ms ease 0s !important;
    box-shadow: rgb(0 0 0 / 23%) 0px 1px 7px 0px !important;
}
button.gallery-item:hover {
    border-color: rgb(37 56 133) !important;
    background-color: rgb(229, 225, 255) !important;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Row():
        gr.Markdown("## Speech Recognition Demo")
    with gr.Row():
        mic_input = gr.Audio(label="Microphone Input", interactive=True, type="filepath")
        file_upload = gr.Audio(label="File Upload", interactive=True, type="filepath")
    with gr.Row():
        output = gr.Textbox(label="Transcription Output")
    with gr.Row():
        gr.Examples(examples=examples, inputs=[file_upload], label="Examples")
    transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(transcribe, inputs=[mic_input, file_upload], outputs=[output])
demo.launch()