import gradio as gr
import os
import subprocess
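# transformers may not be pre-installed in this environment, so install it at runtime if the import fails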
try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    subprocess.check_call(["pip", "install", "transformers"])
    from transformers import pipeline  # Retry import
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# Set up pipe for Whisper ASR
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
# Set up pipes for the 2 phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
# Set up the first of 2 accent classification models (the second is loaded inside esl_accent_classifier)
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)
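# native_accent_classifier: run the SpeechBrain ECAPA classifier on an audio file and
# return the top accent label with its score rounded to 2 decimal places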
def native_accent_classifier(file):
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
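# esl_accent_classifier: load the distilhubert accent-ID pipeline, resample the input audio
# to 16 kHz, and return the top predicted first language with its score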
def esl_accent_classifier(file):
    esl_accent_pipe = pipeline(
        "audio-classification",
        model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
    )
    audio, sr = torchaudio.load(file)  # Load audio
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
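# transcribe_and_classify_speech: run all five models on one audio file and collect their
# outputs, substituting an error message for any model that fails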
def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"
    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"
    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"
    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]
    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown-please upload single channel audio', 'score': 0.0}]
    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output}
    ]
    return output
## Set up gradio app
demo = gr.Blocks()
examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]
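# Example files are referenced by filename, so they are assumed to sit next to this script in the Space repo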
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
        - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co./openai/whisper-base.en)
        - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co./vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
        - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co./mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
        - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co./Jzuluaga/accent-id-commonaccent_ecapa)
        - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co./kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
            gr.Examples(examples=examples, inputs=[audio_input])
    return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
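# Combine the two interfaces into a single tabbed app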
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)
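# Optional local smoke test (assumes a mono example file such as 'mexican.wav' is present):
# print(transcribe_and_classify_speech("mexican.wav"))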
demo.launch()
# demo.launch(debug=True)