import gradio as gr
import subprocess
import sys

try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    # Install into the interpreter that is actually running this script
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry import

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Set up pipeline for Whisper ASR
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
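# whisper-base.en is English-only; a quick sanity check against one of the
# example .wav files bundled with this Space:
#   asr_pipe("irish.wav")["text"]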

# Set up pipelines for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
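# Each phoneme pipeline returns its transcription under the "text" key, e.g.:
#   american_phoneme_pipe("irish.wav")["text"]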

# Set up the two accent classification models. The ESL pipeline is created once
# here rather than per request, so the model isn't reloaded on every call.
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)

def native_accent_classifier(file):
    """Classify the accent of a native English speaker's recording."""
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
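# Illustrative call (accent labels come from the CommonAccent label set, e.g.
# 'us', 'england', 'ireland'; the score shown here is made up):
#   native_accent_classifier("irish.wav")  ->  [{'accent': 'ireland', 'score': 0.93}]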

def esl_accent_classifier(file):
    """Classify the likely first language of a non-native English speaker."""
    audio, sr = torchaudio.load(file)  # Load audio as a (channels, samples) tensor
    # The classifier expects 16 kHz single-channel input
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
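# Illustrative call (labels come from the fine-tuned L2-ARCTIC model's config,
# one per first language; the score shown here is made up):
#   esl_accent_classifier("vietnamese.wav")  ->  [{'accent': 'Vietnamese', 'score': 0.81}]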

def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"

    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"

    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"

    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        # Match the shape returned on success: one dict with both keys
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]

    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown - please upload single-channel audio', 'score': 0.0}]

    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output},
    ]
    return output
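# Quick smoke test against one of the bundled example files; '...' stands in
# for real model output:
#   transcribe_and_classify_speech("mexican.wav")
#   -> [{'transcription': '...'},
#       {'phonemes_native_eng': '...'},
#       {'phonemes_eng_second_lang': '...'},
#       {'native_eng_country': [{'accent': '...', 'score': 0.0}]},
#       {'first_lang_if_not_eng': [{'accent': '...', 'score': 0.0}]}]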

# Set up the Gradio app

examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]

# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below. The output includes results from the following models:
          - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co./openai/whisper-base.en)
          - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co./vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
          - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co./mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
          - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co./Jzuluaga/accent-id-commonaccent_ecapa)
          - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co./kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=[source], type="filepath", label="Audio")
            output = gr.JSON(label="Results")
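        # .change fires whenever the audio value changes, which covers both a
        # finished microphone recording and a completed file upload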
        audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface

# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)

demo.launch()
# demo.launch(debug=True)