import gradio as gr
import subprocess
import sys

try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    # Install into the interpreter that is actually running this script
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry import

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Set up pipeline for Whisper ASR
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
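# whisper-base.en is English-only; a quick sanity check against one of the
# example .wav files bundled with this Space:
#   asr_pipe("irish.wav")["text"]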

# Set up pipelines for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
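# Each phoneme pipeline returns its transcription under the "text" key, e.g.:
#   american_phoneme_pipe("irish.wav")["text"]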

# Set up the two accent classification models. The ESL pipeline is created once
# here rather than per request, so the model isn't reloaded on every call.
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)

def native_accent_classifier(file):
    """Classify the accent of a native English speaker's recording."""
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
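# Illustrative call (accent labels come from the CommonAccent label set, e.g.
# 'us', 'england', 'ireland'; the score shown here is made up):
#   native_accent_classifier("irish.wav")  ->  [{'accent': 'ireland', 'score': 0.93}]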

def esl_accent_classifier(file):
    """Classify the likely first language of a non-native English speaker."""
    audio, sr = torchaudio.load(file)  # Load audio as a (channels, samples) tensor
    # The classifier expects 16 kHz single-channel input
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
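# Illustrative call (labels come from the fine-tuned L2-ARCTIC model's config,
# one per first language; the score shown here is made up):
#   esl_accent_classifier("vietnamese.wav")  ->  [{'accent': 'Vietnamese', 'score': 0.81}]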

def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"

    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"

    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"

    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        # Match the shape returned on success: one dict with both keys
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]

    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown - please upload single-channel audio', 'score': 0.0}]

    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output},
    ]
    return output
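# Quick smoke test against one of the bundled example files; '...' stands in
# for real model output:
#   transcribe_and_classify_speech("mexican.wav")
#   -> [{'transcription': '...'},
#       {'phonemes_native_eng': '...'},
#       {'phonemes_eng_second_lang': '...'},
#       {'native_eng_country': [{'accent': '...', 'score': 0.0}]},
#       {'first_lang_if_not_eng': [{'accent': '...', 'score': 0.0}]}]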

# Set up the Gradio app

examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]

# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below. The output includes results from the following models:
          - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co./openai/whisper-base.en)
          - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co./vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
          - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co./mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
          - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co./Jzuluaga/accent-id-commonaccent_ecapa)
          - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co./kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=[source], type="filepath", label="Audio")
            output = gr.JSON(label="Results")
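        # .change fires whenever the audio value changes, which covers both a
        # finished microphone recording and a completed file upload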
        audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface

# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)

demo.launch()
# demo.launch(debug=True)