import torch
import numpy as np
import soundfile as sf
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
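# facebook/mms-lid-126 is a spoken-language-identification model covering 126 languages;
# its labels are ISO 639-3 codes such as "eng" or "fra".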
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
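# Bark reports its output sample rate via generation_config (24 kHz for suno/bark).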
synthesised_rate = model.generation_config.sample_rate
def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    # Pass the sampling rate so the pipeline can resample to Whisper's expected 16 kHz;
    # language="chinese" forces Whisper to decode the speech directly into Chinese text.
    outputs = pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    # Identify which language is being spoken with the MMS-LID classifier.
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"], label_outputs
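# Illustrative sketch (not executed): calling translate() on one of the cached demo
# clips returns the Chinese text plus a {language: score} dict from MMS-LID; the
# scores below are made up for illustration.
#   text, langs = translate("./fr.mp3")
#   print(text)   # Chinese rendering of the French utterance
#   print(langs)  # e.g. {"fra": 0.98, "eng": 0.01, ...}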
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    # Tokenise the prompt with the Bark processor and generate speech in the chosen voice.
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
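# Illustrative sketch (not executed): Bark returns a (1, num_samples) float tensor on
# `device`; it can be saved with soundfile at the model's sample rate. The output path
# is a placeholder.
#   speech = synthesise("你好,世界")  # "Hello, world"
#   sf.write("bark_output.wav", speech.cpu().numpy().squeeze(), synthesised_rate)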
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Move the waveform back to the CPU and convert to 16-bit PCM for Gradio's numpy audio output.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
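# Illustrative sketch (not executed): a quick local smoke test of the full pipeline
# before wiring it into Gradio, using one of the cached example files.
#   (rate, audio), text, langs = speech_to_speech_translation("./it.mp3")
#   sf.write("translated.wav", audio, rate)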
title = "Foreign Speech to Mandarin"
description = """
Built as the final project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large NLP models: one translates foreign speech into Chinese text, one identifies which language is being spoken, and one renders the Chinese text as spoken Mandarin. Both file upload and microphone input are supported; conversion is fairly slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the cached Examples is recommended. Feel free to add my WeChat ID ESGGTP to chat with my parallel self.
"""
examples = [
# ["./en.mp3", None],
# ["./de.mp3", None],
["./fr.mp3", None],
["./it.mp3", None],
["./nl.mp3", None],
["./fi.mp3", None],
# ["./cs.mp3", None],
# ["./pl.mp3", None],
]
import gradio as gr
demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)
mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )
demo.launch()