import torch
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech recognition / translation: Whisper turns the source audio into Chinese text.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
# Spoken-language identification: predicts which language the input speech is in.
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
# Text-to-speech: Bark synthesises Mandarin audio from the Chinese text.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate

def translate(audio_file):
    """Transcribe the input audio into Chinese text and predict its spoken language."""
    audio, sampling_rate = sf.read(audio_file)
    # Forcing task="transcribe" with language="chinese" makes Whisper emit Chinese text
    # regardless of the source language, i.e. it acts as speech translation into Chinese.
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "chinese"})
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"], label_outputs


def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    """Generate Mandarin speech from Chinese text with Bark."""
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output


def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    """Full cascade: source audio -> Chinese text (+ language prediction) -> Mandarin audio."""
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Move the waveform to CPU before converting to NumPy (generation may run on CUDA),
    # then scale the float output to 16-bit PCM for Gradio.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
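
# Example of calling the cascade directly (a hypothetical local test; it assumes a file
# such as ./fr.mp3 exists next to this script):
#   (rate, audio_out), text, langs = speech_to_speech_translation("./fr.mp3")
#   sf.write("translated.wav", audio_out, rate)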

title = "Foreign Speech to Mandarin"
description = """
Built as the final project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large models: one translates foreign-language speech into Chinese text, one identifies which language is being spoken, and one converts the Chinese text into Mandarin speech. Both audio upload and microphone input are supported. Conversion is slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the cached Examples is recommended. Feel free to add my WeChat: ESGGTP to chat with my parallel self.

![Cascaded STST](https://huggingface.co./datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

examples = [
    # ["./en.mp3", None],
    # ["./de.mp3", None],
    ["./fr.mp3", None],
    ["./it.mp3", None],
    ["./nl.mp3", None],
    ["./fi.mp3", None],
    # ["./cs.mp3", None],
    # ["./pl.mp3", None],    
]

demo = gr.Blocks()
# Tab 1: translate an uploaded audio file (with cached examples).
file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)

# Tab 2: translate speech recorded from the microphone.
mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

demo.launch()