Spaces:
Paused
Paused
# Copyright (c) 2023 Amphion. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
import os | |
from huggingface_hub import snapshot_download | |
from models.vc.vevo.vevo_utils import * | |
def vevo_tts( | |
src_text, | |
ref_wav_path, | |
timbre_ref_wav_path=None, | |
output_path=None, | |
ref_text=None, | |
src_language="en", | |
ref_language="en", | |
): | |
if timbre_ref_wav_path is None: | |
timbre_ref_wav_path = ref_wav_path | |
gen_audio = inference_pipeline.inference_ar_and_fm( | |
src_wav_path=None, | |
src_text=src_text, | |
style_ref_wav_path=ref_wav_path, | |
timbre_ref_wav_path=timbre_ref_wav_path, | |
style_ref_wav_text=ref_text, | |
src_text_language=src_language, | |
style_ref_wav_text_language=ref_language, | |
) | |
assert output_path is not None | |
save_audio(gen_audio, output_path=output_path) | |
if __name__ == "__main__": | |
# ===== Device ===== | |
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") | |
# ===== Content-Style Tokenizer ===== | |
local_dir = snapshot_download( | |
repo_id="amphion/Vevo", | |
repo_type="model", | |
cache_dir="./ckpts/Vevo", | |
allow_patterns=["tokenizer/vq8192/*"], | |
) | |
content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192") | |
# ===== Autoregressive Transformer ===== | |
local_dir = snapshot_download( | |
repo_id="amphion/Vevo", | |
repo_type="model", | |
cache_dir="./ckpts/Vevo", | |
allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"], | |
) | |
ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json" | |
ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192") | |
# ===== Flow Matching Transformer ===== | |
local_dir = snapshot_download( | |
repo_id="amphion/Vevo", | |
repo_type="model", | |
cache_dir="./ckpts/Vevo", | |
allow_patterns=["acoustic_modeling/Vq8192ToMels/*"], | |
) | |
fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json" | |
fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels") | |
# ===== Vocoder ===== | |
local_dir = snapshot_download( | |
repo_id="amphion/Vevo", | |
repo_type="model", | |
cache_dir="./ckpts/Vevo", | |
allow_patterns=["acoustic_modeling/Vocoder/*"], | |
) | |
vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json" | |
vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder") | |
# ===== Inference ===== | |
inference_pipeline = VevoInferencePipeline( | |
content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path, | |
ar_cfg_path=ar_cfg_path, | |
ar_ckpt_path=ar_ckpt_path, | |
fmt_cfg_path=fmt_cfg_path, | |
fmt_ckpt_path=fmt_ckpt_path, | |
vocoder_cfg_path=vocoder_cfg_path, | |
vocoder_ckpt_path=vocoder_ckpt_path, | |
device=device, | |
) | |
src_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences." | |
ref_wav_path = "./models/vc/vevo/wav/arabic_male.wav" | |
ref_text = "Flip stood undecided, his ears strained to catch the slightest sound." | |
# 1. Zero-Shot TTS (the style reference and timbre reference are same) | |
vevo_tts( | |
src_text, | |
ref_wav_path, | |
output_path="./models/vc/vevo/wav/output_vevotts1.wav", | |
ref_text=ref_text, | |
src_language="en", | |
ref_language="en", | |
) | |
# 2. Style and Timbre Controllable Zero-Shot TTS (the style reference and timbre reference are different) | |
vevo_tts( | |
src_text, | |
ref_wav_path, | |
timbre_ref_wav_path="./models/vc/vevo/wav/mandarin_female.wav", | |
output_path="./models/vc/vevo/wav/output_vevotts2.wav", | |
ref_text=ref_text, | |
src_language="en", | |
ref_language="en", | |
) | |