Spaces:

naonauno
/

dialogs2-factory

Paused

App Files Files Community

dialogs2-factory / Amphion /models /vc /vevo /infer_vevotts.py

naonauno

Upload 855 files

d66c48f verified 4 months ago

raw

history blame contribute delete

4.11 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import os
	from huggingface_hub import snapshot_download

	from models.vc.vevo.vevo_utils import *


	def vevo_tts(
	    src_text,
	    ref_wav_path,
	    timbre_ref_wav_path=None,
	    output_path=None,
	    ref_text=None,
	    src_language="en",
	    ref_language="en",
	):
	    """Generate speech for ``src_text`` and save it to ``output_path``.

	    The call is forwarded to ``inference_pipeline.inference_ar_and_fm`` with
	    ``src_wav_path=None`` (text input only), and the returned audio is handed
	    to ``save_audio``.

	    NOTE: this function reads the module-level name ``inference_pipeline``.
	    In this file that name is only assigned inside the
	    ``if __name__ == "__main__":`` block, so a caller importing this module
	    must define it before calling.

	    Args:
	        src_text: Text to synthesize.
	        ref_wav_path: Path of the reference wav, passed to the pipeline as
	            ``style_ref_wav_path``.
	        timbre_ref_wav_path: Path of the wav passed as the timbre reference.
	            Defaults to ``ref_wav_path`` when ``None``.
	        output_path: Destination passed to ``save_audio``. Required, despite
	            the ``None`` default kept for signature compatibility.
	        ref_text: Passed to the pipeline as ``style_ref_wav_text``
	            (presumably the transcript of ``ref_wav_path`` -- confirm
	            against the pipeline).
	        src_language: Language code forwarded as ``src_text_language``.
	        ref_language: Language code forwarded as
	            ``style_ref_wav_text_language``.

	    Raises:
	        ValueError: If ``output_path`` is ``None``.
	    """
	    # Validate before running inference: an ``assert`` placed after the
	    # pipeline call wastes the whole (expensive) generation when the path is
	    # missing, and is stripped entirely under ``python -O``.
	    if output_path is None:
	        raise ValueError("output_path must be provided")

	    # Without a separate timbre reference, reuse the style reference.
	    if timbre_ref_wav_path is None:
	        timbre_ref_wav_path = ref_wav_path

	    gen_audio = inference_pipeline.inference_ar_and_fm(
	        src_wav_path=None,
	        src_text=src_text,
	        style_ref_wav_path=ref_wav_path,
	        timbre_ref_wav_path=timbre_ref_wav_path,
	        style_ref_wav_text=ref_text,
	        src_text_language=src_language,
	        style_ref_wav_text_language=ref_language,
	    )

	    save_audio(gen_audio, output_path=output_path)


	if __name__ == "__main__":
	    # Example script: fetch the Vevo checkpoints, build the inference
	    # pipeline, and synthesize two demo utterances.

	    # ===== Device =====
	    # NOTE(review): ``torch`` is not imported explicitly in this file; it is
	    # presumably provided by the star import from ``vevo_utils`` -- confirm.
	    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	    # ===== Checkpoints =====
	    # Sub-directories of the "amphion/Vevo" model repo needed for TTS.
	    tokenizer_subdir = "tokenizer/vq8192"
	    ar_subdir = "contentstyle_modeling/PhoneToVq8192"
	    fmt_subdir = "acoustic_modeling/Vq8192ToMels"
	    vocoder_subdir = "acoustic_modeling/Vocoder"

	    # One download call covering every component: the repo and cache
	    # directory are the same for all of them, so separate calls only add
	    # redundant hub round-trips. A single call also guarantees that all
	    # checkpoints are resolved from the same snapshot.
	    local_dir = snapshot_download(
	        repo_id="amphion/Vevo",
	        repo_type="model",
	        cache_dir="./ckpts/Vevo",
	        allow_patterns=[
	            tokenizer_subdir + "/*",
	            ar_subdir + "/*",
	            fmt_subdir + "/*",
	            vocoder_subdir + "/*",
	        ],
	    )

	    # ===== Content-Style Tokenizer =====
	    content_style_tokenizer_ckpt_path = os.path.join(local_dir, tokenizer_subdir)

	    # ===== Autoregressive Transformer =====
	    ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
	    ar_ckpt_path = os.path.join(local_dir, ar_subdir)

	    # ===== Flow Matching Transformer =====
	    fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
	    fmt_ckpt_path = os.path.join(local_dir, fmt_subdir)

	    # ===== Vocoder =====
	    vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
	    vocoder_ckpt_path = os.path.join(local_dir, vocoder_subdir)

	    # ===== Inference =====
	    # Must stay a module-level name: ``vevo_tts`` reads it as a global.
	    inference_pipeline = VevoInferencePipeline(
	        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
	        ar_cfg_path=ar_cfg_path,
	        ar_ckpt_path=ar_ckpt_path,
	        fmt_cfg_path=fmt_cfg_path,
	        fmt_ckpt_path=fmt_ckpt_path,
	        vocoder_cfg_path=vocoder_cfg_path,
	        vocoder_ckpt_path=vocoder_ckpt_path,
	        device=device,
	    )

	    src_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."

	    ref_wav_path = "./models/vc/vevo/wav/arabic_male.wav"
	    ref_text = "Flip stood undecided, his ears strained to catch the slightest sound."

	    # 1. Zero-Shot TTS (the style reference and timbre reference are the same)
	    vevo_tts(
	        src_text,
	        ref_wav_path,
	        output_path="./models/vc/vevo/wav/output_vevotts1.wav",
	        ref_text=ref_text,
	        src_language="en",
	        ref_language="en",
	    )

	    # 2. Style and Timbre Controllable Zero-Shot TTS (the style reference and
	    #    timbre reference are different)
	    vevo_tts(
	        src_text,
	        ref_wav_path,
	        timbre_ref_wav_path="./models/vc/vevo/wav/mandarin_female.wav",
	        output_path="./models/vc/vevo/wav/output_vevotts2.wav",
	        ref_text=ref_text,
	        src_language="en",
	        ref_language="en",
	    )