StyleTTS2-lite-vi-space

Running

App Files Files Community

StyleTTS2-lite-vi-space / app.py

dangtr0408

update extension

2c9c807 18 days ago

raw

history blame contribute delete

5.1 kB

	import gradio as gr
	import subprocess
	import os
	import sys
	import soundfile as sf
	import numpy as np
	import torch
	import traceback
	import spaces

	# Location of the model repository and the local directory it is cloned into.
	repo_url = "https://huggingface.co./dangtr0408/StyleTTS2-lite-vi"
	repo_dir = "StyleTTS2-lite-vi"

	# Clone only when the directory is not already present.
	if not os.path.exists(repo_dir):
	    # check=True makes a failed clone raise CalledProcessError right here,
	    # instead of surfacing later as an ImportError on `inference`.
	    subprocess.run(["git", "clone", repo_url, repo_dir], check=True)

	# The `inference` module is imported from the cloned repository, so the
	# repository directory must be on sys.path before the import below.
	sys.path.append(os.path.abspath(repo_dir))
	from inference import StyleTTS2

	# Use the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Model configuration and weights are read from the cloned repository.
	config_path = os.path.join(repo_dir, "Models", "config.yaml")
	models_path = os.path.join(repo_dir, "Models", "model.pth")
	model = StyleTTS2(config_path, models_path)
	model = model.eval().to(device)

	# Reference voices and prompts used by the examples in the UI.
	voice_path = os.path.join(repo_dir, "reference_audio")
	eg_voices = [
	    os.path.join(voice_path, "vn_1.wav"),
	    os.path.join(voice_path, "vn_2.wav"),
	]
	eg_texts = [
	    "Chỉ với khoảng 90 triệu tham số, [en-us]{StyleTTS2-lite} có thể dễ dàng tạo giọng nói với tốc độ cao.",
	    "[id_1] Với [en-us]{StyleTTS2-lite} bạn có thể sử dụng [en-us]{language tag} để mô hình chắc chắn đọc bằng tiếng Anh, [id_2]cũng như sử dụng [en-us]{speaker tag} để chuyển đổi nhanh giữa các giọng đọc.",
	]


	# Core inference function
	@spaces.GPU
	def main(reference_paths, text_prompt, denoise, avg_style, stabilize):
	    """Synthesize speech for ``text_prompt`` and write it to ``output.wav``.

	    Each entry of ``reference_paths`` is registered as a speaker named
	    ``id_1``, ``id_2``, ... (in the order given) with language ``"vi"`` and
	    speed ``1.0``. Styles are computed with ``model.get_styles`` and the
	    audio with ``model.generate``; the result is peak-normalized and written
	    at a 24000 Hz sample rate.

	    Args:
	        reference_paths: Paths of the reference audio files.
	        text_prompt: Text passed unchanged to ``model.generate``.
	        denoise: Forwarded to ``model.get_styles``.
	        avg_style: Forwarded to ``model.get_styles``.
	        stabilize: Forwarded to ``model.generate``.

	    Returns:
	        ``("output.wav", "Audio generated successfully!")`` on success,
	        otherwise ``(None, message)`` where ``message`` is either a short
	        hint (no reference audio) or the formatted traceback of the failure.
	    """
	    # Without this guard a missing upload (None) reaches enumerate() and the
	    # user is shown a raw TypeError traceback.
	    if not reference_paths:
	        return None, "Please upload at least one reference audio."

	    try:
	        speakers = {
	            f"id_{i}": {"path": path, "lang": "vi", "speed": 1.0}
	            for i, path in enumerate(reference_paths, 1)
	        }

	        with torch.no_grad():
	            styles = model.get_styles(speakers, denoise, avg_style)
	            # NOTE(review): the meaning of the positional 18 is defined by
	            # StyleTTS2.generate and is not visible here; "[id_1]" is
	            # presumably the default speaker tag -- confirm against the model.
	            r = model.generate(text_prompt, styles, stabilize, 18, "[id_1]")

	        # Peak-normalize, but skip the division for an all-zero waveform:
	        # dividing by a zero peak would fill the output with NaNs.
	        peak = np.abs(r).max()
	        if peak > 0:
	            r = r / peak

	        sf.write("output.wav", r, samplerate=24000)
	        return "output.wav", "Audio generated successfully!"
	    except Exception:
	        # UI boundary: report the failure in the status box instead of raising.
	        return None, traceback.format_exc()

	def on_file_upload(file_list):
	    """Deduplicate uploaded reference files and describe their speaker ids.

	    Files are keyed by base name: when the same name appears more than once,
	    it keeps the position of its first occurrence and the path of its last.

	    Args:
	        file_list: Paths of the uploaded files, or a falsy value when nothing
	            is uploaded.

	    Returns:
	        ``(None, "No file uploaded yet.")`` when ``file_list`` is falsy,
	        otherwise ``(paths, summary)`` where ``paths`` is the deduplicated
	        list of paths and ``summary`` lists one ``[id_N]: file_name`` line
	        per kept file, numbered from 1.
	    """
	    if not file_list:
	        return None, "No file uploaded yet."

	    # A dict keeps insertion order, so ids follow the upload order while a
	    # repeated base name simply overwrites the stored path.
	    unique_files = {}
	    for file_path in file_list:
	        unique_files[os.path.basename(file_path)] = file_path

	    summary = "\n".join(
	        f"[id_{index}]: {file_name}"
	        for index, file_name in enumerate(unique_files, 1)
	    )
	    return list(unique_files.values()), f"Current reference audios:\n{summary}"

	def gen_example(reference_paths, text_prompt):
	    """Run ``main`` for an example row with fixed generation settings.

	    The settings are denoise ``0.6``, average styles ``True`` and
	    stabilize ``True``.

	    Returns:
	        ``(audio, reference_paths, message)``: the two values returned by
	        ``main`` with the unchanged ``reference_paths`` placed between them.
	    """
	    result = main(reference_paths, text_prompt, 0.6, True, True)
	    audio, message = result
	    return audio, reference_paths, message


	# Gradio UI
	with gr.Blocks() as demo:
	    # Page header and usage notes.
	    gr.HTML("<h1 style='text-align: center;'>StyleTTS2‑Lite Demo</h1>")
	    gr.Markdown(
	        "Download the local inference package from Hugging Face: "
	        "[StyleTTS2‑Lite (Vietnamese)](https://huggingface.co./dangtr0408/StyleTTS2-lite-vi/)."
	    )
	    gr.Markdown(
	        "Annotate any non‑Vietnamese words with the appropriate language tag, e.g., [en-us]{ } for English. "
	        "For more information, see "
	        "[eSpeakNG docs](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)"
	    )

	    # First row: the prompt on the left, generation options on the right.
	    with gr.Row(equal_height=True):
	        with gr.Column(scale=1):
	            text_prompt = gr.Textbox(
	                label="Text Prompt",
	                placeholder="Enter your text here...",
	                lines=4,
	            )
	        with gr.Column(scale=1):
	            avg_style = gr.Checkbox(label="Use Average Styles", value=True)
	            stabilize = gr.Checkbox(label="Stabilize Speaking Speed", value=True)
	            denoise = gr.Slider(0.0, 1.0, step=0.1, value=0.6, label="Denoise Strength")

	    # Second row: reference uploads and the trigger on the left, result on the right.
	    with gr.Row(equal_height=True):
	        with gr.Column(scale=1):
	            reference_audios = gr.File(
	                label="Reference Audios",
	                file_types=[".wav", ".mp3"],
	                file_count="multiple",
	                height=150,
	            )
	            gen_button = gr.Button("Generate")
	        with gr.Column(scale=1):
	            synthesized_audio = gr.Audio(label="Generate Audio", type="filepath")

	    status = gr.Textbox(label="Status", interactive=False, lines=3)

	    # Whenever the uploaded files change, write the deduplicated list back
	    # and show the speaker-id summary in the status box.
	    reference_audios.change(
	        fn=on_file_upload,
	        inputs=[reference_audios],
	        outputs=[reference_audios, status],
	    )

	    # The Generate button runs the inference function on the current inputs.
	    gen_button.click(
	        fn=main,
	        inputs=[reference_audios, text_prompt, denoise, avg_style, stabilize],
	        outputs=[synthesized_audio, status],
	    )

	    # Clickable examples: one single-voice row and one two-voice row.
	    gr.Examples(
	        label="Examples",
	        examples=[
	            [[eg_voices[0]], eg_texts[0]],
	            [eg_voices, eg_texts[1]],
	        ],
	        fn=gen_example,
	        inputs=[reference_audios, text_prompt],
	        outputs=[synthesized_audio, reference_audios, status],
	        cache_examples=False,
	        run_on_click=True,
	    )

	demo.launch()