toolbox-tts / app.py
kennethli319's picture
update tts
f690a5a
raw
history blame
1.7 kB
import gradio as gr
import torch
import torchaudio
import tempfile
import numpy as np
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import MixerTTSModel
from transformers import pipeline
# NOTE(review): a stray `Audio(output["audio"], rate=output["sampling_rate"])`
# line was removed here — both `Audio` and `output` were undefined at module
# scope, so importing this file raised a NameError before any model loaded.

# Alternative MixerTTS spectrogram/vocoder pair, kept for reference:
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")

# Multi-speaker FastPitch spectrogram generator (NeMo), switched to eval mode.
spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
spec_generator.eval()
# HiFi-GAN vocoder fine-tuned for the FastPitch model above.
voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
voc_model.eval()
# Bark text-to-speech pipeline — the synthesis path actually used by generate_tts.
pipe = pipeline("text-to-speech", model="suno/bark-small")
def greet(name):
    """Return a friendly greeting for *name* (demo helper; not wired to the UI)."""
    return f"Hello {name}!!"
def generate_tts(text: str, speaker: int = 0):
    """Synthesize speech for *text* with the Bark pipeline.

    Parameters
    ----------
    text : str
        Text to synthesize.
    speaker : int, optional
        Speaker index. Currently unused — kept so the Gradio slider and the
        commented-out NeMo multi-speaker path keep a matching signature.

    Returns
    -------
    tuple
        ``(sampling_rate, audio)`` — the format ``gr.Audio(type="numpy")``
        accepts, with ``audio`` as a 1-D numpy array.
    """
    # NeMo FastPitch + HiFi-GAN path, kept for reference:
    # parsed = spec_generator.parse(text)
    # spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
    # audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
    output = pipe(text)
    # The pipeline returns audio with a leading channel axis (1, n_samples) —
    # TODO confirm; flatten to 1-D so Gradio treats it as mono samples.
    audio = np.asarray(output["audio"]).squeeze()
    return (output["sampling_rate"], audio)
def run():
    """Build the Gradio interface around generate_tts and serve it on 0.0.0.0:7860."""
    text_box = gr.Textbox(value="This is a test.", label="Text to Synthesize")
    speaker_slider = gr.Slider(0, 10, step=1, label="Speaker")
    audio_out = gr.Audio(label="Output", type="numpy")
    demo = gr.Interface(
        fn=generate_tts,
        inputs=[text_box, speaker_slider],
        outputs=audio_out,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    run()