# Source: Hugging Face Space "Respair / Darya_TTS" (status: Running).
# File size: 23,445 Bytes


import spaces
import gradio as gr
import random
import os
import re
from gradio_client import Client, file

# Handle to the remote Gradio app that the synthesis wrappers call. The target
# is read from the `src` environment variable, so a missing variable raises
# KeyError as soon as this module is imported.
client = Client(src=os.environ["src"])

# Directory holding the example/prompt text files, and the four files in it
# (Russian/English x random-example/prompt).
BASE_PATH = "Inference"
(
    RU_RANDOM_TEXTS_PATH,
    EN_RANDOM_TEXTS_PATH,
    RU_PROMPT_TEXTS_PATH,
    EN_PROMPT_TEXTS_PATH,
) = [
    os.path.join(BASE_PATH, filename)
    for filename in (
        "random_texts.txt",
        "english_random_texts.txt",
        "prompt.txt",
        "english_prompt.txt",
    )
]

@spaces.GPU
def dummy():
    """Do nothing and return ``None``.

    NOTE(review): nothing in this file calls it; it is presumably kept only so
    that a ``spaces.GPU``-decorated function exists in the module -- confirm.
    """
    return None
    
def _placeholder_texts(filepath):
    """Return the generic one-item placeholder list matching the kind of *filepath*."""
    if "random" in filepath:
        return ["Default example text."]
    return ["Speaker: Default prompt text."]


def _missing_file_texts(filepath):
    """Return the language-specific placeholder used when *filepath* does not exist."""
    is_english = "english" in filepath
    if "random" in filepath:
        if is_english:
            return ["Example English text file not found."]
        return ["Пример русского текстового файла не найден."]
    if "prompt" in filepath:
        if is_english:
            return ["Speaker: Example English prompt file not found."]
        return ["Диктор: Пример русского файла подсказок не найден."]
    return ["Example text file not found."]


def _read_nonblank_lines(filepath, encoding):
    """Read *filepath* with *encoding* and return its stripped, non-blank lines."""
    with open(filepath, 'r', encoding=encoding) as f:
        return [line.strip() for line in f if line.strip()]


def load_texts(filepath):
    """Load the non-blank lines of a text file, never returning an empty list.

    The file is decoded as UTF-8 (a leading BOM is ignored); if that fails it
    is re-read as cp1251. Every failure mode yields a one-item placeholder
    list instead of raising, because callers index the result with ``[0]``.

    Args:
        filepath: Path of the text file. Its name is also inspected for the
            substrings "random", "prompt" and "english" to choose a fitting
            placeholder.

    Returns:
        A non-empty list of stripped lines, or a one-item placeholder list when
        the directory or file is missing, the file has no usable lines, or any
        other error occurs while reading.
    """
    directory = os.path.dirname(filepath)
    if directory != '' and not os.path.exists(directory):
        print(f"Warning: Directory '{directory}' not found.")
        return _placeholder_texts(filepath)

    try:
        try:
            # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM,
            # which would otherwise end up inside the first example text.
            texts = _read_nonblank_lines(filepath, 'utf-8-sig')
        except UnicodeDecodeError:
            print(f"Warning: UTF-8 decode failed for {filepath}. Trying 'cp1251' (common for Russian)...")
            texts = _read_nonblank_lines(filepath, 'cp1251')
    except FileNotFoundError:
        print(f"Warning: File not found - {filepath}")
        return _missing_file_texts(filepath)
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return ["Error loading example texts."]

    if not texts:
        # An empty or whitespace-only file used to produce [], which crashed
        # the UI code that takes the first element as its default text.
        print(f"Warning: No usable lines found in {filepath}. Using placeholder text.")
        return _placeholder_texts(filepath)
    return texts

# Read all four example/prompt files once, at import time. The loads run in
# the order: RU random, EN random, RU prompt, EN prompt.
ru_random_texts_list, en_random_texts_list = (
    load_texts(RU_RANDOM_TEXTS_PATH),
    load_texts(EN_RANDOM_TEXTS_PATH),
)
ru_prompt_texts_list, en_prompt_texts_list = (
    load_texts(RU_PROMPT_TEXTS_PATH),
    load_texts(EN_PROMPT_TEXTS_PATH),
)

def create_example_dict(text_list):
    """Map short preview labels to their full example texts.

    Each label is the first 30 characters of the text followed by "...".
    When two *different* texts share the same first 30 characters, the later
    one gets a numeric suffix (" (2)", " (3)", ...) instead of overwriting the
    earlier entry, so no example is lost. Exact duplicate texts collapse into
    a single entry.

    Args:
        text_list: List of example strings.

    Returns:
        A dict of label -> text in input order, or
        ``{"No examples found": ""}`` when the list is empty or its first
        element is not a string.
    """
    if not text_list or not isinstance(text_list[0], str):
        return {"No examples found": ""}

    examples = {}
    for text in text_list:
        base_label = f"{text[:30]}..."
        label = base_label
        counter = 2
        # Walk to the first label that is free or already holds this very text.
        while label in examples and examples[label] != text:
            label = f"{base_label} ({counter})"
            counter += 1
        examples[label] = text
    return examples

# Label -> prompt lookups that back the "Example Prompts" dropdown, one per language.
ru_prompt_examples, en_prompt_examples = (
    create_example_dict(ru_prompt_texts_list),
    create_example_dict(en_prompt_texts_list),
)


# Build `voicelist`, the choices of the reference-voice dropdown, from the
# audio files found in VOICE_DIR. Every failure path still leaves a non-empty
# list behind.
VOICE_DIR = "./reference_sample_wavs"
try:
    # isdir() is already False for a path that does not exist.
    if not os.path.isdir(VOICE_DIR):
        print(f"Warning: Voice directory not found or is not a directory: {VOICE_DIR}. Using placeholder list.")
        voicelist = ["anna_studio.wav", "boris_clear.wav", "female_neutral.wav", "male_deep.wav"]
    else:
        voicelist = sorted(
            name
            for name in os.listdir(VOICE_DIR)
            if name.lower().endswith(('.wav', '.mp3', '.flac'))
            and os.path.isfile(os.path.join(VOICE_DIR, name))
        )
        if len(voicelist) == 0:
            print(f"Warning: No compatible audio files found in {VOICE_DIR}. Dropdown will be empty.")
            voicelist = ["default.wav"]
except Exception as e:
    print(f"Error listing voices in {VOICE_DIR}: {e}")
    voicelist = ["error_loading_voices"]


def update_text_input_longform(preview_key, is_english):
    """Return the full prompt text for the dropdown label *preview_key*.

    The lookup uses the English example dict when *is_english* is truthy and
    the Russian one otherwise. An unknown label yields the first example of
    that language; if the dict is empty, a fixed error message is returned.
    """
    examples = en_prompt_examples if is_english else ru_prompt_examples
    if preview_key in examples:
        return examples[preview_key]
    if not examples:
        return "Selected example not found or examples failed to load."
    # Unknown label: fall back to the first available example.
    return next(iter(examples.values()))


def generate_random_spk(is_english):
    """Pick a random speaker ID for the selected language, log it and return it.

    The ID is drawn uniformly from 0..2006 (inclusive) when *is_english* is
    truthy and from 0..196 (inclusive) otherwise.
    """
    max_id, language = (2006, "English") if is_english else (196, "Russian")
    speaker_id = random.randint(0, max_id)
    print(f"Generated random {language} Speaker ID: {speaker_id}")
    return speaker_id


def Client_Synthesize_Audio(text, voice, voice2_path, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox):
    """Send a voice-guided synthesis request to the remote ``/Synthesize_Audio`` endpoint.

    The request parameters are logged, an uploaded reference path (if any) is
    wrapped in a ``gradio.FileData``-style dict, and all arguments are passed
    positionally to ``client.predict``.

    Returns:
        Whatever ``client.predict`` returns. If the call raises any exception,
        the error is logged and ``(44100, np.zeros(1))`` is returned instead.
    """
    print("--- Client: Calling Synthesize_Audio ---")
    print(f"Text: {text[:50]}...")
    print(f"Default Voice: {voice}")
    print(f"Uploaded Voice Path: {voice2_path}")
    print(f"Speaker ID: {spk_id}")
    print(f"Steps: {vcsteps}, Scale: {embscale}, Beta: {beta}, RoS: {ros}, T: {t}")
    print(f"English Mode: {language_checkbox}")

    # No upload -> pass None through; otherwise send the path as a file payload.
    voice2_arg = (
        None
        if voice2_path is None
        else {"path": voice2_path, "meta": {"_type": "gradio.FileData"}}
    )
    request_args = (text, voice, voice2_arg, spk_id, vcsteps, embscale, beta, ros, t, language_checkbox)

    try:
        result = client.predict(*request_args, api_name="/Synthesize_Audio")
        print("--- Client: Synthesize_Audio call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling Synthesize_Audio: {e} ---")
        import numpy as np
        # One zero sample at 44100 stands in for audio after a failed call.
        return (44100, np.zeros(1))

def Client_PromptedSynth_Text(text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox):
    """Send a text-guided synthesis request to the remote ``/PromptedSynth_Text`` endpoint.

    The request parameters are logged and then passed positionally, in
    signature order, to ``client.predict``.

    Returns:
        Whatever ``client.predict`` returns. If the call raises any exception,
        the error is logged and ``(44100, np.zeros(1))`` is returned instead.
    """
    print("--- Client: Calling PromptedSynth_Text ---")
    print(f"Text: {text[:50]}...")
    print(f"Beta: {beta}, T: {t}, Steps: {diffusion_steps}, Scale: {embedding_scale}, RoS: {ros}")
    print(f"English Mode: {language_checkbox}")

    request_args = (text, beta, t, diffusion_steps, embedding_scale, ros, language_checkbox)

    try:
        result = client.predict(*request_args, api_name="/PromptedSynth_Text")
        print("--- Client: PromptedSynth_Text call successful ---")
        return result
    except Exception as e:
        print(f"--- Client: Error calling PromptedSynth_Text: {e} ---")
        import numpy as np
        # One zero sample at 44100 stands in for audio after a failed call.
        return (44100, np.zeros(1))


# Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/xxx) later
# Intro banner text; only referenced from a commented-out gr.Markdown call in this file.
INTROTXT = "Update v0.01: Darya (RU) now supports style diffusion as well. "


# Voice-guided tab: text plus a reference voice (preset, upload, or speaker ID)
# is sent to the remote /Synthesize_Audio endpoint.
with gr.Blocks() as audio_inf:
    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_audio = gr.Checkbox(label="English?", value=False,
                                            info="Tick for English synthesis, leave unchecked for Russian.")
            inp = gr.Textbox(label="Text",
                             info="Enter the text for voice-guided synthesis.",
                             # Guard the [0] so an empty example list cannot crash the import.
                             value=ru_random_texts_list[0] if ru_random_texts_list else "",
                             interactive=True,
                             scale=5)

            voice = gr.Dropdown(choices=voicelist,
                                label="Default Reference Voice (make sure it matches the language)",
                                info="Select a pre-defined reference voice.",
                                # The preferred default is the 8th voice. Indexing it
                                # unconditionally raised IndexError whenever fewer than
                                # eight voices were available, so fall back to the first.
                                value=(voicelist[7] if len(voicelist) > 7
                                       else (voicelist[0] if voicelist else None)),
                                interactive=True)
            voice_2 = gr.Audio(label="Upload Your Audio Reference (Overrides Default Voice & Speaker ID)",
                               sources=["upload", "microphone"],
                               interactive=True,
                               type='filepath',
                               waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})


            with gr.Accordion("Advanced Parameters", open=False):

                spk_id = gr.Number(label="Speaker ID (randomly picking a sample based on the ID - may result in subpar / broken audio)",
                                   info="Input speaker ID (max 196 Ru / 2006 En) to use a random sample from that speaker on the server. 9999 disables.",
                                   value=9999,
                                   interactive=True)

                random_spk_btn = gr.Button("Random")

                beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
                                 label="Beta (Diffusion Strength vs. Reference)",
                                 info="Diffusion parameter. Higher means LESS like the reference audio. 0 disables diffusion.",
                                 interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1,
                                              label="Diffusion Steps",
                                              info="More steps can improve quality but increase inference time.",
                                              interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1,
                                     label="Embedding Scale (Intensity)",
                                     info="Impacts expressiveness. High values (> 1.5) might cause artifacts.",
                                     interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2,
                                           value=1,
                                           step=0.1,
                                           label="Rate of Speech",
                                           info="Adjusts speech speed. 1.0 is normal.",
                                           interactive=True)

                t = gr.Slider(minimum=0.1, maximum=2, value=0.7, step=0.05,
                              label="T (Duration / Temperature)",
                              info="influence of previous sentence on the current one",
                              interactive=True)

        with gr.Column(scale=1):
            btn = gr.Button("Synthesize (Voice Guided)", variant="primary")
            audio = gr.Audio(interactive=False,
                             label="Synthesized Audio",
                             waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})


    def update_audio_inf_defaults(is_english):
        """Return updates for the text box and speaker-ID field after a language switch.

        The text box gets the first example of the chosen language (or an empty
        string if none were loaded); the speaker ID is reset to 9999 and its
        help text is switched to the matching ID range.
        """
        texts = en_random_texts_list if is_english else ru_random_texts_list
        new_text_value = texts[0] if texts else ""
        new_spk_info = "Input speaker ID (max 2006 En) or use Randomize. 9999 disables." if is_english else "Input speaker ID (max 196 Ru) or use Randomize. 9999 disables."
        new_spk_val = 9999
        return gr.update(value=new_text_value), gr.update(info=new_spk_info, value=new_spk_val)


    language_checkbox_audio.change(update_audio_inf_defaults,
                                   inputs=[language_checkbox_audio],
                                   outputs=[inp, spk_id])

    random_spk_btn.click(fn=generate_random_spk, inputs=[language_checkbox_audio], outputs=spk_id)

    # Input order must match the positional parameters of Client_Synthesize_Audio.
    btn.click(Client_Synthesize_Audio,
              inputs=[inp, voice, voice_2, spk_id, multispeakersteps, embscale, beta, rate_of_speech, t, language_checkbox_audio],
              outputs=[audio],
              concurrency_limit=4)


# Text-guided tab: the prompt text alone is sent to the remote
# /PromptedSynth_Text endpoint; no reference audio is involved.
with gr.Blocks() as longform:

    with gr.Row():
        with gr.Column(scale=1):
            language_checkbox_longform = gr.Checkbox(
                label="English?",
                value=False,
                info="Tick for English synthesis, leave unchecked for Russian.",
            )
            inp_longform = gr.Textbox(
                label="Text",
                info="Enter text; check the format from the examples.",
                value=ru_prompt_texts_list[0],
                lines=5,
                interactive=True,
                scale=5,
            )

            with gr.Row():
                example_dropdown = gr.Dropdown(
                    choices=list(ru_prompt_examples),
                    label="Example Prompts",
                    info="Select an example to load into the text box.",
                    # First label, or None when there are no examples.
                    value=next(iter(ru_prompt_examples), None),
                    interactive=True,
                )

            with gr.Accordion("Advanced Parameters", open=False):
                beta_longform = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.4,
                    step=0.1,
                    label="Beta (Diffusion Strength vs. Semantic Encoder)",
                    info="Diffusion parameter. Higher means LESS like the inferred style from text. 0 disables diffusion.",
                    interactive=True,
                )
                diffusion_steps_longform = gr.Slider(
                    minimum=3,
                    maximum=50,
                    value=3,
                    step=1,
                    label="Diffusion Steps",
                    info="More steps can improve diversity but increase inference time, it won't necessarily make it better.",
                    interactive=True,
                )
                embedding_scale_longform = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=1,
                    step=0.1,
                    label="Embedding Scale (Intensity)",
                    info="Impacts expressiveness.",
                    interactive=True,
                )
                rate_of_speech_longform = gr.Slider(
                    minimum=0.5,
                    maximum=2,
                    value=1,
                    step=0.1,
                    label="Rate of Speech",
                    info="Adjusts speech speed. 1.0 is normal. it may not respond to tiny adjustments.",
                    interactive=True,
                )
                t_longform = gr.Slider(
                    minimum=0.1,
                    maximum=2,
                    value=0.8,
                    step=0.1,
                    label="T (Style Consistency - Primarily English)",
                    info="Controls the influence of previous sentences' style on the current one.",
                    interactive=True,
                )

        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize (Text Guided)", variant="primary")
            audio_longform = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'},
            )

    def update_longform_defaults(is_english):
        """Return updates for the example dropdown and text box after a language switch.

        The dropdown receives the labels of the chosen language with the first
        one selected, and the text box receives that example's full text. With
        no examples at all, the selection is None and a fixed sample sentence
        is used as the text.
        """
        examples = en_prompt_examples if is_english else ru_prompt_examples
        labels = list(examples)
        if labels:
            selected = labels[0]
            text_value = examples[selected]
        else:
            selected = None
            text_value = "Speaker: Example text." if is_english else "Диктор: Пример текста."

        dropdown_update = gr.update(choices=labels, value=selected)
        textbox_update = gr.update(value=text_value)
        return dropdown_update, textbox_update

    language_checkbox_longform.change(
        fn=update_longform_defaults,
        inputs=[language_checkbox_longform],
        outputs=[example_dropdown, inp_longform],
    )

    example_dropdown.change(
        fn=update_text_input_longform,
        inputs=[example_dropdown, language_checkbox_longform],
        outputs=[inp_longform],
    )

    # Input order must match the positional parameters of Client_PromptedSynth_Text.
    btn_longform.click(
        fn=Client_PromptedSynth_Text,
        inputs=[
            inp_longform,
            beta_longform,
            t_longform,
            diffusion_steps_longform,
            embedding_scale_longform,
            rate_of_speech_longform,
            language_checkbox_longform,
        ],
        outputs=[audio_longform],
        concurrency_limit=4,
    )

# Static HTML for the "Intuition & Tips" tab. It has no placeholders, so it is
# a plain (non-f) string literal.
user_guide_html = """
<div style="background-color: rgba(30, 30, 30, 0.9); color: #f0f0f0; padding: 20px; border-radius: 10px; border: 1px solid #444;">
    <h2 style="border-bottom: 1px solid #555; padding-bottom: 5px;">Quick Notes:</h2>

    <p> This is run on a single RTX 3090. </p>
    <p> These networks can only generate natural speech with correct intonations (i.e generating NSFW, non-speech sounds, stutters etc. doesn't work) </p>
    <p> Make sure your inputs are not too short (more than a sentence long). </p>
    <p> I will gradually update here and -> <a href="https://github.com/Respaired/Project_Kalliope" target="_blank" style="color: #77abff;">Github</a> </p>
    <p>Everything in this demo & the repo (coming soon) is experimental. The main idea is just playing around with different things to see what works when you're limited to training on a pair of RTX 3090s.</p>
    <p>The data used for the english model is rough and pretty tough for any TTS model (think debates, real conversations, plus a little bit of cleaner professional performances). It mostly comes from public sources or third parties (no TOS signed). I'll probably write a blog post later with more details.</p>
    <p>So far I focused on English and Russian, more can be covered.</p>

    <hr style="border-color: #555; margin: 15px 0;">

    <h3 style="color: #a3ffc3;">Voice-Guided Tab (Using Audio Reference)</h3>
    <h4>Options:</h4>
    <ul>
        <li><b>Default Voices:</b> Pick one from the dropdown (these are stored locally).</li>
        <li><b>Upload Audio:</b> While the data isn't nearly enough for zero-shotting, you can still test your own samples. Make sure to decrease the beta if it didn't sound similar.</li>
        <li><b>Speaker ID:</b> Use a number (RU: 0-196, EN: 0-2006) to grab a random clip of that speaker from the server's dataset. Hit 'Randomize' to explore. (Invalid IDs use a default voice on the server).</li>
    </ul>
    <h4>Some notes:</h4>
    <ul>
        <li><b>Not all speakers are equal.</b> Randomized samples might give you a poor reference sometimes.</li>
        <li><b>IDs are not accurate. :</b> since the base model didn't require one and it was automatically generated so the same ID can give you different speakers.</li>
        <li><b>Play with Beta:</b> Values from 0.2 to 0.9 can work well. Higher Beta = LESS like the reference. It works great for some voices, breaks others. Please play with different values. (0 = diffusion off).</li>
    </ul>

    <hr style="border-color: #555; margin: 15px 0;">

    <h3 style="color: #a3ffc3;">Text-Guided Tab (Style is conditioned on the information and contents of the text)</h3>
    <ul>
        <li><b>Intuition:</b> it will Figure out the voice style just from the text itself (using semantic encoders). No audio needed, which makes it suitable for real-time use cases.</li>
        <li><b>Speaker Prefix:</b> For Russian, you can use 'Speaker_ + number:'. As for the English, you can use any names. Names were randomly assigned during the training of the Encoder.</li>
    </ul>

    <hr style="border-color: #555; margin: 15px 0;">

    <h3 style="color: #a3ffc3;">General Tips</h3>
    <ul>
        <li>Punctuation matters for intonation; don't use unsupported symbols.</li>
    </ul>
</div>
"""

# "Intuition & Tips" tab: shows the static user guide as raw HTML.
with gr.Blocks() as info_tab:
    gr.HTML(value=user_guide_html)

# --- Model Details Tab (Reformatted User Text) ---
# Static HTML converted by hand from Markdown-like notes. All emphasis uses
# HTML tags; leftover Markdown asterisks would be shown literally.
model_details_html = """
<div style="background-color: rgba(30, 30, 30, 0.9); color: #f0f0f0; padding: 20px; border-radius: 10px; border: 1px solid #444;">
    <h2 style="border-bottom: 1px solid #555; padding-bottom: 5px;">Model Details (The Guts)</h2>

    <hr style="border-color: #555; margin: 15px 0;">

    <h3 style="color: #e972ab;">Darya (Russian Model) - More Stable</h3>
    <p>Generally more controlled than the English one. That's also why in terms of acoustic quality it should sound much better.</p>
    <ul>
        <li><b>Setup:</b> Non-End-to-End (separate steps).</li>
        <li><b>Components:</b>
            <ul>
                <li>Style Encoder: Conformer-based.</li>
                <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                <li>Semantic Encoder: <code>RuModernBERT-base</code> (for text-guidance).</li>
                <li>Diffusion Sampler: <b>Yes.</b></li>
            </ul>
        </li>
        <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a></li>
        <li><b>Training:</b> ~200K steps on ~320 hours of Russian data (mix of conversation & narration, hundreds of speakers).</li>
        <li><b>Size:</b> Lightweight (~< 200M params).</li>
        <li><b>Specs:</b> 44.1kHz output, 128 mel bins.</li>
    </ul>

    <hr style="border-color: #555; margin: 15px 0;">

    <h3 style="color: #e972ab;">Kalliope (English Model) - Wild</h3>
    <p>More expressive potential, but also less predictable. Showed signs of overfitting on the noisy data.</p>
    <ul>
        <li><b>Setup:</b> Non-End-to-End.</li>
        <li><b>Components:</b>
            <ul>
                <li>Style Encoder: Conformer-based.</li>
                <li>Text Encoder: <code>ConvNextV2</code>.</li>
                <li>Duration Predictor: Conformer-based (with cross-attention).</li>
                <li>Acoustic Decoder: Conformer-based.</li>
                <li>Semantic Encoder: <code>DeBERTa V3 Base</code> (for text-guided).</li>
                <li>Diffusion Sampler: <b>Yes.</b></li>
            </ul>
        </li>
        <li><b>Vocoder:</b> <a href="https://github.com/Respaired/RiFornet_Vocoder" target="_blank" style="color: #77abff;">RiFornet</a>.</li>
        <li><b>Training:</b> ~100K steps on ~300-400 hours of <i>very complex & noisy</i> English data (conversational, whisper, narration, wide emotion range).</li>
        <li><b>Size:</b> Bigger (~1.2B params total, but not all active at once - training was surprisingly doable). Hidden dim 1024, Style vector 512.</li>
        <li><b>Specs:</b> 44.1kHz output, 128 mel bins (but more than half the dataset were 22-24khz or even phone-call quality)</li>
    </ul>

    <hr style="border-color: #555; margin: 15px 0;">

    <p><i>More details might show up in a blog post later.</i></p>
</div>
"""

# "Model Details" tab: shows the static model description as raw HTML.
with gr.Blocks() as model_details_tab:
    gr.HTML(value=model_details_html)

# theme = gr.themes.Base(
#     font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
# )

# app = gr.TabbedInterface(
#     [longform, audio_inf, info_tab, model_details_tab],
#     ['Text-guided Synthesis', 'Voice-guided Synthesis', 'Intuition & Tips', 'Model Details'],
#     title="The Poor Man's TTS (Experimental)",
#     theme="Respair/[email protected]"
# )


# if __name__ == "__main__":
#     print("Launching Client Gradio App...")
#     app.queue(api_open=False, max_size=15).launch(show_api=False, share=True)




# Top-level app: one Blocks container wrapping a tabbed view of the four tabs.
# NOTE(review): the theme id contains the literal text "[email protected]",
# which looks like scrape / e-mail-obfuscation residue rather than a real
# "Name@version" tag -- confirm the intended theme string.
with gr.Blocks(
    title="The Poor Man's TTS (Experimental 🔧)",
    theme="Respair/[email protected]",
) as demo:
    # gr.DuplicateButton("Duplicate Space")
    # gr.Markdown(INTROTXT)

    # Tab order and tab names are paired by position.
    gr.TabbedInterface(
        interface_list=[audio_inf, longform, info_tab, model_details_tab],
        tab_names=[
            'Reference-guided Synthesis',
            'Text-guided Synthesis',
            'Intuition & Tips',
            'Model Details',
        ],
        title="The Poor Man's TTS (Experimental)",
        theme="Respair/[email protected]",
    )


if __name__ == "__main__":
    # Enable the request queue (at most 15 waiting, API closed), then start
    # the server without the API page and without a public share link.
    queued_demo = demo.queue(api_open=False, max_size=15)
    queued_demo.launch(show_api=False, share=False)