# Source: Hugging Face Space naonauno/dialogs2-factory (status: Paused). File size: 9,655 Bytes

import os
import sys
import torch
import gradio as gr
from pydub import AudioSegment
import mimetypes

sys.path.append('./Amphion')
import Amphion.models.vc.vevo.vevo_utils as vevo_utils
from huggingface_hub import snapshot_download

def load_model():
    """Fetch the Vevo checkpoints and assemble the inference pipeline.

    Each component (content tokenizer, content-style tokenizer,
    autoregressive transformer, flow-matching transformer, vocoder) is pulled
    from the ``amphion/Vevo`` model repo into ``./ckpts/Vevo`` with its own
    ``snapshot_download`` call restricted to that component's files.

    Returns:
        The ``vevo_utils.VevoInferencePipeline`` built from the downloaded
        checkpoints, placed on CUDA when available and on CPU otherwise.
    """
    print("Loading model...")
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Using device: {device}")

    cache_dir = "./ckpts/Vevo"
    os.makedirs(cache_dir, exist_ok=True)

    def fetch(pattern, relative_path):
        # Download only the files matching `pattern` and return the path of
        # `relative_path` inside the resulting snapshot directory.
        snapshot_root = snapshot_download(
            repo_id="amphion/Vevo",
            repo_type="model",
            cache_dir=cache_dir,
            allow_patterns=[pattern],
        )
        return os.path.join(snapshot_root, relative_path)

    # Downloads happen in the same order as the pipeline stages.
    pipeline_kwargs = {
        # Content Tokenizer
        "content_tokenizer_ckpt_path": fetch(
            "tokenizer/vq32/*", "tokenizer/vq32/hubert_large_l18_c32.pkl"
        ),
        # Content-Style Tokenizer
        "content_style_tokenizer_ckpt_path": fetch(
            "tokenizer/vq8192/*", "tokenizer/vq8192"
        ),
        # Autoregressive Transformer
        "ar_cfg_path": "./config/Vq32ToVq8192.json",
        "ar_ckpt_path": fetch(
            "contentstyle_modeling/Vq32ToVq8192/*",
            "contentstyle_modeling/Vq32ToVq8192",
        ),
        # Flow Matching Transformer
        "fmt_cfg_path": "./config/Vq8192ToMels.json",
        "fmt_ckpt_path": fetch(
            "acoustic_modeling/Vq8192ToMels/*", "acoustic_modeling/Vq8192ToMels"
        ),
        # Vocoder
        "vocoder_cfg_path": "./Amphion/models/vc/vevo/config/Vocoder.json",
        "vocoder_ckpt_path": fetch(
            "acoustic_modeling/Vocoder/*", "acoustic_modeling/Vocoder"
        ),
        "device": device,
    }

    print("Initializing pipeline...")
    pipeline = vevo_utils.VevoInferencePipeline(**pipeline_kwargs)
    print("Model loaded successfully!")
    return pipeline

def convert_to_wav(audio_path):
    """Return a path to a WAV version of ``audio_path``.

    WAV inputs are returned unchanged. MP3 inputs are decoded with pydub and
    written next to the original file with a ``.wav`` extension.

    Args:
        audio_path: Path to the uploaded audio file, or ``None``.

    Returns:
        ``None`` when ``audio_path`` is ``None``; otherwise the path of a WAV
        file (the input itself, or the newly exported conversion).

    Raises:
        ValueError: If the file is neither WAV nor MP3.
    """
    if audio_path is None:
        return None

    mime, _ = mimetypes.guess_type(audio_path)
    stem, ext = os.path.splitext(audio_path)
    ext = ext.lower()

    # mimetypes consults the platform's MIME database, so the same ".wav"
    # file can be reported under different names; accept the common aliases
    # and fall back to the extension when the lookup is inconclusive.
    wav_mimes = ('audio/wav', 'audio/x-wav', 'audio/wave', 'audio/vnd.wave')
    mp3_mimes = ('audio/mpeg', 'audio/mp3')

    if mime in wav_mimes or ext in ('.wav', '.wave'):
        return audio_path

    if mime in mp3_mimes or ext == '.mp3':
        seg = AudioSegment.from_mp3(audio_path)
        # splitext only strips the final extension of the file name, so dots
        # in directory names are left alone.
        wav_path = stem + '.wav'
        seg.export(wav_path, format="wav")
        return wav_path

    raise ValueError(f"Unsupported audio format: {mime}")

def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio, 
                 src_text, ref_text, src_language, ref_language, steps,
                 progress=gr.Progress()):
    """Run one Vevo inference request and return the path of the result.

    Args:
        mode: One of ``'voice'``, ``'timbre'`` or ``'tts'``.
        content_audio: Path of the source audio (voice/timbre modes).
        ref_style_audio: Path of the style reference audio (voice/tts modes).
        ref_timbre_audio: Path of the timbre reference audio (all modes).
        src_text: Text to synthesize (tts mode).
        ref_text: Optional transcript of the style reference (tts mode).
        src_language: Language passed as ``src_text_language`` (tts mode).
        ref_language: Language passed as ``style_ref_wav_text_language``
            (tts mode).
        steps: Forwarded to the pipeline as ``flow_matching_steps``.
        progress: Gradio progress reporter.

    Returns:
        Path of the generated WAV file (``outputs/output.wav``).

    Raises:
        gr.Error: If required inputs for the selected mode are missing, the
            mode is unknown, or any step of conversion/inference/saving fails.
    """
    try:
        output_dir = "outputs"
        os.makedirs(output_dir, exist_ok=True)
        # NOTE(review): the output path is fixed, so overlapping requests
        # would write to the same file — confirm this is acceptable.
        output_path = os.path.join(output_dir, "output.wav")

        # Convert uploaded audio files to WAV if needed; empty inputs stay None.
        content_path = convert_to_wav(content_audio) if content_audio else None
        ref_style_path = convert_to_wav(ref_style_audio) if ref_style_audio else None
        ref_timbre_path = convert_to_wav(ref_timbre_audio) if ref_timbre_audio else None

        progress(0.2, "Processing audio...")

        # Run inference based on mode
        if mode == 'voice':
            if not all([content_path, ref_style_path, ref_timbre_path]):
                raise gr.Error("Voice mode requires all audio inputs")

            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=content_path,
                src_text=None,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps
            )

        elif mode == 'timbre':
            if not all([content_path, ref_timbre_path]):
                raise gr.Error("Timbre mode requires source and timbre reference audio")

            gen_audio = inference_pipeline.inference_fm(
                src_wav_path=content_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps
            )

        elif mode == 'tts':
            if not all([ref_style_path, ref_timbre_path]) or not src_text:
                raise gr.Error("TTS mode requires style audio, timbre audio, and source text")

            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=None,
                src_text=src_text,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                style_ref_wav_text=ref_text if ref_text else None,
                src_text_language=src_language,
                style_ref_wav_text_language=ref_language,
                # Honour the steps slider here too, as the other modes do.
                flow_matching_steps=steps
            )

        else:
            # Without this branch an unexpected mode would surface as an
            # unrelated "gen_audio referenced before assignment" error.
            raise gr.Error(f"Unknown inference mode: {mode}")

        progress(0.8, "Saving generated audio...")

        # Save and return the generated audio
        vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
        return output_path

    except gr.Error:
        # Already a user-facing error; pass it through untouched.
        raise
    except Exception as e:
        # Surface any other failure in the UI while keeping the original
        # exception chained for the server log.
        raise gr.Error(str(e)) from e

# Build the inference pipeline once at import time; process_audio() reads
# this module-level name for every request.
inference_pipeline = load_model()

# Create the Gradio interface
with gr.Blocks(title="Vevo Voice Conversion") as demo:
    gr.Markdown("# Vevo Voice Conversion")

    # Mode selector: decides which inputs process_audio() requires.
    with gr.Row():
        mode = gr.Radio(
            choices=["voice", "timbre", "tts"],
            value="timbre",
            label="Inference Mode",
            interactive=True
        )

    with gr.Tabs():
        with gr.TabItem("Audio Inputs"):
            content_audio = gr.Audio(
                label="Source Audio",
                type="filepath",
                interactive=True
            )

            ref_style_audio = gr.Audio(
                label="Reference Style Audio",
                type="filepath",
                interactive=True
            )

            ref_timbre_audio = gr.Audio(
                label="Reference Timbre Audio",
                type="filepath",
                interactive=True
            )

        with gr.TabItem("Text Inputs (TTS Mode)"):
            src_text = gr.Textbox(
                label="Source Text",
                placeholder="Enter text for TTS mode",
                interactive=True
            )

            ref_text = gr.Textbox(
                label="Reference Style Text (Optional)",
                placeholder="Enter reference text",
                interactive=True
            )

            with gr.Row():
                src_language = gr.Dropdown(
                    choices=["en", "zh"],
                    value="en",
                    label="Source Language",
                    interactive=True
                )

                ref_language = gr.Dropdown(
                    choices=["en", "zh"],
                    value="en",
                    label="Reference Language",
                    interactive=True
                )

    with gr.Row():
        steps = gr.Slider(
            minimum=1,
            maximum=64,
            value=32,
            step=1,
            label="Flow Matching Steps"
        )

    with gr.Row():
        with gr.Column():
            submit_btn = gr.Button("Generate")
            error_box = gr.Textbox(label="Status", interactive=False)
        output_audio = gr.Audio(label="Generated Audio")

    def process_with_error_handling(*args):
        """Run process_audio and report the outcome in the status box.

        Returns a two-item list matching the click handler's ``outputs``:
        the generated audio path (or ``None`` on failure) and the status
        text shown in ``error_box``.
        """
        try:
            result = process_audio(*args)
        except Exception as e:
            error_msg = str(e)
            return [None, error_msg]
        # The status text reaches error_box through the returned value; the
        # component must not be mutated directly from inside the handler.
        return [result, "Success!"]

    submit_btn.click(
        fn=process_with_error_handling,
        inputs=[
            mode,
            content_audio,
            ref_style_audio,
            ref_timbre_audio,
            src_text,
            ref_text,
            src_language,
            ref_language,
            steps
        ],
        outputs=[output_audio, error_box]
    )

    # Example usage text
    gr.Markdown("""
    ## Quick Start Guide
    
    1. Select your mode:
       - **Voice**: Full voice conversion (style + timbre)
       - **Timbre**: Only voice timbre conversion
       - **TTS**: Text-to-speech with voice cloning
    
    2. For Voice/Timbre modes:
       - Upload source audio (what you want to convert)
       - Upload reference audio(s)
    
    3. For TTS mode:
       - Enter your text
       - Select language
       - Upload reference audio(s)
    
    4. Adjust steps slider:
       - Higher values = better quality but slower
       - Lower values = faster but lower quality
    
    5. Click Generate and wait for processing
    """)

if __name__ == "__main__":
    # Only start the web app when this file is run as a script: turn on
    # Gradio's request queue, then launch the server.
    demo.queue().launch()