Spaces:

naonauno
/

dialogs2-factory

Paused

App Files Community

naonauno committed on Jan 15

Commit

2bc810c

verified ·

1 Parent(s): d4d1cf4

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -66

app.py CHANGED Viewed

@@ -10,13 +10,18 @@ import Amphion.models.vc.vevo.vevo_utils as vevo_utils
 from huggingface_hub import snapshot_download
 def load_model():
     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     # Content Tokenizer
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
-        cache_dir="./ckpts/Vevo",
         allow_patterns=["tokenizer/vq32/*"],
     )
     content_tokenizer_ckpt_path = os.path.join(
@@ -27,7 +32,7 @@ def load_model():
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
-        cache_dir="./ckpts/Vevo",
         allow_patterns=["tokenizer/vq8192/*"],
     )
     content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
@@ -36,7 +41,7 @@ def load_model():
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
-        cache_dir="./ckpts/Vevo",
         allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
     )
     ar_cfg_path = "./config/Vq32ToVq8192.json"
@@ -46,7 +51,7 @@ def load_model():
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
-        cache_dir="./ckpts/Vevo",
         allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
     )
     fmt_cfg_path = "./config/Vq8192ToMels.json"
@@ -56,12 +61,13 @@ def load_model():
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
-        cache_dir="./ckpts/Vevo",
         allow_patterns=["acoustic_modeling/Vocoder/*"],
     )
     vocoder_cfg_path = "./Amphion/models/vc/vevo/config/Vocoder.json"
     vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
     pipeline = vevo_utils.VevoInferencePipeline(
         content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
         content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
@@ -73,6 +79,7 @@ def load_model():
         vocoder_ckpt_path=vocoder_ckpt_path,
         device=device
     )
     return pipeline
 def convert_to_wav(audio_path):
@@ -94,6 +101,10 @@ def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
                  src_text, ref_text, src_language, ref_language, steps,
                  progress=gr.Progress()):
     try:
         # Convert uploaded audio files to WAV if needed
         if content_audio:
             content_path = convert_to_wav(content_audio)
@@ -110,10 +121,12 @@ def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
         else:
             ref_timbre_path = None
         # Run inference based on mode
         if mode == 'voice':
             if not all([content_path, ref_style_path, ref_timbre_path]):
-                raise ValueError("Voice mode requires all audio inputs")
             gen_audio = inference_pipeline.inference_ar_and_fm(
                 src_wav_path=content_path,
@@ -125,7 +138,7 @@ def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
         elif mode == 'timbre':
             if not all([content_path, ref_timbre_path]):
-                raise ValueError("Timbre mode requires source and timbre reference audio")
             gen_audio = inference_pipeline.inference_fm(
                 src_wav_path=content_path,
@@ -134,8 +147,8 @@ def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
             )
         elif mode == 'tts':
-            if not all([ref_style_path, ref_timbre_path, src_text]):
-                raise ValueError("TTS mode requires style audio, timbre audio, and source text")
             gen_audio = inference_pipeline.inference_ar_and_fm(
                 src_wav_path=None,
@@ -147,18 +160,17 @@ def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
                 style_ref_wav_text_language=ref_language
             )
         # Save and return the generated audio
-        output_path = "output.wav"
         vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
         return output_path
     except Exception as e:
         raise gr.Error(str(e))
 # Initialize the model
-print("Loading model...")
 inference_pipeline = load_model()
-print("Model loaded successfully!")
 # Create the Gradio interface
 with gr.Blocks(title="Vevo Voice Conversion") as demo:
@@ -168,52 +180,58 @@ with gr.Blocks(title="Vevo Voice Conversion") as demo:
         mode = gr.Radio(
             choices=["voice", "timbre", "tts"],
             value="timbre",
-            label="Inference Mode"
         )
     with gr.Row():
         with gr.Column():
-            content_audio = gr.Audio(
-                label="Source Audio",
-                type="filepath"
-            )
-            ref_style_audio = gr.Audio(
-                label="Reference Style Audio",
-                type="filepath"
-            )
-            ref_timbre_audio = gr.Audio(
-                label="Reference Timbre Audio",
-                type="filepath"
-            )
         with gr.Column():
-            src_text = gr.Textbox(
-                label="Source Text",
-                placeholder="Enter text for TTS mode",
-                visible=False
-            )
-            ref_text = gr.Textbox(
-                label="Reference Style Text",
-                placeholder="Optional: Enter reference text",
-                visible=False
-            )
-            src_language = gr.Dropdown(
-                choices=["en", "zh"],
-                value="en",
-                label="Source Language",
-                visible=False
-            )
-            ref_language = gr.Dropdown(
-                choices=["en", "zh"],
-                value="en",
-                label="Reference Language",
-                visible=False
-            )
     with gr.Row():
         steps = gr.Slider(
@@ -229,24 +247,18 @@ with gr.Blocks(title="Vevo Voice Conversion") as demo:
         output_audio = gr.Audio(label="Generated Audio")
     # Handle visibility of components based on mode
-    def update_visibility(mode):
         is_tts = mode == "tts"
-        is_voice = mode == "voice"
-        is_timbre = mode == "timbre"
         return {
-            content_audio: not is_tts,
-            ref_style_audio: not is_timbre,
-            src_text: is_tts,
-            ref_text: is_tts,
-            src_language: is_tts,
-            ref_language: is_tts
         }
     mode.change(
-        fn=update_visibility,
         inputs=[mode],
-        outputs=[content_audio, ref_style_audio, src_text, ref_text, src_language, ref_language]
     )
     # Handle generation
@@ -267,4 +279,4 @@ with gr.Blocks(title="Vevo Voice Conversion") as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 from huggingface_hub import snapshot_download
 def load_model():
+    print("Loading model...")
     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    print(f"Using device: {device}")
+    cache_dir = "./ckpts/Vevo"
+    os.makedirs(cache_dir, exist_ok=True)
     # Content Tokenizer
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
+        cache_dir=cache_dir,
         allow_patterns=["tokenizer/vq32/*"],
     )
     content_tokenizer_ckpt_path = os.path.join(
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
+        cache_dir=cache_dir,
         allow_patterns=["tokenizer/vq8192/*"],
     )
     content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
+        cache_dir=cache_dir,
         allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
     )
     ar_cfg_path = "./config/Vq32ToVq8192.json"
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
+        cache_dir=cache_dir,
         allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
     )
     fmt_cfg_path = "./config/Vq8192ToMels.json"
     local_dir = snapshot_download(
         repo_id="amphion/Vevo",
         repo_type="model",
+        cache_dir=cache_dir,
         allow_patterns=["acoustic_modeling/Vocoder/*"],
     )
     vocoder_cfg_path = "./Amphion/models/vc/vevo/config/Vocoder.json"
     vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+    print("Initializing pipeline...")
     pipeline = vevo_utils.VevoInferencePipeline(
         content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
         content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
         vocoder_ckpt_path=vocoder_ckpt_path,
         device=device
     )
+    print("Model loaded successfully!")
     return pipeline
 def convert_to_wav(audio_path):
                  src_text, ref_text, src_language, ref_language, steps,
                  progress=gr.Progress()):
     try:
+        output_dir = "outputs"
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, "output.wav")
         # Convert uploaded audio files to WAV if needed
         if content_audio:
             content_path = convert_to_wav(content_audio)
         else:
             ref_timbre_path = None
+        progress(0.2, "Processing audio...")
         # Run inference based on mode
         if mode == 'voice':
             if not all([content_path, ref_style_path, ref_timbre_path]):
+                raise gr.Error("Voice mode requires all audio inputs")
             gen_audio = inference_pipeline.inference_ar_and_fm(
                 src_wav_path=content_path,
         elif mode == 'timbre':
             if not all([content_path, ref_timbre_path]):
+                raise gr.Error("Timbre mode requires source and timbre reference audio")
             gen_audio = inference_pipeline.inference_fm(
                 src_wav_path=content_path,
             )
         elif mode == 'tts':
+            if not all([ref_style_path, ref_timbre_path]) or not src_text:
+                raise gr.Error("TTS mode requires style audio, timbre audio, and source text")
             gen_audio = inference_pipeline.inference_ar_and_fm(
                 src_wav_path=None,
                 style_ref_wav_text_language=ref_language
             )
+        progress(0.8, "Saving generated audio...")
         # Save and return the generated audio
         vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
         return output_path
     except Exception as e:
         raise gr.Error(str(e))
 # Initialize the model
 inference_pipeline = load_model()
 # Create the Gradio interface
 with gr.Blocks(title="Vevo Voice Conversion") as demo:
         mode = gr.Radio(
             choices=["voice", "timbre", "tts"],
             value="timbre",
+            label="Inference Mode",
+            interactive=True
         )
     with gr.Row():
         with gr.Column():
+            with gr.Group(visible=True) as audio_inputs:
+                content_audio = gr.Audio(
+                    label="Source Audio",
+                    type="filepath",
+                    interactive=True
+                )
+                ref_style_audio = gr.Audio(
+                    label="Reference Style Audio",
+                    type="filepath",
+                    interactive=True
+                )
+                ref_timbre_audio = gr.Audio(
+                    label="Reference Timbre Audio",
+                    type="filepath",
+                    interactive=True
+                )
         with gr.Column():
+            with gr.Group(visible=False) as text_inputs:
+                src_text = gr.Textbox(
+                    label="Source Text",
+                    placeholder="Enter text for TTS mode",
+                    interactive=True
+                )
+                ref_text = gr.Textbox(
+                    label="Reference Style Text",
+                    placeholder="Optional: Enter reference text",
+                    interactive=True
+                )
+                src_language = gr.Dropdown(
+                    choices=["en", "zh"],
+                    value="en",
+                    label="Source Language",
+                    interactive=True
+                )
+                ref_language = gr.Dropdown(
+                    choices=["en", "zh"],
+                    value="en",
+                    label="Reference Language",
+                    interactive=True
+                )
     with gr.Row():
         steps = gr.Slider(
         output_audio = gr.Audio(label="Generated Audio")
     # Handle visibility of components based on mode
+    def update_interface(mode):
         is_tts = mode == "tts"
         return {
+            audio_inputs: not is_tts,
+            text_inputs: is_tts,
+            ref_style_audio: mode != "timbre",
         }
     mode.change(
+        fn=update_interface,
         inputs=[mode],
+        outputs=[audio_inputs, text_inputs, ref_style_audio]
     )
     # Handle generation
     )
 if __name__ == "__main__":
+    demo.queue().launch()