Spaces:

amphion
/

Vevo

Running on Zero

App Files Community

积极的屁孩 committed 10 days ago

Commit

a8377f8

1 Parent(s): 980462e

add space

Browse files

Files changed (1) hide show

app.py +92 -93

app.py CHANGED Viewed

@@ -5,12 +5,12 @@ import site
 import json
 import torch
 import gradio as gr
-import gradio.spaces as spaces
 import torchaudio
 import numpy as np
 from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
 def install_espeak():
     """检测并安装espeak-ng依赖"""
@@ -351,6 +351,7 @@ def get_pipeline(pipeline_type):
     return inference_pipeline
 # 实现VEVO功能函数
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
@@ -433,6 +434,7 @@ def vevo_style(content_wav, style_wav):
         traceback.print_exc()
         raise e
 def vevo_timbre(content_wav, reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_reference_path = "wav/temp_reference.wav"
@@ -526,6 +528,7 @@ def vevo_timbre(content_wav, reference_wav):
         traceback.print_exc()
         raise e
 def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
@@ -647,6 +650,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
         traceback.print_exc()
         raise e
 def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
     temp_ref_path = "wav/temp_ref.wav"
     temp_timbre_path = "wav/temp_timbre.wav"
@@ -750,98 +754,93 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         raise e
 # 创建Gradio界面
-@spaces.GPU
-def run_app():
-    with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
-        gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
-        # 添加链接标签行
-        with gr.Row(elem_id="links_row"):
-            gr.HTML("""
-            <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
-                <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
-                    <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
-                </a>
-                <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
-                    <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
-                </a>
-                <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
-                    <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
-                </a>
-                <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
-                    <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
-                </a>
-            </div>
-            """)
-        with gr.Tab("Vevo-Timbre"):
-            gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
-            with gr.Row():
-                with gr.Column():
-                    timbre_content = gr.Audio(label="Source Audio", type="numpy")
-                    timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                    timbre_button = gr.Button("Generate")
-                with gr.Column():
-                    timbre_output = gr.Audio(label="Result")
-            timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
-        with gr.Tab("Vevo-Style"):
-            gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
-            with gr.Row():
-                with gr.Column():
-                    style_content = gr.Audio(label="Source Audio", type="numpy")
-                    style_reference = gr.Audio(label="Style Reference", type="numpy")
-                    style_button = gr.Button("Generate")
-                with gr.Column():
-                    style_output = gr.Audio(label="Result")
-            style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
-        with gr.Tab("Vevo-Voice"):
-            gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
-            with gr.Row():
-                with gr.Column():
-                    voice_content = gr.Audio(label="Source Audio", type="numpy")
-                    voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
-                    voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                    voice_button = gr.Button("Generate")
-                with gr.Column():
-                    voice_output = gr.Audio(label="Result")
-            voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
-        with gr.Tab("Vevo-TTS"):
-            gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
-            with gr.Row():
-                with gr.Column():
-                    tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
-                    tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
-                    tts_reference = gr.Audio(label="Style Reference", type="numpy")
-                    tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
-                    tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
-                    tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                    tts_button = gr.Button("Generate")
-                with gr.Column():
-                    tts_output = gr.Audio(label="Result")
-            tts_button.click(
-                vevo_tts,
-                inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
-                outputs=tts_output
-            )
-        gr.Markdown("""
-        ## About VEVO
-        VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
-        1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
-        2. **Vevo-Timbre**: Maintains style but transfers timbre
-        3. **Vevo-Voice**: Transfers both style and timbre with separate references
-        4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
-        For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
         """)
-    # 启动应用
-    demo.launch()
-# Run the app
-run_app()

 import json
 import torch
 import gradio as gr
 import torchaudio
 import numpy as np
 from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
+import spaces
 def install_espeak():
     """检测并安装espeak-ng依赖"""
     return inference_pipeline
 # 实现VEVO功能函数
+@spaces.GPU()
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
         traceback.print_exc()
         raise e
+@spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_reference_path = "wav/temp_reference.wav"
         traceback.print_exc()
         raise e
+@spaces.GPU()
 def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
         traceback.print_exc()
         raise e
+@spaces.GPU()
 def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
     temp_ref_path = "wav/temp_ref.wav"
     temp_timbre_path = "wav/temp_timbre.wav"
         raise e
 # 创建Gradio界面
+with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
+    gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
+    # 添加链接标签行
+    with gr.Row(elem_id="links_row"):
+        gr.HTML("""
+        <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
+            <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
+                <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
+            </a>
+            <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
+                <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
+            </a>
+            <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
+                <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
+            </a>
+            <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
+                <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
+            </a>
+        </div>
         """)
+    with gr.Tab("Vevo-Timbre"):
+        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
+        with gr.Row():
+            with gr.Column():
+                timbre_content = gr.Audio(label="Source Audio", type="numpy")
+                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                timbre_button = gr.Button("Generate")
+            with gr.Column():
+                timbre_output = gr.Audio(label="Result")
+        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
+    with gr.Tab("Vevo-Style"):
+        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
+        with gr.Row():
+            with gr.Column():
+                style_content = gr.Audio(label="Source Audio", type="numpy")
+                style_reference = gr.Audio(label="Style Reference", type="numpy")
+                style_button = gr.Button("Generate")
+            with gr.Column():
+                style_output = gr.Audio(label="Result")
+        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
+    with gr.Tab("Vevo-Voice"):
+        gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
+        with gr.Row():
+            with gr.Column():
+                voice_content = gr.Audio(label="Source Audio", type="numpy")
+                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
+                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                voice_button = gr.Button("Generate")
+            with gr.Column():
+                voice_output = gr.Audio(label="Result")
+        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
+    with gr.Tab("Vevo-TTS"):
+        gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
+        with gr.Row():
+            with gr.Column():
+                tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
+                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
+                tts_reference = gr.Audio(label="Style Reference", type="numpy")
+                tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
+                tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
+                tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                tts_button = gr.Button("Generate")
+            with gr.Column():
+                tts_output = gr.Audio(label="Result")
+        tts_button.click(
+            vevo_tts,
+            inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
+            outputs=tts_output
+        )
+    gr.Markdown("""
+    ## About VEVO
+    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
+    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
+    2. **Vevo-Timbre**: Maintains style but transfers timbre
+    3. **Vevo-Voice**: Transfers both style and timbre with separate references
+    4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
+    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
+    """)
+# 启动应用
+demo.launch()