Spaces:
Running
on
Zero
Running
on
Zero
积极的屁孩
committed on
Commit
·
a8377f8
1
Parent(s):
980462e
add space
Browse files
app.py
CHANGED
@@ -5,12 +5,12 @@ import site
|
|
5 |
import json
|
6 |
import torch
|
7 |
import gradio as gr
|
8 |
-
import gradio.spaces as spaces
|
9 |
import torchaudio
|
10 |
import numpy as np
|
11 |
from huggingface_hub import snapshot_download, hf_hub_download
|
12 |
import subprocess
|
13 |
import re
|
|
|
14 |
|
15 |
def install_espeak():
|
16 |
"""检测并安装espeak-ng依赖"""
|
@@ -351,6 +351,7 @@ def get_pipeline(pipeline_type):
|
|
351 |
return inference_pipeline
|
352 |
|
353 |
# 实现VEVO功能函数
|
|
|
354 |
def vevo_style(content_wav, style_wav):
|
355 |
temp_content_path = "wav/temp_content.wav"
|
356 |
temp_style_path = "wav/temp_style.wav"
|
@@ -433,6 +434,7 @@ def vevo_style(content_wav, style_wav):
|
|
433 |
traceback.print_exc()
|
434 |
raise e
|
435 |
|
|
|
436 |
def vevo_timbre(content_wav, reference_wav):
|
437 |
temp_content_path = "wav/temp_content.wav"
|
438 |
temp_reference_path = "wav/temp_reference.wav"
|
@@ -526,6 +528,7 @@ def vevo_timbre(content_wav, reference_wav):
|
|
526 |
traceback.print_exc()
|
527 |
raise e
|
528 |
|
|
|
529 |
def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
530 |
temp_content_path = "wav/temp_content.wav"
|
531 |
temp_style_path = "wav/temp_style.wav"
|
@@ -647,6 +650,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
|
647 |
traceback.print_exc()
|
648 |
raise e
|
649 |
|
|
|
650 |
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
|
651 |
temp_ref_path = "wav/temp_ref.wav"
|
652 |
temp_timbre_path = "wav/temp_timbre.wav"
|
@@ -750,98 +754,93 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
|
|
750 |
raise e
|
751 |
|
752 |
# 创建Gradio界面
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
768 |
-
|
769 |
-
|
770 |
-
|
771 |
-
|
772 |
-
</a>
|
773 |
-
</div>
|
774 |
-
""")
|
775 |
-
|
776 |
-
with gr.Tab("Vevo-Timbre"):
|
777 |
-
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
778 |
-
with gr.Row():
|
779 |
-
with gr.Column():
|
780 |
-
timbre_content = gr.Audio(label="Source Audio", type="numpy")
|
781 |
-
timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
782 |
-
timbre_button = gr.Button("Generate")
|
783 |
-
with gr.Column():
|
784 |
-
timbre_output = gr.Audio(label="Result")
|
785 |
-
timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
|
786 |
-
|
787 |
-
with gr.Tab("Vevo-Style"):
|
788 |
-
gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
|
789 |
-
with gr.Row():
|
790 |
-
with gr.Column():
|
791 |
-
style_content = gr.Audio(label="Source Audio", type="numpy")
|
792 |
-
style_reference = gr.Audio(label="Style Reference", type="numpy")
|
793 |
-
style_button = gr.Button("Generate")
|
794 |
-
with gr.Column():
|
795 |
-
style_output = gr.Audio(label="Result")
|
796 |
-
style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
|
797 |
-
|
798 |
-
with gr.Tab("Vevo-Voice"):
|
799 |
-
gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
|
800 |
-
with gr.Row():
|
801 |
-
with gr.Column():
|
802 |
-
voice_content = gr.Audio(label="Source Audio", type="numpy")
|
803 |
-
voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
|
804 |
-
voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
805 |
-
voice_button = gr.Button("Generate")
|
806 |
-
with gr.Column():
|
807 |
-
voice_output = gr.Audio(label="Result")
|
808 |
-
voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
with gr.Tab("Vevo-TTS"):
|
813 |
-
gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
|
814 |
-
with gr.Row():
|
815 |
-
with gr.Column():
|
816 |
-
tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
|
817 |
-
tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
|
818 |
-
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
819 |
-
tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
|
820 |
-
tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
|
821 |
-
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
822 |
-
tts_button = gr.Button("Generate")
|
823 |
-
with gr.Column():
|
824 |
-
tts_output = gr.Audio(label="Result")
|
825 |
-
|
826 |
-
tts_button.click(
|
827 |
-
vevo_tts,
|
828 |
-
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
|
829 |
-
outputs=tts_output
|
830 |
-
)
|
831 |
-
|
832 |
-
gr.Markdown("""
|
833 |
-
## About VEVO
|
834 |
-
VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
|
835 |
-
1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
|
836 |
-
2. **Vevo-Timbre**: Maintains style but transfers timbre
|
837 |
-
3. **Vevo-Voice**: Transfers both style and timbre with separate references
|
838 |
-
4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
|
839 |
-
|
840 |
-
For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
|
841 |
""")
|
842 |
|
843 |
-
|
844 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
845 |
|
846 |
-
#
|
847 |
-
|
|
|
5 |
import json
|
6 |
import torch
|
7 |
import gradio as gr
|
|
|
8 |
import torchaudio
|
9 |
import numpy as np
|
10 |
from huggingface_hub import snapshot_download, hf_hub_download
|
11 |
import subprocess
|
12 |
import re
|
13 |
+
import spaces
|
14 |
|
15 |
def install_espeak():
|
16 |
"""检测并安装espeak-ng依赖"""
|
|
|
351 |
return inference_pipeline
|
352 |
|
353 |
# 实现VEVO功能函数
|
354 |
+
@spaces.GPU()
|
355 |
def vevo_style(content_wav, style_wav):
|
356 |
temp_content_path = "wav/temp_content.wav"
|
357 |
temp_style_path = "wav/temp_style.wav"
|
|
|
434 |
traceback.print_exc()
|
435 |
raise e
|
436 |
|
437 |
+
@spaces.GPU()
|
438 |
def vevo_timbre(content_wav, reference_wav):
|
439 |
temp_content_path = "wav/temp_content.wav"
|
440 |
temp_reference_path = "wav/temp_reference.wav"
|
|
|
528 |
traceback.print_exc()
|
529 |
raise e
|
530 |
|
531 |
+
@spaces.GPU()
|
532 |
def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
533 |
temp_content_path = "wav/temp_content.wav"
|
534 |
temp_style_path = "wav/temp_style.wav"
|
|
|
650 |
traceback.print_exc()
|
651 |
raise e
|
652 |
|
653 |
+
@spaces.GPU()
|
654 |
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
|
655 |
temp_ref_path = "wav/temp_ref.wav"
|
656 |
temp_timbre_path = "wav/temp_timbre.wav"
|
|
|
754 |
raise e
|
755 |
|
756 |
# 创建Gradio界面
|
757 |
+
with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
|
758 |
+
gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
|
759 |
+
# 添加链接标签行
|
760 |
+
with gr.Row(elem_id="links_row"):
|
761 |
+
gr.HTML("""
|
762 |
+
<div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
|
763 |
+
<a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
|
764 |
+
<img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
|
765 |
+
</a>
|
766 |
+
<a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
|
767 |
+
<img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
|
768 |
+
</a>
|
769 |
+
<a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
|
770 |
+
<img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
|
771 |
+
</a>
|
772 |
+
<a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
|
773 |
+
<img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
|
774 |
+
</a>
|
775 |
+
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
776 |
""")
|
777 |
|
778 |
+
with gr.Tab("Vevo-Timbre"):
|
779 |
+
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
780 |
+
with gr.Row():
|
781 |
+
with gr.Column():
|
782 |
+
timbre_content = gr.Audio(label="Source Audio", type="numpy")
|
783 |
+
timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
784 |
+
timbre_button = gr.Button("Generate")
|
785 |
+
with gr.Column():
|
786 |
+
timbre_output = gr.Audio(label="Result")
|
787 |
+
timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
|
788 |
+
|
789 |
+
with gr.Tab("Vevo-Style"):
|
790 |
+
gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
|
791 |
+
with gr.Row():
|
792 |
+
with gr.Column():
|
793 |
+
style_content = gr.Audio(label="Source Audio", type="numpy")
|
794 |
+
style_reference = gr.Audio(label="Style Reference", type="numpy")
|
795 |
+
style_button = gr.Button("Generate")
|
796 |
+
with gr.Column():
|
797 |
+
style_output = gr.Audio(label="Result")
|
798 |
+
style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
|
799 |
+
|
800 |
+
with gr.Tab("Vevo-Voice"):
|
801 |
+
gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
|
802 |
+
with gr.Row():
|
803 |
+
with gr.Column():
|
804 |
+
voice_content = gr.Audio(label="Source Audio", type="numpy")
|
805 |
+
voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
|
806 |
+
voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
807 |
+
voice_button = gr.Button("Generate")
|
808 |
+
with gr.Column():
|
809 |
+
voice_output = gr.Audio(label="Result")
|
810 |
+
voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
|
811 |
+
|
812 |
+
|
813 |
+
|
814 |
+
with gr.Tab("Vevo-TTS"):
|
815 |
+
gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
|
816 |
+
with gr.Row():
|
817 |
+
with gr.Column():
|
818 |
+
tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
|
819 |
+
tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
|
820 |
+
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
821 |
+
tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
|
822 |
+
tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
|
823 |
+
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
824 |
+
tts_button = gr.Button("Generate")
|
825 |
+
with gr.Column():
|
826 |
+
tts_output = gr.Audio(label="Result")
|
827 |
+
|
828 |
+
tts_button.click(
|
829 |
+
vevo_tts,
|
830 |
+
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
|
831 |
+
outputs=tts_output
|
832 |
+
)
|
833 |
+
|
834 |
+
gr.Markdown("""
|
835 |
+
## About VEVO
|
836 |
+
VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
|
837 |
+
1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
|
838 |
+
2. **Vevo-Timbre**: Maintains style but transfers timbre
|
839 |
+
3. **Vevo-Voice**: Transfers both style and timbre with separate references
|
840 |
+
4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
|
841 |
+
|
842 |
+
For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
|
843 |
+
""")
|
844 |
|
845 |
+
# 启动应用
|
846 |
+
demo.launch()
|