积极的屁孩 committed on
Commit
a8377f8
·
1 Parent(s): 980462e
Files changed (1) hide show
  1. app.py +92 -93
app.py CHANGED
@@ -5,12 +5,12 @@ import site
5
  import json
6
  import torch
7
  import gradio as gr
8
- import gradio.spaces as spaces
9
  import torchaudio
10
  import numpy as np
11
  from huggingface_hub import snapshot_download, hf_hub_download
12
  import subprocess
13
  import re
 
14
 
15
  def install_espeak():
16
  """检测并安装espeak-ng依赖"""
@@ -351,6 +351,7 @@ def get_pipeline(pipeline_type):
351
  return inference_pipeline
352
 
353
  # 实现VEVO功能函数
 
354
  def vevo_style(content_wav, style_wav):
355
  temp_content_path = "wav/temp_content.wav"
356
  temp_style_path = "wav/temp_style.wav"
@@ -433,6 +434,7 @@ def vevo_style(content_wav, style_wav):
433
  traceback.print_exc()
434
  raise e
435
 
 
436
  def vevo_timbre(content_wav, reference_wav):
437
  temp_content_path = "wav/temp_content.wav"
438
  temp_reference_path = "wav/temp_reference.wav"
@@ -526,6 +528,7 @@ def vevo_timbre(content_wav, reference_wav):
526
  traceback.print_exc()
527
  raise e
528
 
 
529
  def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
530
  temp_content_path = "wav/temp_content.wav"
531
  temp_style_path = "wav/temp_style.wav"
@@ -647,6 +650,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
647
  traceback.print_exc()
648
  raise e
649
 
 
650
  def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
651
  temp_ref_path = "wav/temp_ref.wav"
652
  temp_timbre_path = "wav/temp_timbre.wav"
@@ -750,98 +754,93 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
750
  raise e
751
 
752
  # 创建Gradio界面
753
- @spaces.GPU
754
- def run_app():
755
- with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
756
- gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
757
- # 添加链接标签行
758
- with gr.Row(elem_id="links_row"):
759
- gr.HTML("""
760
- <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
761
- <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
762
- <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
763
- </a>
764
- <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
765
- <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
766
- </a>
767
- <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
768
- <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
769
- </a>
770
- <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
771
- <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
772
- </a>
773
- </div>
774
- """)
775
-
776
- with gr.Tab("Vevo-Timbre"):
777
- gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
778
- with gr.Row():
779
- with gr.Column():
780
- timbre_content = gr.Audio(label="Source Audio", type="numpy")
781
- timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
782
- timbre_button = gr.Button("Generate")
783
- with gr.Column():
784
- timbre_output = gr.Audio(label="Result")
785
- timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
786
-
787
- with gr.Tab("Vevo-Style"):
788
- gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
789
- with gr.Row():
790
- with gr.Column():
791
- style_content = gr.Audio(label="Source Audio", type="numpy")
792
- style_reference = gr.Audio(label="Style Reference", type="numpy")
793
- style_button = gr.Button("Generate")
794
- with gr.Column():
795
- style_output = gr.Audio(label="Result")
796
- style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
797
-
798
- with gr.Tab("Vevo-Voice"):
799
- gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
800
- with gr.Row():
801
- with gr.Column():
802
- voice_content = gr.Audio(label="Source Audio", type="numpy")
803
- voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
804
- voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
805
- voice_button = gr.Button("Generate")
806
- with gr.Column():
807
- voice_output = gr.Audio(label="Result")
808
- voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
809
-
810
-
811
-
812
- with gr.Tab("Vevo-TTS"):
813
- gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
814
- with gr.Row():
815
- with gr.Column():
816
- tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
817
- tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
818
- tts_reference = gr.Audio(label="Style Reference", type="numpy")
819
- tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
820
- tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
821
- tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
822
- tts_button = gr.Button("Generate")
823
- with gr.Column():
824
- tts_output = gr.Audio(label="Result")
825
-
826
- tts_button.click(
827
- vevo_tts,
828
- inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
829
- outputs=tts_output
830
- )
831
-
832
- gr.Markdown("""
833
- ## About VEVO
834
- VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
835
- 1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
836
- 2. **Vevo-Timbre**: Maintains style but transfers timbre
837
- 3. **Vevo-Voice**: Transfers both style and timbre with separate references
838
- 4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
839
-
840
- For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
841
  """)
842
 
843
- # 启动应用
844
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
845
 
846
- # Run the app
847
- run_app()
 
5
  import json
6
  import torch
7
  import gradio as gr
 
8
  import torchaudio
9
  import numpy as np
10
  from huggingface_hub import snapshot_download, hf_hub_download
11
  import subprocess
12
  import re
13
+ import spaces
14
 
15
  def install_espeak():
16
  """检测并安装espeak-ng依赖"""
 
351
  return inference_pipeline
352
 
353
  # 实现VEVO功能函数
354
+ @spaces.GPU()
355
  def vevo_style(content_wav, style_wav):
356
  temp_content_path = "wav/temp_content.wav"
357
  temp_style_path = "wav/temp_style.wav"
 
434
  traceback.print_exc()
435
  raise e
436
 
437
+ @spaces.GPU()
438
  def vevo_timbre(content_wav, reference_wav):
439
  temp_content_path = "wav/temp_content.wav"
440
  temp_reference_path = "wav/temp_reference.wav"
 
528
  traceback.print_exc()
529
  raise e
530
 
531
+ @spaces.GPU()
532
  def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
533
  temp_content_path = "wav/temp_content.wav"
534
  temp_style_path = "wav/temp_style.wav"
 
650
  traceback.print_exc()
651
  raise e
652
 
653
+ @spaces.GPU()
654
  def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
655
  temp_ref_path = "wav/temp_ref.wav"
656
  temp_timbre_path = "wav/temp_timbre.wav"
 
754
  raise e
755
 
756
  # 创建Gradio界面
757
+ with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
758
+ gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
759
+ # 添加链接标签行
760
+ with gr.Row(elem_id="links_row"):
761
+ gr.HTML("""
762
+ <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
763
+ <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
764
+ <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
765
+ </a>
766
+ <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
767
+ <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
768
+ </a>
769
+ <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
770
+ <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
771
+ </a>
772
+ <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
773
+ <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
774
+ </a>
775
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  """)
777
 
778
+ with gr.Tab("Vevo-Timbre"):
779
+ gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
780
+ with gr.Row():
781
+ with gr.Column():
782
+ timbre_content = gr.Audio(label="Source Audio", type="numpy")
783
+ timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
784
+ timbre_button = gr.Button("Generate")
785
+ with gr.Column():
786
+ timbre_output = gr.Audio(label="Result")
787
+ timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
788
+
789
+ with gr.Tab("Vevo-Style"):
790
+ gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
791
+ with gr.Row():
792
+ with gr.Column():
793
+ style_content = gr.Audio(label="Source Audio", type="numpy")
794
+ style_reference = gr.Audio(label="Style Reference", type="numpy")
795
+ style_button = gr.Button("Generate")
796
+ with gr.Column():
797
+ style_output = gr.Audio(label="Result")
798
+ style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
799
+
800
+ with gr.Tab("Vevo-Voice"):
801
+ gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
802
+ with gr.Row():
803
+ with gr.Column():
804
+ voice_content = gr.Audio(label="Source Audio", type="numpy")
805
+ voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
806
+ voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
807
+ voice_button = gr.Button("Generate")
808
+ with gr.Column():
809
+ voice_output = gr.Audio(label="Result")
810
+ voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
811
+
812
+
813
+
814
+ with gr.Tab("Vevo-TTS"):
815
+ gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
816
+ with gr.Row():
817
+ with gr.Column():
818
+ tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
819
+ tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
820
+ tts_reference = gr.Audio(label="Style Reference", type="numpy")
821
+ tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
822
+ tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
823
+ tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
824
+ tts_button = gr.Button("Generate")
825
+ with gr.Column():
826
+ tts_output = gr.Audio(label="Result")
827
+
828
+ tts_button.click(
829
+ vevo_tts,
830
+ inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
831
+ outputs=tts_output
832
+ )
833
+
834
+ gr.Markdown("""
835
+ ## About VEVO
836
+ VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
837
+ 1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
838
+ 2. **Vevo-Timbre**: Maintains style but transfers timbre
839
+ 3. **Vevo-Voice**: Transfers both style and timbre with separate references
840
+ 4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
841
+
842
+ For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
843
+ """)
844
 
845
+ # 启动应用
846
+ demo.launch()