Spaces:

amphion
/

Vevo

Running on Zero

App Files Community

积极的屁孩 committed 6 days ago

Commit

cba1c8b

1 Parent(s): 3b944a1

download before infer

Browse files

Files changed (1) hide show

app.py +310 -94

app.py CHANGED Viewed

@@ -12,6 +12,17 @@ import subprocess
 import re
 import spaces
 def install_espeak():
     """Detect and install espeak-ng dependency"""
     try:
@@ -150,6 +161,10 @@ from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wa
 # Download and setup config files
 def setup_configs():
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
@@ -175,6 +190,8 @@ def setup_configs():
                 subprocess.run(["cp", file_data, file_path])
             except Exception as e:
                 print(f"Error downloading config file {file}: {e}")
 setup_configs()
@@ -192,54 +209,102 @@ def get_pipeline(pipeline_type):
     # Initialize pipeline based on the required pipeline type
     if pipeline_type == "style" or pipeline_type == "voice":
         # Download Content Tokenizer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq32/*"],
-        )
-        content_tokenizer_ckpt_path = os.path.join(
-            local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
-        )
         # Download Content-Style Tokenizer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq8192/*"],
-        )
-        content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
         # Download Autoregressive Transformer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
-        )
-        ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
-        ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
         # Download Flow Matching Transformer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-        )
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
         # Download Vocoder
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vocoder/*"],
-        )
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -256,33 +321,62 @@ def get_pipeline(pipeline_type):
     elif pipeline_type == "timbre":
         # Download Content-Style Tokenizer (only needed for timbre)
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq8192/*"],
-        )
-        content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
         # Download Flow Matching Transformer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-        )
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
         # Download Vocoder
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vocoder/*"],
-        )
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -296,43 +390,82 @@ def get_pipeline(pipeline_type):
     elif pipeline_type == "tts":
         # Download Content-Style Tokenizer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq8192/*"],
-        )
-        content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
         # Download Autoregressive Transformer (TTS specific)
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
-        )
-        ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
-        ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
         # Download Flow Matching Transformer
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-        )
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
         # Download Vocoder
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vocoder/*"],
-        )
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -761,6 +894,89 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         traceback.print_exc()
         raise e
 # Create Gradio interface
 with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
     gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")

 import re
 import spaces
+# Global registry tracking which resources have already been downloaded.
+# Every flag starts as False; the download code elsewhere in this file sets a
+# flag to True once the corresponding resource has been fetched.
+downloaded_resources = {
+    "configs": False,
+    "tokenizer_vq32": False,
+    "tokenizer_vq8192": False,
+    "ar_Vq32ToVq8192": False,
+    "ar_PhoneToVq8192": False,
+    "fmt_Vq8192ToMels": False,
+    "vocoder": False
+}
 def install_espeak():
     """Detect and install espeak-ng dependency"""
     try:
 # Download and setup config files
 def setup_configs():
+    if downloaded_resources["configs"]:
+        print("Config files already downloaded, skipping...")
+        return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
                 subprocess.run(["cp", file_data, file_path])
             except Exception as e:
                 print(f"Error downloading config file {file}: {e}")
+    downloaded_resources["configs"] = True
 setup_configs()
     # Initialize pipeline based on the required pipeline type
     if pipeline_type == "style" or pipeline_type == "voice":
         # Download Content Tokenizer
+        content_tokenizer_ckpt_path = ""
+        if not downloaded_resources["tokenizer_vq32"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["tokenizer/vq32/*"],
+            )
+            content_tokenizer_ckpt_path = os.path.join(
+                local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
+            )
+            downloaded_resources["tokenizer_vq32"] = True
+            print("Downloaded Content Tokenizer (vq32)")
+        else:
+            print("Content Tokenizer (vq32) already downloaded, skipping...")
+            content_tokenizer_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq32/hubert_large_l18_c32.pkl"
+            )
         # Download Content-Style Tokenizer
+        content_style_tokenizer_ckpt_path = ""
+        if not downloaded_resources["tokenizer_vq8192"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            downloaded_resources["tokenizer_vq8192"] = True
+            print("Downloaded Content-Style Tokenizer (vq8192)")
+        else:
+            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
+            content_style_tokenizer_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
+            )
         # Download Autoregressive Transformer
+        ar_ckpt_path = ""
+        if not downloaded_resources["ar_Vq32ToVq8192"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
+            )
+            ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
+            downloaded_resources["ar_Vq32ToVq8192"] = True
+            print("Downloaded Autoregressive Transformer (Vq32ToVq8192)")
+        else:
+            print("Autoregressive Transformer (Vq32ToVq8192) already downloaded, skipping...")
+            ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
+            ar_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/Vq32ToVq8192"
+            )
         # Download Flow Matching Transformer
+        fmt_ckpt_path = ""
+        if not downloaded_resources["fmt_Vq8192ToMels"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            downloaded_resources["fmt_Vq8192ToMels"] = True
+            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
+        else:
+            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
+            )
         # Download Vocoder
+        vocoder_ckpt_path = ""
+        if not downloaded_resources["vocoder"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            downloaded_resources["vocoder"] = True
+            print("Downloaded Vocoder")
+        else:
+            print("Vocoder already downloaded, skipping...")
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
+            )
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
     elif pipeline_type == "timbre":
         # Download Content-Style Tokenizer (only needed for timbre)
+        content_style_tokenizer_ckpt_path = ""
+        if not downloaded_resources["tokenizer_vq8192"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            downloaded_resources["tokenizer_vq8192"] = True
+            print("Downloaded Content-Style Tokenizer (vq8192)")
+        else:
+            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
+            content_style_tokenizer_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
+            )
         # Download Flow Matching Transformer
+        fmt_ckpt_path = ""
+        if not downloaded_resources["fmt_Vq8192ToMels"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            downloaded_resources["fmt_Vq8192ToMels"] = True
+            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
+        else:
+            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
+            )
         # Download Vocoder
+        vocoder_ckpt_path = ""
+        if not downloaded_resources["vocoder"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            downloaded_resources["vocoder"] = True
+            print("Downloaded Vocoder")
+        else:
+            print("Vocoder already downloaded, skipping...")
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
+            )
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
     elif pipeline_type == "tts":
         # Download Content-Style Tokenizer
+        content_style_tokenizer_ckpt_path = ""
+        if not downloaded_resources["tokenizer_vq8192"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            downloaded_resources["tokenizer_vq8192"] = True
+            print("Downloaded Content-Style Tokenizer (vq8192)")
+        else:
+            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
+            content_style_tokenizer_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
+            )
         # Download Autoregressive Transformer (TTS specific)
+        ar_ckpt_path = ""
+        if not downloaded_resources["ar_PhoneToVq8192"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
+            )
+            ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
+            downloaded_resources["ar_PhoneToVq8192"] = True
+            print("Downloaded Autoregressive Transformer (PhoneToVq8192)")
+        else:
+            print("Autoregressive Transformer (PhoneToVq8192) already downloaded, skipping...")
+            ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
+            ar_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/PhoneToVq8192"
+            )
         # Download Flow Matching Transformer
+        fmt_ckpt_path = ""
+        if not downloaded_resources["fmt_Vq8192ToMels"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            downloaded_resources["fmt_Vq8192ToMels"] = True
+            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
+        else:
+            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
+            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+            fmt_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
+            )
         # Download Vocoder
+        vocoder_ckpt_path = ""
+        if not downloaded_resources["vocoder"]:
+            local_dir = snapshot_download(
+                repo_id="amphion/Vevo",
+                repo_type="model",
+                cache_dir="./ckpts/Vevo",
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            downloaded_resources["vocoder"] = True
+            print("Downloaded Vocoder")
+        else:
+            print("Vocoder already downloaded, skipping...")
+            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+            vocoder_ckpt_path = os.path.join(
+                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
+            )
         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
         traceback.print_exc()
         raise e
+# Download all required model resources at program startup.
+# Each resource is fetched only when its flag in downloaded_resources is still
+# False, and the flag is set to True right after its download call returns.
+# NOTE(review): every local_dir returned by snapshot_download() below is
+# discarded. If get_pipeline() runs after this function, its "already
+# downloaded" branches build checkpoint paths from the hard-coded prefix
+# "./ckpts/Vevo/snapshots/amphion/Vevo" instead -- verify that this prefix
+# matches the directory snapshot_download() actually returns for this cache_dir.
+def preload_all_resources():
+    print("预加载所有模型资源...")
+    # Download the config files
+    setup_configs()
+    # Download the Content Tokenizer (vq32)
+    if not downloaded_resources["tokenizer_vq32"]:
+        print("预下载 Content Tokenizer (vq32)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["tokenizer/vq32/*"],
+        )
+        downloaded_resources["tokenizer_vq32"] = True
+        print("Content Tokenizer (vq32) 下载完成")
+    # Download the Content-Style Tokenizer (vq8192)
+    if not downloaded_resources["tokenizer_vq8192"]:
+        print("预下载 Content-Style Tokenizer (vq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["tokenizer/vq8192/*"],
+        )
+        downloaded_resources["tokenizer_vq8192"] = True
+        print("Content-Style Tokenizer (vq8192) 下载完成")
+    # Download the Autoregressive Transformer (Vq32ToVq8192)
+    if not downloaded_resources["ar_Vq32ToVq8192"]:
+        print("预下载 Autoregressive Transformer (Vq32ToVq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
+        )
+        downloaded_resources["ar_Vq32ToVq8192"] = True
+        print("Autoregressive Transformer (Vq32ToVq8192) 下载完成")
+    # Download the Autoregressive Transformer (PhoneToVq8192)
+    if not downloaded_resources["ar_PhoneToVq8192"]:
+        print("预下载 Autoregressive Transformer (PhoneToVq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
+        )
+        downloaded_resources["ar_PhoneToVq8192"] = True
+        print("Autoregressive Transformer (PhoneToVq8192) 下载完成")
+    # Download the Flow Matching Transformer
+    if not downloaded_resources["fmt_Vq8192ToMels"]:
+        print("预下载 Flow Matching Transformer (Vq8192ToMels)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+        )
+        downloaded_resources["fmt_Vq8192ToMels"] = True
+        print("Flow Matching Transformer (Vq8192ToMels) 下载完成")
+    # Download the Vocoder
+    if not downloaded_resources["vocoder"]:
+        print("预下载 Vocoder...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["acoustic_modeling/Vocoder/*"],
+        )
+        downloaded_resources["vocoder"] = True
+        print("Vocoder 下载完成")
+    print("所有模型资源预加载完成！")
+# Preload all resources before creating the Gradio interface
+preload_all_resources()
 # Create Gradio interface
 with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
     gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")