Spaces:

amphion
/

Vevo

Running on Zero

App Files Files Community

积极的屁孩 committed 9 days ago

Commit

3b944a1

1 Parent(s): defde46

cn -> en

Browse files

Files changed (1) hide show

app.py +135 -135

app.py CHANGED Viewed

@@ -13,67 +13,67 @@ import re
 import spaces
 def install_espeak():
-    """检测并安装espeak-ng依赖"""
     try:
-        # 检查espeak-ng是否已安装
         result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
         if result.returncode != 0:
-            print("检测到系统中未安装espeak-ng，正在尝试安装...")
-            # 尝试使用apt-get安装espeak-ng及其数据
             subprocess.run(["apt-get", "update"], check=True)
-            # 安装 espeak-ng 和对应的语言数据包
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
-            print("espeak-ng及其数据包安装成功！")
         else:
-            print("espeak-ng已安装在系统中。")
-            # 即使已安装，也尝试更新数据确保完整性 (可选，但有时有帮助)
-            # print("尝试更新 espeak-ng 数据...")
             # subprocess.run(["apt-get", "update"], check=True)
             # subprocess.run(["apt-get", "install", "--only-upgrade", "-y", "espeak-ng-data"], check=True)
-        # 验证中文支持 (可选)
         try:
             voices_result = subprocess.run(["espeak-ng", "--voices=cmn"], capture_output=True, text=True, check=True)
             if "cmn" in voices_result.stdout:
-                print("espeak-ng 支持 'cmn' 语言。")
             else:
-                print("警告：espeak-ng 安装了，但 'cmn' 语言似乎仍不可用。")
         except Exception as e:
-             print(f"验证 espeak-ng 中文支持时出错（可能不影响功能）: {e}")
     except Exception as e:
-        print(f"安装espeak-ng时出错: {e}")
-        print("请尝试手动运行: apt-get update && apt-get install -y espeak-ng espeak-ng-data")
-# 在所有其他操作之前安装espeak
 install_espeak()
 def patch_langsegment_init():
     try:
-        # 尝试找到 LangSegment 包的位置
         spec = importlib.util.find_spec("LangSegment")
         if spec is None or spec.origin is None:
-            print("无法定位 LangSegment 包。")
             return
-        # 构建 __init__.py 的路径
         init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
         if not os.path.exists(init_path):
-            print(f"未找到 LangSegment 的 __init__.py 文件于: {init_path}")
-            # 尝试在 site-packages 中查找，适用于某些环境
             for site_pkg_path in site.getsitepackages():
                 potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
                 if os.path.exists(potential_path):
                     init_path = potential_path
-                    print(f"在 site-packages 中找到 __init__.py: {init_path}")
                     break
-            else: # 如果循环正常结束（没有 break）
-                 print(f"在 site-packages 中也未找到 __init__.py")
                  return
-        print(f"尝试读取 LangSegment __init__.py: {init_path}")
         with open(init_path, 'r') as f:
             lines = f.readlines()
@@ -85,52 +85,52 @@ def patch_langsegment_init():
             stripped_line = line.strip()
             if stripped_line.startswith(target_line_prefix):
                 if 'setLangfilters' in stripped_line or 'getLangfilters' in stripped_line:
-                    print(f"发现需要修改的行: {stripped_line}")
-                    # 移除 setLangfilters 和 getLangfilters
                     modified_line = stripped_line.replace(',setLangfilters', '')
                     modified_line = modified_line.replace(',getLangfilters', '')
-                    # 确保逗号处理正确 (例如，如果它们是末尾的项)
                     modified_line = modified_line.replace('setLangfilters,', '')
                     modified_line = modified_line.replace('getLangfilters,', '')
-                    # 如果它们是唯一的额外导入，移除可能多余的逗号
                     modified_line = modified_line.rstrip(',')
                     new_lines.append(modified_line + '\n')
                     modified = True
-                    print(f"修改后的行: {modified_line.strip()}")
                 else:
-                    new_lines.append(line) # 行没问题，保留原样
             else:
-                new_lines.append(line) # 非目标行，保留原样
         if modified:
-            print(f"尝试写回已修改的 LangSegment __init__.py 到: {init_path}")
             try:
                 with open(init_path, 'w') as f:
                     f.writelines(new_lines)
-                print("LangSegment __init__.py 修改成功。")
-                # 尝试重新加载模块以使更改生效（可能无效，取决于导入链）
                 try:
                     import LangSegment
                     importlib.reload(LangSegment)
-                    print("LangSegment 模块已尝试重新加载。")
                 except Exception as reload_e:
-                     print(f"重新加载 LangSegment 时出错（可能无影响）: {reload_e}")
             except PermissionError:
-                print(f"错误：权限不足，无法修改 {init_path}。请考虑修改 requirements.txt。")
             except Exception as write_e:
-                print(f"写入 LangSegment __init__.py 时发生其他错误: {write_e}")
         else:
-            print("LangSegment __init__.py 无需修改。")
     except ImportError:
-         print("未找到 LangSegment 包，无法进行修复。")
     except Exception as e:
-        print(f"修复 LangSegment 包时发生意外错误: {e}")
-# 在所有其他导入（尤其是可能触发 LangSegment 导入的 Amphion）之前执行修复
 patch_langsegment_init()
-# 克隆Amphion仓库
 if not os.path.exists("Amphion"):
     subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
     os.chdir("Amphion")
@@ -138,17 +138,17 @@ else:
     if not os.getcwd().endswith("Amphion"):
         os.chdir("Amphion")
-# 将Amphion加入到路径中
 if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
     sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
-# 确保需要的目录存在
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav
-# 下载和设置配置文件
 def setup_configs():
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
@@ -171,27 +171,27 @@ def setup_configs():
                     repo_type="model",
                 )
                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
-                # 拷贝文件到目标位置
                 subprocess.run(["cp", file_data, file_path])
             except Exception as e:
-                print(f"下载配置文件 {file} 时出错: {e}")
 setup_configs()
-# 设备配置
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-print(f"使用设备: {device}")
-# 初始化管道字典
 inference_pipelines = {}
 def get_pipeline(pipeline_type):
     if pipeline_type in inference_pipelines:
         return inference_pipelines[pipeline_type]
-    # 根据需要的管道类型初始化
     if pipeline_type == "style" or pipeline_type == "voice":
-        # 下载Content Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -202,7 +202,7 @@ def get_pipeline(pipeline_type):
             local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
         )
-        # 下载Content-Style Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -211,7 +211,7 @@ def get_pipeline(pipeline_type):
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-        # 下载Autoregressive Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -221,7 +221,7 @@ def get_pipeline(pipeline_type):
         ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
         ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
-        # 下载Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -231,7 +231,7 @@ def get_pipeline(pipeline_type):
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-        # 下载Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -241,7 +241,7 @@ def get_pipeline(pipeline_type):
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-        # 初始化管道
         inference_pipeline = VevoInferencePipeline(
             content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
@@ -255,7 +255,7 @@ def get_pipeline(pipeline_type):
         )
     elif pipeline_type == "timbre":
-        # 下载Content-Style Tokenizer (仅timbre需要)
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -264,7 +264,7 @@ def get_pipeline(pipeline_type):
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-        # 下载Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -274,7 +274,7 @@ def get_pipeline(pipeline_type):
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-        # 下载Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -284,7 +284,7 @@ def get_pipeline(pipeline_type):
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-        # 初始化管道
         inference_pipeline = VevoInferencePipeline(
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
             fmt_cfg_path=fmt_cfg_path,
@@ -295,7 +295,7 @@ def get_pipeline(pipeline_type):
         )
     elif pipeline_type == "tts":
-        # 下载Content-Style Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -304,7 +304,7 @@ def get_pipeline(pipeline_type):
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-        # 下载Autoregressive Transformer (TTS特有)
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -314,7 +314,7 @@ def get_pipeline(pipeline_type):
         ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
         ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
-        # 下载Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -324,7 +324,7 @@ def get_pipeline(pipeline_type):
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-        # 下载Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
@@ -334,7 +334,7 @@ def get_pipeline(pipeline_type):
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-        # 初始化管道
         inference_pipeline = VevoInferencePipeline(
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
             ar_cfg_path=ar_cfg_path,
@@ -346,33 +346,33 @@ def get_pipeline(pipeline_type):
             device=device,
         )
-    # 缓存管道实例
     inference_pipelines[pipeline_type] = inference_pipeline
     return inference_pipeline
-# 实现VEVO功能函数
 @spaces.GPU()
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
     output_path = "wav/output_vevostyle.wav"
-    # 检查并处理音频数据
     if content_wav is None or style_wav is None:
         raise ValueError("Please upload audio files")
-    # 处理音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
-        # 确保是单声道
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
-        # 重采样到24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
@@ -380,7 +380,7 @@ def vevo_style(content_wav, style_wav):
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
@@ -390,11 +390,11 @@ def vevo_style(content_wav, style_wav):
     else:
         style_sr, style_data = style_wav
-    # 确保是单声道
     if len(style_data.shape) > 1 and style_data.shape[1] > 1:
         style_data = np.mean(style_data, axis=1)
-    # 重采样到24kHz
     if style_sr != 24000:
         style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
         style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
@@ -402,22 +402,22 @@ def vevo_style(content_wav, style_wav):
     else:
         style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-    # 归一化音量
     style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
-    # 打印debug信息
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
-    # 保存音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_style_path, style_tensor, style_sr)
     try:
-        # 获取管道
         pipeline = get_pipeline("style")
-        # 推理
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
@@ -425,14 +425,14 @@ def vevo_style(content_wav, style_wav):
             timbre_ref_wav_path=temp_content_path,
         )
-        # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-        # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
@@ -448,22 +448,22 @@ def vevo_timbre(content_wav, reference_wav):
     temp_reference_path = "wav/temp_reference.wav"
     output_path = "wav/output_vevotimbre.wav"
-    # 检查并处理音频数据
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
-    # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
-        # 确保是单声道
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
-        # 重采样到24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
@@ -471,23 +471,23 @@ def vevo_timbre(content_wav, reference_wav):
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
-    # 处理参考音频格式
     if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
         if isinstance(reference_wav[0], np.ndarray):
             reference_data, reference_sr = reference_wav
         else:
             reference_sr, reference_data = reference_wav
-        # 确保是单声道
         if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
             reference_data = np.mean(reference_data, axis=1)
-        # 重采样到24kHz
         if reference_sr != 24000:
             reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
             reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
@@ -495,38 +495,38 @@ def vevo_timbre(content_wav, reference_wav):
         else:
             reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
-        # 归一化音量
         reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid reference audio format")
-    # 打印debug信息
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
-    # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
     try:
-        # 获取管道
         pipeline = get_pipeline("timbre")
-        # 推理
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
             flow_matching_steps=32,
         )
-        # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-        # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
@@ -543,22 +543,22 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevovoice.wav"
-    # 检查并处理音频数据
     if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
         raise ValueError("Please upload all required audio files")
-    # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
-        # 确保是单声道
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
-        # 重采样到24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
@@ -566,23 +566,23 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
-    # 处理风格参考音频格式
     if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
         if isinstance(style_reference_wav[0], np.ndarray):
             style_data, style_sr = style_reference_wav
         else:
             style_sr, style_data = style_reference_wav
-        # 确保是单声道
         if len(style_data.shape) > 1 and style_data.shape[1] > 1:
             style_data = np.mean(style_data, axis=1)
-        # 重采样到24kHz
         if style_sr != 24000:
             style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
             style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
@@ -590,23 +590,23 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
         else:
             style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-        # 归一化音量
         style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid style reference audio format")
-    # 处理音色参考音频格式
     if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
         if isinstance(timbre_reference_wav[0], np.ndarray):
             timbre_data, timbre_sr = timbre_reference_wav
         else:
             timbre_sr, timbre_data = timbre_reference_wav
-        # 确保是单声道
         if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
             timbre_data = np.mean(timbre_data, axis=1)
-        # 重采样到24kHz
         if timbre_sr != 24000:
             timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
             timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
@@ -614,26 +614,26 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
         else:
             timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-        # 归一化音量
         timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid timbre reference audio format")
-    # 打印debug信息
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
     print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
-    # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_style_path, style_tensor, style_sr)
     torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
     try:
-        # 获取管道
         pipeline = get_pipeline("voice")
-        # 推理
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
@@ -641,14 +641,14 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
             timbre_ref_wav_path=temp_timbre_path,
         )
-        # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-        # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
@@ -664,22 +664,22 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
     temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevotts.wav"
-    # 检查并处理音频数据
     if ref_wav is None:
         raise ValueError("Please upload a reference audio file")
-    # 处理参考音频格式
     if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
         if isinstance(ref_wav[0], np.ndarray):
             ref_data, ref_sr = ref_wav
         else:
             ref_sr, ref_data = ref_wav
-        # 确保是单声道
         if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
             ref_data = np.mean(ref_data, axis=1)
-        # 重采样到24kHz
         if ref_sr != 24000:
             ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
@@ -687,17 +687,17 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         else:
             ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
-        # 归一化音量
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid reference audio format")
-    # 打印debug信息
     print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
     if style_ref_text:
         print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
-    # 保存上传的音频
     torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
     if timbre_ref_wav is not None:
@@ -707,11 +707,11 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
             else:
                 timbre_sr, timbre_data = timbre_ref_wav
-            # 确保是单声道
             if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
                 timbre_data = np.mean(timbre_data, axis=1)
-            # 重采样到24kHz
             if timbre_sr != 24000:
                 timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
                 timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
@@ -719,7 +719,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
             else:
                 timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-            # 归一化音量
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
             print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
@@ -730,10 +730,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         temp_timbre_path = temp_ref_path
     try:
-        # 获取管道
         pipeline = get_pipeline("tts")
-        # 推理
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=None,
             src_text=text,
@@ -744,14 +744,14 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
             style_ref_wav_text_language=style_ref_text_language,
         )
-        # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-        # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
@@ -761,10 +761,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         traceback.print_exc()
         raise e
-# 创建Gradio界面
 with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
     gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
-    # 添加链接标签行
     with gr.Row(elem_id="links_row"):
         gr.HTML("""
         <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
@@ -850,5 +850,5 @@ with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Sup
     For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
     """)
-# 启动应用
 demo.launch()

 import spaces
 def install_espeak():
+    """Detect the espeak-ng binary and install it via apt-get when it is missing.
+
+    Best-effort helper: any exception raised while checking, installing or
+    verifying is caught and printed instead of being propagated, so the
+    caller keeps running even if espeak-ng could not be set up.
+    Returns nothing.
+    """
     try:
+        # Check if espeak-ng is already installed; `which` exits non-zero when the binary is not found
         result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
         if result.returncode != 0:
+            print("Detected espeak-ng not installed in the system, attempting to install...")
+            # Refresh the apt package index first; check=True raises on failure so the outer handler reports it
             subprocess.run(["apt-get", "update"], check=True)
+            # Install espeak-ng and the corresponding language data package
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
+            print("espeak-ng and its data packages installed successfully!")
         else:
+            print("espeak-ng is already installed in the system.")
+            # Disabled: optional upgrade of the espeak-ng data package when espeak-ng is already present
+            # print("Attempting to update espeak-ng data...")
             # subprocess.run(["apt-get", "update"], check=True)
             # subprocess.run(["apt-get", "install", "--only-upgrade", "-y", "espeak-ng-data"], check=True)
+        # Verify Chinese ('cmn') voice support; failures here are only printed, never raised
         try:
             voices_result = subprocess.run(["espeak-ng", "--voices=cmn"], capture_output=True, text=True, check=True)
             if "cmn" in voices_result.stdout:
+                print("espeak-ng supports 'cmn' language.")
             else:
+                print("Warning: espeak-ng is installed, but 'cmn' language still seems unavailable.")
         except Exception as e:
+             print(f"Error verifying espeak-ng Chinese support (may not affect functionality): {e}")
     except Exception as e:
+        print(f"Error installing espeak-ng: {e}")
+        print("Please try to run manually: apt-get update && apt-get install -y espeak-ng espeak-ng-data")
+# Install espeak before all other operations
 install_espeak()
 def patch_langsegment_init():
     try:
+        # Try to find the location of the LangSegment package
         spec = importlib.util.find_spec("LangSegment")
         if spec is None or spec.origin is None:
+            print("Unable to locate LangSegment package.")
             return
+        # Build the path to __init__.py
         init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
         if not os.path.exists(init_path):
+            print(f"LangSegment __init__.py file not found at: {init_path}")
+            # Try to find in site-packages, applicable in some environments
             for site_pkg_path in site.getsitepackages():
                 potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
                 if os.path.exists(potential_path):
                     init_path = potential_path
+                    print(f"Found __init__.py in site-packages: {init_path}")
                     break
+            else: # If the loop ends normally (no break)
+                 print(f"Also unable to find __init__.py in site-packages")
                  return
+        print(f"Attempting to read LangSegment __init__.py: {init_path}")
         with open(init_path, 'r') as f:
             lines = f.readlines()
             stripped_line = line.strip()
             if stripped_line.startswith(target_line_prefix):
                 if 'setLangfilters' in stripped_line or 'getLangfilters' in stripped_line:
+                    print(f"Found line that needs modification: {stripped_line}")
+                    # Remove setLangfilters and getLangfilters
                     modified_line = stripped_line.replace(',setLangfilters', '')
                     modified_line = modified_line.replace(',getLangfilters', '')
+                    # Ensure comma handling is correct (e.g., if they are the last items)
                     modified_line = modified_line.replace('setLangfilters,', '')
                     modified_line = modified_line.replace('getLangfilters,', '')
+                    # If they are the only extra imports, remove any redundant commas
                     modified_line = modified_line.rstrip(',')
                     new_lines.append(modified_line + '\n')
                     modified = True
+                    print(f"Modified line: {modified_line.strip()}")
                 else:
+                    new_lines.append(line) # Line is fine, keep as is
             else:
+                new_lines.append(line) # Non-target line, keep as is
         if modified:
+            print(f"Attempting to write back modified LangSegment __init__.py to: {init_path}")
             try:
                 with open(init_path, 'w') as f:
                     f.writelines(new_lines)
+                print("LangSegment __init__.py modified successfully.")
+                # Try to reload the module to make changes effective (may not work, depending on import chain)
                 try:
                     import LangSegment
                     importlib.reload(LangSegment)
+                    print("LangSegment module has been attempted to reload.")
                 except Exception as reload_e:
+                     print(f"Error reloading LangSegment (may have no impact): {reload_e}")
             except PermissionError:
+                print(f"Error: Insufficient permissions to modify {init_path}. Consider modifying requirements.txt.")
             except Exception as write_e:
+                print(f"Other error occurred when writing LangSegment __init__.py: {write_e}")
         else:
+            print("LangSegment __init__.py doesn't need modification.")
     except ImportError:
+         print("LangSegment package not found, unable to fix.")
     except Exception as e:
+        print(f"Unexpected error occurred when fixing LangSegment package: {e}")
+# Execute the fix before all other imports (especially Amphion) that might trigger LangSegment
 patch_langsegment_init()
+# Clone Amphion repository
 if not os.path.exists("Amphion"):
     subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
     os.chdir("Amphion")
     if not os.getcwd().endswith("Amphion"):
         os.chdir("Amphion")
+# Add Amphion to the path
 if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
     sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
+# Ensure needed directories exist
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav
+# Download and setup config files
 def setup_configs():
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
                     repo_type="model",
                 )
                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                # Copy file to target location
                 subprocess.run(["cp", file_data, file_path])
             except Exception as e:
+                print(f"Error downloading config file {file}: {e}")
 setup_configs()
+# Device configuration
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+print(f"Using device: {device}")
+# Initialize pipeline dictionary
 inference_pipelines = {}
 def get_pipeline(pipeline_type):
     if pipeline_type in inference_pipelines:
         return inference_pipelines[pipeline_type]
+    # Initialize pipeline based on the required pipeline type
     if pipeline_type == "style" or pipeline_type == "voice":
+        # Download Content Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
             local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
         )
+        # Download Content-Style Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+        # Download Autoregressive Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
         ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
+        # Download Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+        # Download Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+        # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
             content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
         )
     elif pipeline_type == "timbre":
+        # Download Content-Style Tokenizer (only needed for timbre)
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+        # Download Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+        # Download Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+        # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
             fmt_cfg_path=fmt_cfg_path,
         )
     elif pipeline_type == "tts":
+        # Download Content-Style Tokenizer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         )
         content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+        # Download Autoregressive Transformer (TTS specific)
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
         ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
+        # Download Flow Matching Transformer
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
         fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+        # Download Vocoder
         local_dir = snapshot_download(
             repo_id="amphion/Vevo",
             repo_type="model",
         vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
         vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+        # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
             content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
             ar_cfg_path=ar_cfg_path,
             device=device,
         )
+    # Cache pipeline instance
     inference_pipelines[pipeline_type] = inference_pipeline
     return inference_pipeline
+# Vevo inference functions
 @spaces.GPU()
 def vevo_style(content_wav, style_wav):
     temp_content_path = "wav/temp_content.wav"
     temp_style_path = "wav/temp_style.wav"
     output_path = "wav/output_vevostyle.wav"
+    # Check and process audio data
     if content_wav is None or style_wav is None:
         raise ValueError("Please upload audio files")
+    # Process audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
+        # Ensure single channel
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
+        # Resample to 24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        # Normalize volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
     else:
         style_sr, style_data = style_wav
+    # Ensure single channel
     if len(style_data.shape) > 1 and style_data.shape[1] > 1:
         style_data = np.mean(style_data, axis=1)
+    # Resample to 24kHz
     if style_sr != 24000:
         style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
         style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
     else:
         style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
+    # Normalize volume
     style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
+    # Print debug information
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
+    # Save audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_style_path, style_tensor, style_sr)
     try:
+        # Get pipeline
         pipeline = get_pipeline("style")
+        # Inference
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
             timbre_ref_wav_path=temp_content_path,
         )
+        # Replace any NaN/Inf values in the generated audio
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
+        # Save generated audio
         save_audio(gen_audio, output_path=output_path)
         return output_path
     temp_reference_path = "wav/temp_reference.wav"
     output_path = "wav/output_vevotimbre.wav"
+    # Check and process audio data
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
+    # Process content audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
+        # Ensure single channel
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
+        # Resample to 24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        # Normalize volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
+    # Process reference audio format
     if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
         if isinstance(reference_wav[0], np.ndarray):
             reference_data, reference_sr = reference_wav
         else:
             reference_sr, reference_data = reference_wav
+        # Ensure single channel
         if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
             reference_data = np.mean(reference_data, axis=1)
+        # Resample to 24kHz
         if reference_sr != 24000:
             reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
             reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
         else:
             reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
+        # Normalize volume
         reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid reference audio format")
+    # Print debug information
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
+    # Save uploaded audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
     try:
+        # Get pipeline
         pipeline = get_pipeline("timbre")
+        # Inference
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
             flow_matching_steps=32,
         )
+        # Replace any NaN/Inf values in the generated audio
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
+        # Save generated audio
         save_audio(gen_audio, output_path=output_path)
         return output_path
     temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevovoice.wav"
+    # Check and process audio data
     if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
         raise ValueError("Please upload all required audio files")
+    # Process content audio format
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         if isinstance(content_wav[0], np.ndarray):
             content_data, content_sr = content_wav
         else:
             content_sr, content_data = content_wav
+        # Ensure single channel
         if len(content_data.shape) > 1 and content_data.shape[1] > 1:
             content_data = np.mean(content_data, axis=1)
+        # Resample to 24kHz
         if content_sr != 24000:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
         else:
             content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        # Normalize volume
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid content audio format")
+    # Process style reference audio format
     if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
         if isinstance(style_reference_wav[0], np.ndarray):
             style_data, style_sr = style_reference_wav
         else:
             style_sr, style_data = style_reference_wav
+        # Ensure single channel
         if len(style_data.shape) > 1 and style_data.shape[1] > 1:
             style_data = np.mean(style_data, axis=1)
+        # Resample to 24kHz
         if style_sr != 24000:
             style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
             style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
         else:
             style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
+        # Normalize volume
         style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid style reference audio format")
+    # Process timbre reference audio format
     if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
         if isinstance(timbre_reference_wav[0], np.ndarray):
             timbre_data, timbre_sr = timbre_reference_wav
         else:
             timbre_sr, timbre_data = timbre_reference_wav
+        # Ensure single channel
         if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
             timbre_data = np.mean(timbre_data, axis=1)
+        # Resample to 24kHz
         if timbre_sr != 24000:
             timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
             timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
         else:
             timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+        # Normalize volume
         timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid timbre reference audio format")
+    # Print debug information
     print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
     print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
     print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
+    # Save uploaded audio
     torchaudio.save(temp_content_path, content_tensor, content_sr)
     torchaudio.save(temp_style_path, style_tensor, style_sr)
     torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
     try:
+        # Get pipeline
         pipeline = get_pipeline("voice")
+        # Inference
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
             timbre_ref_wav_path=temp_timbre_path,
         )
+        # Replace any NaN/Inf values in the generated audio
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
+        # Save generated audio
         save_audio(gen_audio, output_path=output_path)
         return output_path
     temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevotts.wav"
+    # Check and process audio data
     if ref_wav is None:
         raise ValueError("Please upload a reference audio file")
+    # Process reference audio format
     if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
         if isinstance(ref_wav[0], np.ndarray):
             ref_data, ref_sr = ref_wav
         else:
             ref_sr, ref_data = ref_wav
+        # Ensure single channel
         if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
             ref_data = np.mean(ref_data, axis=1)
+        # Resample to 24kHz
         if ref_sr != 24000:
             ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
         else:
             ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
+        # Normalize volume
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
     else:
         raise ValueError("Invalid reference audio format")
+    # Print debug information
     print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
     if style_ref_text:
         print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
+    # Save uploaded audio
     torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
     if timbre_ref_wav is not None:
             else:
                 timbre_sr, timbre_data = timbre_ref_wav
+            # Ensure single channel
             if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
                 timbre_data = np.mean(timbre_data, axis=1)
+            # Resample to 24kHz
             if timbre_sr != 24000:
                 timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
                 timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
             else:
                 timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+            # Normalize volume
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
             print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
         temp_timbre_path = temp_ref_path
     try:
+        # Get pipeline
         pipeline = get_pipeline("tts")
+        # Inference
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=None,
             src_text=text,
             style_ref_wav_text_language=style_ref_text_language,
         )
+        # Replace any NaN/Inf values in the generated audio
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
         print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
+        # Save generated audio
         save_audio(gen_audio, output_path=output_path)
         return output_path
         traceback.print_exc()
         raise e
+# Create Gradio interface
 with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
     gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
+    # Add a row of links
     with gr.Row(elem_id="links_row"):
         gr.HTML("""
         <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
     For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
     """)
+# Launch application
 demo.launch()