Spaces: Running on Zero
积极的屁孩 committed · Commit 9755f3f · Parent(s): 2fc31e9
fix path
app.py
CHANGED
@@ -203,15 +203,124 @@ print(f"Using device: {device}")
 # Initialize pipeline dictionary
 inference_pipelines = {}

+# Download all necessary model resources at startup
+def preload_all_resources():
+    print("Preloading all model resources...")
+    # Download configuration files
+    setup_configs()
+
+    # Store the downloaded model paths
+    global downloaded_content_tokenizer_path
+    global downloaded_content_style_tokenizer_path
+    global downloaded_ar_vq32_path
+    global downloaded_ar_phone_path
+    global downloaded_fmt_path
+    global downloaded_vocoder_path
+
+    # Download Content Tokenizer (vq32)
+    if not downloaded_resources["tokenizer_vq32"]:
+        print("Preloading Content Tokenizer (vq32)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["tokenizer/vq32/*"],
+        )
+        downloaded_content_tokenizer_path = local_dir
+        downloaded_resources["tokenizer_vq32"] = True
+        print("Content Tokenizer (vq32) download completed")
+
+    # Download Content-Style Tokenizer (vq8192)
+    if not downloaded_resources["tokenizer_vq8192"]:
+        print("Preloading Content-Style Tokenizer (vq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["tokenizer/vq8192/*"],
+        )
+        downloaded_content_style_tokenizer_path = local_dir
+        downloaded_resources["tokenizer_vq8192"] = True
+        print("Content-Style Tokenizer (vq8192) download completed")
+
+    # Download Autoregressive Transformer (Vq32ToVq8192)
+    if not downloaded_resources["ar_Vq32ToVq8192"]:
+        print("Preloading Autoregressive Transformer (Vq32ToVq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
+        )
+        downloaded_ar_vq32_path = local_dir
+        downloaded_resources["ar_Vq32ToVq8192"] = True
+        print("Autoregressive Transformer (Vq32ToVq8192) download completed")
+
+    # Download Autoregressive Transformer (PhoneToVq8192)
+    if not downloaded_resources["ar_PhoneToVq8192"]:
+        print("Preloading Autoregressive Transformer (PhoneToVq8192)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
+        )
+        downloaded_ar_phone_path = local_dir
+        downloaded_resources["ar_PhoneToVq8192"] = True
+        print("Autoregressive Transformer (PhoneToVq8192) download completed")
+
+    # Download Flow Matching Transformer
+    if not downloaded_resources["fmt_Vq8192ToMels"]:
+        print("Preloading Flow Matching Transformer (Vq8192ToMels)...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+        )
+        downloaded_fmt_path = local_dir
+        downloaded_resources["fmt_Vq8192ToMels"] = True
+        print("Flow Matching Transformer (Vq8192ToMels) download completed")
+
+    # Download Vocoder
+    if not downloaded_resources["vocoder"]:
+        print("Preloading Vocoder...")
+        local_dir = snapshot_download(
+            repo_id="amphion/Vevo",
+            repo_type="model",
+            cache_dir="./ckpts/Vevo",
+            allow_patterns=["acoustic_modeling/Vocoder/*"],
+        )
+        downloaded_vocoder_path = local_dir
+        downloaded_resources["vocoder"] = True
+        print("Vocoder download completed")
+
+    print("All model resources preloading completed!")
+
+# Initialize path variables to store downloaded model paths
+downloaded_content_tokenizer_path = None
+downloaded_content_style_tokenizer_path = None
+downloaded_ar_vq32_path = None
+downloaded_ar_phone_path = None
+downloaded_fmt_path = None
+downloaded_vocoder_path = None
+
+# Preload all resources before creating the Gradio interface
+preload_all_resources()
+
 def get_pipeline(pipeline_type):
     if pipeline_type in inference_pipelines:
         return inference_pipelines[pipeline_type]

     # Initialize pipeline based on the required pipeline type
     if pipeline_type == "style" or pipeline_type == "voice":
-        # …
-        # …
-        if not downloaded_resources["tokenizer_vq32"]:
+        # Use already downloaded Content Tokenizer
+        if downloaded_resources["tokenizer_vq32"]:
+            content_tokenizer_ckpt_path = os.path.join(
+                downloaded_content_tokenizer_path, "tokenizer/vq32/hubert_large_l18_c32.pkl"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
@@ -221,17 +330,14 @@ def get_pipeline(pipeline_type):
             content_tokenizer_ckpt_path = os.path.join(
                 local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
             )
-            downloaded_resources["tokenizer_vq32"] = True
-            print("Downloaded Content Tokenizer (vq32)")
-        else:
-            print("Content Tokenizer (vq32) already downloaded, skipping...")
-            content_tokenizer_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq32/hubert_large_l18_c32.pkl"
-            )

-        # …
-        # …
-        if not downloaded_resources["tokenizer_vq8192"]:
+        # Use already downloaded Content-Style Tokenizer
+        if downloaded_resources["tokenizer_vq8192"]:
+            content_style_tokenizer_ckpt_path = os.path.join(
+                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
@@ -239,73 +345,54 @@ def get_pipeline(pipeline_type):
                 allow_patterns=["tokenizer/vq8192/*"],
             )
             content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-            downloaded_resources["tokenizer_vq8192"] = True
-            print("Downloaded Content-Style Tokenizer (vq8192)")
-        else:
-            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
-            content_style_tokenizer_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
-            )

-        # …
-        # …
-        if not downloaded_resources["ar_Vq32ToVq8192"]:
+        # Use already downloaded Autoregressive Transformer
+        ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
+        if downloaded_resources["ar_Vq32ToVq8192"]:
+            ar_ckpt_path = os.path.join(
+                downloaded_ar_vq32_path, "contentstyle_modeling/Vq32ToVq8192"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
             )
-            ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
             ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
-            downloaded_resources["ar_Vq32ToVq8192"] = True
-            print("Downloaded Autoregressive Transformer (Vq32ToVq8192)")
-        else:
-            print("Autoregressive Transformer (Vq32ToVq8192) already downloaded, skipping...")
-            ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
-            ar_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/Vq32ToVq8192"
-            )

-        # …
-        # …
-        if not downloaded_resources["fmt_Vq8192ToMels"]:
+        # Use already downloaded Flow Matching Transformer
+        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+        if downloaded_resources["fmt_Vq8192ToMels"]:
+            fmt_ckpt_path = os.path.join(
+                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
             )
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
             fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-            downloaded_resources["fmt_Vq8192ToMels"] = True
-            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
-        else:
-            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-            fmt_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
-            )

-        # …
-        # …
-        if not downloaded_resources["vocoder"]:
+        # Use already downloaded Vocoder
+        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+        if downloaded_resources["vocoder"]:
+            vocoder_ckpt_path = os.path.join(
+                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vocoder/*"],
             )
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
             vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-            downloaded_resources["vocoder"] = True
-            print("Downloaded Vocoder")
-        else:
-            print("Vocoder already downloaded, skipping...")
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-            vocoder_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
-            )

         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -321,9 +408,13 @@ def get_pipeline(pipeline_type):
         )

     elif pipeline_type == "timbre":
-        # …
-        # …
-        if not downloaded_resources["tokenizer_vq8192"]:
+        # Use already downloaded Content-Style Tokenizer
+        if downloaded_resources["tokenizer_vq8192"]:
+            content_style_tokenizer_ckpt_path = os.path.join(
+                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
@@ -331,53 +422,38 @@ def get_pipeline(pipeline_type):
                 allow_patterns=["tokenizer/vq8192/*"],
             )
             content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-            downloaded_resources["tokenizer_vq8192"] = True
-            print("Downloaded Content-Style Tokenizer (vq8192)")
-        else:
-            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
-            content_style_tokenizer_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
-            )

-        # …
-        # …
-        if not downloaded_resources["fmt_Vq8192ToMels"]:
+        # Use already downloaded Flow Matching Transformer
+        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+        if downloaded_resources["fmt_Vq8192ToMels"]:
+            fmt_ckpt_path = os.path.join(
+                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
             )
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
             fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-            downloaded_resources["fmt_Vq8192ToMels"] = True
-            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
-        else:
-            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-            fmt_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
-            )

-        # …
-        # …
-        if not downloaded_resources["vocoder"]:
+        # Use already downloaded Vocoder
+        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+        if downloaded_resources["vocoder"]:
+            vocoder_ckpt_path = os.path.join(
+                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vocoder/*"],
             )
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
             vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-            downloaded_resources["vocoder"] = True
-            print("Downloaded Vocoder")
-        else:
-            print("Vocoder already downloaded, skipping...")
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-            vocoder_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
-            )

         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -390,9 +466,13 @@ def get_pipeline(pipeline_type):
         )

     elif pipeline_type == "tts":
-        # …
-        # …
-        if not downloaded_resources["tokenizer_vq8192"]:
+        # Use already downloaded Content-Style Tokenizer
+        if downloaded_resources["tokenizer_vq8192"]:
+            content_style_tokenizer_ckpt_path = os.path.join(
+                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
@@ -400,73 +480,54 @@ def get_pipeline(pipeline_type):
                 allow_patterns=["tokenizer/vq8192/*"],
             )
             content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-            downloaded_resources["tokenizer_vq8192"] = True
-            print("Downloaded Content-Style Tokenizer (vq8192)")
-        else:
-            print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
-            content_style_tokenizer_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
-            )

-        # …
-        # …
-        if not downloaded_resources["ar_PhoneToVq8192"]:
+        # Use already downloaded Autoregressive Transformer (TTS specific)
+        ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
+        if downloaded_resources["ar_PhoneToVq8192"]:
+            ar_ckpt_path = os.path.join(
+                downloaded_ar_phone_path, "contentstyle_modeling/PhoneToVq8192"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
             )
-            ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
             ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
-            downloaded_resources["ar_PhoneToVq8192"] = True
-            print("Downloaded Autoregressive Transformer (PhoneToVq8192)")
-        else:
-            print("Autoregressive Transformer (PhoneToVq8192) already downloaded, skipping...")
-            ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
-            ar_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/PhoneToVq8192"
-            )

-        # …
-        # …
-        if not downloaded_resources["fmt_Vq8192ToMels"]:
+        # Use already downloaded Flow Matching Transformer
+        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+        if downloaded_resources["fmt_Vq8192ToMels"]:
+            fmt_ckpt_path = os.path.join(
+                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
             )
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
             fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-            downloaded_resources["fmt_Vq8192ToMels"] = True
-            print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
-        else:
-            print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
-            fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-            fmt_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
-            )

-        # …
-        # …
-        if not downloaded_resources["vocoder"]:
+        # Use already downloaded Vocoder
+        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+        if downloaded_resources["vocoder"]:
+            vocoder_ckpt_path = os.path.join(
+                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
+            )
+        else:
+            # Fallback to direct download
             local_dir = snapshot_download(
                 repo_id="amphion/Vevo",
                 repo_type="model",
                 cache_dir="./ckpts/Vevo",
                 allow_patterns=["acoustic_modeling/Vocoder/*"],
             )
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
             vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-            downloaded_resources["vocoder"] = True
-            print("Downloaded Vocoder")
-        else:
-            print("Vocoder already downloaded, skipping...")
-            vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-            vocoder_ckpt_path = os.path.join(
-                "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
-            )

         # Initialize pipeline
         inference_pipeline = VevoInferencePipeline(
@@ -895,98 +956,6 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
         traceback.print_exc()
         raise e

-# 在程序启动时下载所有需要的模型资源
-# Download all necessary model resources at startup
-def preload_all_resources():
-    print("Preloading all model resources...")
-    # 下载配置文件
-    # Download configuration files
-    setup_configs()
-
-    # 下载Content Tokenizer (vq32)
-    # Download Content Tokenizer (vq32)
-    if not downloaded_resources["tokenizer_vq32"]:
-        print("Preloading Content Tokenizer (vq32)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq32/*"],
-        )
-        downloaded_resources["tokenizer_vq32"] = True
-        print("Content Tokenizer (vq32) download completed")
-
-    # 下载Content-Style Tokenizer (vq8192)
-    # Download Content-Style Tokenizer (vq8192)
-    if not downloaded_resources["tokenizer_vq8192"]:
-        print("Preloading Content-Style Tokenizer (vq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq8192/*"],
-        )
-        downloaded_resources["tokenizer_vq8192"] = True
-        print("Content-Style Tokenizer (vq8192) download completed")
-
-    # 下载Autoregressive Transformer (Vq32ToVq8192)
-    # Download Autoregressive Transformer (Vq32ToVq8192)
-    if not downloaded_resources["ar_Vq32ToVq8192"]:
-        print("Preloading Autoregressive Transformer (Vq32ToVq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
-        )
-        downloaded_resources["ar_Vq32ToVq8192"] = True
-        print("Autoregressive Transformer (Vq32ToVq8192) download completed")
-
-    # 下载Autoregressive Transformer (PhoneToVq8192)
-    # Download Autoregressive Transformer (PhoneToVq8192)
-    if not downloaded_resources["ar_PhoneToVq8192"]:
-        print("Preloading Autoregressive Transformer (PhoneToVq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
-        )
-        downloaded_resources["ar_PhoneToVq8192"] = True
-        print("Autoregressive Transformer (PhoneToVq8192) download completed")
-
-    # 下载Flow Matching Transformer
-    # Download Flow Matching Transformer
-    if not downloaded_resources["fmt_Vq8192ToMels"]:
-        print("Preloading Flow Matching Transformer (Vq8192ToMels)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-        )
-        downloaded_resources["fmt_Vq8192ToMels"] = True
-        print("Flow Matching Transformer (Vq8192ToMels) download completed")
-
-    # 下载Vocoder
-    # Download Vocoder
-    if not downloaded_resources["vocoder"]:
-        print("Preloading Vocoder...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vocoder/*"],
-        )
-        downloaded_resources["vocoder"] = True
-        print("Vocoder download completed")
-
-    print("All model resources preloading completed!")
-
-# 在创建Gradio界面之前预加载所有资源
-# Preload all resources before creating the Gradio interface
-preload_all_resources()
-
 # Create Gradio interface
 with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
     gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
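In short, the commit downloads every model artifact once at startup, records the snapshot directory that snapshot_download returns, and has get_pipeline join checkpoint paths against that recorded directory instead of a hand-built "./ckpts/Vevo/snapshots/amphion/Vevo" string, which appears to be the path issue the "fix path" message refers to. Below is a minimal sketch of the same preload-then-reuse pattern; the repo id and file patterns mirror the diff above, but the helper names and the single-dictionary bookkeeping are illustrative, not the Space's actual code.

# Sketch only: assumes huggingface_hub is installed; helper names are hypothetical.
import os
from huggingface_hub import snapshot_download

_preloaded = {}  # resource key -> snapshot directory returned by snapshot_download

def preload(key, patterns):
    # Download the matching files once and remember the real snapshot path.
    if key not in _preloaded:
        _preloaded[key] = snapshot_download(
            repo_id="amphion/Vevo",
            repo_type="model",
            cache_dir="./ckpts/Vevo",
            allow_patterns=patterns,
        )
    return _preloaded[key]

def content_style_tokenizer_path():
    # Join against the recorded snapshot directory rather than guessing the
    # on-disk cache layout under ./ckpts/Vevo.
    local_dir = preload("tokenizer_vq8192", ["tokenizer/vq8192/*"])
    return os.path.join(local_dir, "tokenizer/vq8192")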
|