积极的屁孩 committed on
Commit
9755f3f
·
1 Parent(s): 2fc31e9
Files changed (1) hide show
  1. app.py +197 -228
app.py CHANGED
@@ -203,15 +203,124 @@ print(f"Using device: {device}")
203
  # Initialize pipeline dictionary
204
  inference_pipelines = {}
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def get_pipeline(pipeline_type):
207
  if pipeline_type in inference_pipelines:
208
  return inference_pipelines[pipeline_type]
209
 
210
  # Initialize pipeline based on the required pipeline type
211
  if pipeline_type == "style" or pipeline_type == "voice":
212
- # Download Content Tokenizer
213
- content_tokenizer_ckpt_path = ""
214
- if not downloaded_resources["tokenizer_vq32"]:
 
 
 
 
215
  local_dir = snapshot_download(
216
  repo_id="amphion/Vevo",
217
  repo_type="model",
@@ -221,17 +330,14 @@ def get_pipeline(pipeline_type):
221
  content_tokenizer_ckpt_path = os.path.join(
222
  local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
223
  )
224
- downloaded_resources["tokenizer_vq32"] = True
225
- print("Downloaded Content Tokenizer (vq32)")
226
- else:
227
- print("Content Tokenizer (vq32) already downloaded, skipping...")
228
- content_tokenizer_ckpt_path = os.path.join(
229
- "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq32/hubert_large_l18_c32.pkl"
230
- )
231
 
232
- # Download Content-Style Tokenizer
233
- content_style_tokenizer_ckpt_path = ""
234
- if not downloaded_resources["tokenizer_vq8192"]:
 
 
 
 
235
  local_dir = snapshot_download(
236
  repo_id="amphion/Vevo",
237
  repo_type="model",
@@ -239,73 +345,54 @@ def get_pipeline(pipeline_type):
239
  allow_patterns=["tokenizer/vq8192/*"],
240
  )
241
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
242
- downloaded_resources["tokenizer_vq8192"] = True
243
- print("Downloaded Content-Style Tokenizer (vq8192)")
244
- else:
245
- print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
246
- content_style_tokenizer_ckpt_path = os.path.join(
247
- "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
248
- )
249
 
250
- # Download Autoregressive Transformer
251
- ar_ckpt_path = ""
252
- if not downloaded_resources["ar_Vq32ToVq8192"]:
 
 
 
 
 
253
  local_dir = snapshot_download(
254
  repo_id="amphion/Vevo",
255
  repo_type="model",
256
  cache_dir="./ckpts/Vevo",
257
  allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
258
  )
259
- ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
260
  ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
261
- downloaded_resources["ar_Vq32ToVq8192"] = True
262
- print("Downloaded Autoregressive Transformer (Vq32ToVq8192)")
263
- else:
264
- print("Autoregressive Transformer (Vq32ToVq8192) already downloaded, skipping...")
265
- ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
266
- ar_ckpt_path = os.path.join(
267
- "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/Vq32ToVq8192"
268
- )
269
 
270
- # Download Flow Matching Transformer
271
- fmt_ckpt_path = ""
272
- if not downloaded_resources["fmt_Vq8192ToMels"]:
 
 
 
 
 
273
  local_dir = snapshot_download(
274
  repo_id="amphion/Vevo",
275
  repo_type="model",
276
  cache_dir="./ckpts/Vevo",
277
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
278
  )
279
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
280
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
281
- downloaded_resources["fmt_Vq8192ToMels"] = True
282
- print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
283
- else:
284
- print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
285
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
286
- fmt_ckpt_path = os.path.join(
287
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
288
- )
289
 
290
- # Download Vocoder
291
- vocoder_ckpt_path = ""
292
- if not downloaded_resources["vocoder"]:
 
 
 
 
 
293
  local_dir = snapshot_download(
294
  repo_id="amphion/Vevo",
295
  repo_type="model",
296
  cache_dir="./ckpts/Vevo",
297
  allow_patterns=["acoustic_modeling/Vocoder/*"],
298
  )
299
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
300
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
301
- downloaded_resources["vocoder"] = True
302
- print("Downloaded Vocoder")
303
- else:
304
- print("Vocoder already downloaded, skipping...")
305
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
306
- vocoder_ckpt_path = os.path.join(
307
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
308
- )
309
 
310
  # Initialize pipeline
311
  inference_pipeline = VevoInferencePipeline(
@@ -321,9 +408,13 @@ def get_pipeline(pipeline_type):
321
  )
322
 
323
  elif pipeline_type == "timbre":
324
- # Download Content-Style Tokenizer (only needed for timbre)
325
- content_style_tokenizer_ckpt_path = ""
326
- if not downloaded_resources["tokenizer_vq8192"]:
 
 
 
 
327
  local_dir = snapshot_download(
328
  repo_id="amphion/Vevo",
329
  repo_type="model",
@@ -331,53 +422,38 @@ def get_pipeline(pipeline_type):
331
  allow_patterns=["tokenizer/vq8192/*"],
332
  )
333
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
334
- downloaded_resources["tokenizer_vq8192"] = True
335
- print("Downloaded Content-Style Tokenizer (vq8192)")
336
- else:
337
- print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
338
- content_style_tokenizer_ckpt_path = os.path.join(
339
- "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
340
- )
341
 
342
- # Download Flow Matching Transformer
343
- fmt_ckpt_path = ""
344
- if not downloaded_resources["fmt_Vq8192ToMels"]:
 
 
 
 
 
345
  local_dir = snapshot_download(
346
  repo_id="amphion/Vevo",
347
  repo_type="model",
348
  cache_dir="./ckpts/Vevo",
349
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
350
  )
351
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
352
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
353
- downloaded_resources["fmt_Vq8192ToMels"] = True
354
- print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
355
- else:
356
- print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
357
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
358
- fmt_ckpt_path = os.path.join(
359
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
360
- )
361
 
362
- # Download Vocoder
363
- vocoder_ckpt_path = ""
364
- if not downloaded_resources["vocoder"]:
 
 
 
 
 
365
  local_dir = snapshot_download(
366
  repo_id="amphion/Vevo",
367
  repo_type="model",
368
  cache_dir="./ckpts/Vevo",
369
  allow_patterns=["acoustic_modeling/Vocoder/*"],
370
  )
371
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
372
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
373
- downloaded_resources["vocoder"] = True
374
- print("Downloaded Vocoder")
375
- else:
376
- print("Vocoder already downloaded, skipping...")
377
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
378
- vocoder_ckpt_path = os.path.join(
379
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
380
- )
381
 
382
  # Initialize pipeline
383
  inference_pipeline = VevoInferencePipeline(
@@ -390,9 +466,13 @@ def get_pipeline(pipeline_type):
390
  )
391
 
392
  elif pipeline_type == "tts":
393
- # Download Content-Style Tokenizer
394
- content_style_tokenizer_ckpt_path = ""
395
- if not downloaded_resources["tokenizer_vq8192"]:
 
 
 
 
396
  local_dir = snapshot_download(
397
  repo_id="amphion/Vevo",
398
  repo_type="model",
@@ -400,73 +480,54 @@ def get_pipeline(pipeline_type):
400
  allow_patterns=["tokenizer/vq8192/*"],
401
  )
402
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
403
- downloaded_resources["tokenizer_vq8192"] = True
404
- print("Downloaded Content-Style Tokenizer (vq8192)")
405
- else:
406
- print("Content-Style Tokenizer (vq8192) already downloaded, skipping...")
407
- content_style_tokenizer_ckpt_path = os.path.join(
408
- "./ckpts/Vevo/snapshots/amphion/Vevo", "tokenizer/vq8192"
409
- )
410
 
411
- # Download Autoregressive Transformer (TTS specific)
412
- ar_ckpt_path = ""
413
- if not downloaded_resources["ar_PhoneToVq8192"]:
 
 
 
 
 
414
  local_dir = snapshot_download(
415
  repo_id="amphion/Vevo",
416
  repo_type="model",
417
  cache_dir="./ckpts/Vevo",
418
  allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
419
  )
420
- ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
421
  ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
422
- downloaded_resources["ar_PhoneToVq8192"] = True
423
- print("Downloaded Autoregressive Transformer (PhoneToVq8192)")
424
- else:
425
- print("Autoregressive Transformer (PhoneToVq8192) already downloaded, skipping...")
426
- ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
427
- ar_ckpt_path = os.path.join(
428
- "./ckpts/Vevo/snapshots/amphion/Vevo", "contentstyle_modeling/PhoneToVq8192"
429
- )
430
 
431
- # Download Flow Matching Transformer
432
- fmt_ckpt_path = ""
433
- if not downloaded_resources["fmt_Vq8192ToMels"]:
 
 
 
 
 
434
  local_dir = snapshot_download(
435
  repo_id="amphion/Vevo",
436
  repo_type="model",
437
  cache_dir="./ckpts/Vevo",
438
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
439
  )
440
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
441
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
442
- downloaded_resources["fmt_Vq8192ToMels"] = True
443
- print("Downloaded Flow Matching Transformer (Vq8192ToMels)")
444
- else:
445
- print("Flow Matching Transformer (Vq8192ToMels) already downloaded, skipping...")
446
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
447
- fmt_ckpt_path = os.path.join(
448
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vq8192ToMels"
449
- )
450
 
451
- # Download Vocoder
452
- vocoder_ckpt_path = ""
453
- if not downloaded_resources["vocoder"]:
 
 
 
 
 
454
  local_dir = snapshot_download(
455
  repo_id="amphion/Vevo",
456
  repo_type="model",
457
  cache_dir="./ckpts/Vevo",
458
  allow_patterns=["acoustic_modeling/Vocoder/*"],
459
  )
460
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
461
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
462
- downloaded_resources["vocoder"] = True
463
- print("Downloaded Vocoder")
464
- else:
465
- print("Vocoder already downloaded, skipping...")
466
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
467
- vocoder_ckpt_path = os.path.join(
468
- "./ckpts/Vevo/snapshots/amphion/Vevo", "acoustic_modeling/Vocoder"
469
- )
470
 
471
  # Initialize pipeline
472
  inference_pipeline = VevoInferencePipeline(
@@ -895,98 +956,6 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_langua
895
  traceback.print_exc()
896
  raise e
897
 
898
- # 在程序启动时下载所有需要的模型资源
899
- # Download all necessary model resources at startup
900
- def preload_all_resources():
901
- print("Preloading all model resources...")
902
- # 下载配置文件
903
- # Download configuration files
904
- setup_configs()
905
-
906
- # 下载Content Tokenizer (vq32)
907
- # Download Content Tokenizer (vq32)
908
- if not downloaded_resources["tokenizer_vq32"]:
909
- print("Preloading Content Tokenizer (vq32)...")
910
- local_dir = snapshot_download(
911
- repo_id="amphion/Vevo",
912
- repo_type="model",
913
- cache_dir="./ckpts/Vevo",
914
- allow_patterns=["tokenizer/vq32/*"],
915
- )
916
- downloaded_resources["tokenizer_vq32"] = True
917
- print("Content Tokenizer (vq32) download completed")
918
-
919
- # 下载Content-Style Tokenizer (vq8192)
920
- # Download Content-Style Tokenizer (vq8192)
921
- if not downloaded_resources["tokenizer_vq8192"]:
922
- print("Preloading Content-Style Tokenizer (vq8192)...")
923
- local_dir = snapshot_download(
924
- repo_id="amphion/Vevo",
925
- repo_type="model",
926
- cache_dir="./ckpts/Vevo",
927
- allow_patterns=["tokenizer/vq8192/*"],
928
- )
929
- downloaded_resources["tokenizer_vq8192"] = True
930
- print("Content-Style Tokenizer (vq8192) download completed")
931
-
932
- # 下载Autoregressive Transformer (Vq32ToVq8192)
933
- # Download Autoregressive Transformer (Vq32ToVq8192)
934
- if not downloaded_resources["ar_Vq32ToVq8192"]:
935
- print("Preloading Autoregressive Transformer (Vq32ToVq8192)...")
936
- local_dir = snapshot_download(
937
- repo_id="amphion/Vevo",
938
- repo_type="model",
939
- cache_dir="./ckpts/Vevo",
940
- allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
941
- )
942
- downloaded_resources["ar_Vq32ToVq8192"] = True
943
- print("Autoregressive Transformer (Vq32ToVq8192) download completed")
944
-
945
- # 下载Autoregressive Transformer (PhoneToVq8192)
946
- # Download Autoregressive Transformer (PhoneToVq8192)
947
- if not downloaded_resources["ar_PhoneToVq8192"]:
948
- print("Preloading Autoregressive Transformer (PhoneToVq8192)...")
949
- local_dir = snapshot_download(
950
- repo_id="amphion/Vevo",
951
- repo_type="model",
952
- cache_dir="./ckpts/Vevo",
953
- allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
954
- )
955
- downloaded_resources["ar_PhoneToVq8192"] = True
956
- print("Autoregressive Transformer (PhoneToVq8192) download completed")
957
-
958
- # 下载Flow Matching Transformer
959
- # Download Flow Matching Transformer
960
- if not downloaded_resources["fmt_Vq8192ToMels"]:
961
- print("Preloading Flow Matching Transformer (Vq8192ToMels)...")
962
- local_dir = snapshot_download(
963
- repo_id="amphion/Vevo",
964
- repo_type="model",
965
- cache_dir="./ckpts/Vevo",
966
- allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
967
- )
968
- downloaded_resources["fmt_Vq8192ToMels"] = True
969
- print("Flow Matching Transformer (Vq8192ToMels) download completed")
970
-
971
- # 下载Vocoder
972
- # Download Vocoder
973
- if not downloaded_resources["vocoder"]:
974
- print("Preloading Vocoder...")
975
- local_dir = snapshot_download(
976
- repo_id="amphion/Vevo",
977
- repo_type="model",
978
- cache_dir="./ckpts/Vevo",
979
- allow_patterns=["acoustic_modeling/Vocoder/*"],
980
- )
981
- downloaded_resources["vocoder"] = True
982
- print("Vocoder download completed")
983
-
984
- print("All model resources preloading completed!")
985
-
986
- # 在创建Gradio界面之前预加载所有资源
987
- # Preload all resources before creating the Gradio interface
988
- preload_all_resources()
989
-
990
  # Create Gradio interface
991
  with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
992
  gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
 
203
  # Initialize pipeline dictionary
204
  inference_pipelines = {}
205
 
206
+ # Download all necessary model resources at startup
207
def preload_all_resources():
    """Download every Vevo checkpoint up front and record where each one landed.

    For each resource this calls ``snapshot_download`` on the ``amphion/Vevo``
    model repo (restricted to that resource's sub-folder via
    ``allow_patterns``), stores the returned local directory in the matching
    module-level ``downloaded_*_path`` variable, and sets the matching flag in
    ``downloaded_resources`` to ``True``.

    A resource is skipped only when its flag is already set AND its path
    variable is populated. Checking the flag alone would leave the path as
    ``None`` whenever the flag had been set elsewhere, and ``get_pipeline``
    would then fail in ``os.path.join(None, ...)``.

    Returns:
        None. Results are published through the module-level path variables
        and the ``downloaded_resources`` dict.
    """
    print("Preloading all model resources...")
    # Download configuration files
    setup_configs()

    # One row per resource:
    #   (key in downloaded_resources,
    #    name of the module-level variable that receives the local dir,
    #    label used in the progress messages,
    #    allow_patterns glob inside the amphion/Vevo repo)
    resources = (
        (
            "tokenizer_vq32",
            "downloaded_content_tokenizer_path",
            "Content Tokenizer (vq32)",
            "tokenizer/vq32/*",
        ),
        (
            "tokenizer_vq8192",
            "downloaded_content_style_tokenizer_path",
            "Content-Style Tokenizer (vq8192)",
            "tokenizer/vq8192/*",
        ),
        (
            "ar_Vq32ToVq8192",
            "downloaded_ar_vq32_path",
            "Autoregressive Transformer (Vq32ToVq8192)",
            "contentstyle_modeling/Vq32ToVq8192/*",
        ),
        (
            "ar_PhoneToVq8192",
            "downloaded_ar_phone_path",
            "Autoregressive Transformer (PhoneToVq8192)",
            "contentstyle_modeling/PhoneToVq8192/*",
        ),
        (
            "fmt_Vq8192ToMels",
            "downloaded_fmt_path",
            "Flow Matching Transformer (Vq8192ToMels)",
            "acoustic_modeling/Vq8192ToMels/*",
        ),
        (
            "vocoder",
            "downloaded_vocoder_path",
            "Vocoder",
            "acoustic_modeling/Vocoder/*",
        ),
    )

    # Writing through globals() is equivalent to six `global` declarations,
    # but lets the variable name live in the table above.
    module_vars = globals()

    for flag_key, path_var, label, pattern in resources:
        already_resolved = (
            downloaded_resources[flag_key]
            and module_vars.get(path_var) is not None
        )
        if already_resolved:
            continue

        print(f"Preloading {label}...")
        local_dir = snapshot_download(
            repo_id="amphion/Vevo",
            repo_type="model",
            cache_dir="./ckpts/Vevo",
            allow_patterns=[pattern],
        )
        # Record the path before flipping the flag so a reader never sees
        # flag=True together with path=None.
        module_vars[path_var] = local_dir
        downloaded_resources[flag_key] = True
        print(f"{label} download completed")

    print("All model resources preloading completed!")
299
+
300
+ # Initialize path variables to store downloaded model paths
301
+ downloaded_content_tokenizer_path = None
302
+ downloaded_content_style_tokenizer_path = None
303
+ downloaded_ar_vq32_path = None
304
+ downloaded_ar_phone_path = None
305
+ downloaded_fmt_path = None
306
+ downloaded_vocoder_path = None
307
+
308
+ # Preload all resources before creating the Gradio interface
309
+ preload_all_resources()
310
+
311
  def get_pipeline(pipeline_type):
312
  if pipeline_type in inference_pipelines:
313
  return inference_pipelines[pipeline_type]
314
 
315
  # Initialize pipeline based on the required pipeline type
316
  if pipeline_type == "style" or pipeline_type == "voice":
317
+ # Use already downloaded Content Tokenizer
318
+ if downloaded_resources["tokenizer_vq32"]:
319
+ content_tokenizer_ckpt_path = os.path.join(
320
+ downloaded_content_tokenizer_path, "tokenizer/vq32/hubert_large_l18_c32.pkl"
321
+ )
322
+ else:
323
+ # Fallback to direct download
324
  local_dir = snapshot_download(
325
  repo_id="amphion/Vevo",
326
  repo_type="model",
 
330
  content_tokenizer_ckpt_path = os.path.join(
331
  local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
332
  )
 
 
 
 
 
 
 
333
 
334
+ # Use already downloaded Content-Style Tokenizer
335
+ if downloaded_resources["tokenizer_vq8192"]:
336
+ content_style_tokenizer_ckpt_path = os.path.join(
337
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
338
+ )
339
+ else:
340
+ # Fallback to direct download
341
  local_dir = snapshot_download(
342
  repo_id="amphion/Vevo",
343
  repo_type="model",
 
345
  allow_patterns=["tokenizer/vq8192/*"],
346
  )
347
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
 
 
 
 
 
 
 
348
 
349
+ # Use already downloaded Autoregressive Transformer
350
+ ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
351
+ if downloaded_resources["ar_Vq32ToVq8192"]:
352
+ ar_ckpt_path = os.path.join(
353
+ downloaded_ar_vq32_path, "contentstyle_modeling/Vq32ToVq8192"
354
+ )
355
+ else:
356
+ # Fallback to direct download
357
  local_dir = snapshot_download(
358
  repo_id="amphion/Vevo",
359
  repo_type="model",
360
  cache_dir="./ckpts/Vevo",
361
  allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
362
  )
 
363
  ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
 
 
 
 
 
 
 
 
364
 
365
+ # Use already downloaded Flow Matching Transformer
366
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
367
+ if downloaded_resources["fmt_Vq8192ToMels"]:
368
+ fmt_ckpt_path = os.path.join(
369
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
370
+ )
371
+ else:
372
+ # Fallback to direct download
373
  local_dir = snapshot_download(
374
  repo_id="amphion/Vevo",
375
  repo_type="model",
376
  cache_dir="./ckpts/Vevo",
377
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
378
  )
 
379
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
 
 
 
 
 
 
 
 
380
 
381
+ # Use already downloaded Vocoder
382
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
383
+ if downloaded_resources["vocoder"]:
384
+ vocoder_ckpt_path = os.path.join(
385
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
386
+ )
387
+ else:
388
+ # Fallback to direct download
389
  local_dir = snapshot_download(
390
  repo_id="amphion/Vevo",
391
  repo_type="model",
392
  cache_dir="./ckpts/Vevo",
393
  allow_patterns=["acoustic_modeling/Vocoder/*"],
394
  )
 
395
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
 
 
 
 
 
 
 
 
396
 
397
  # Initialize pipeline
398
  inference_pipeline = VevoInferencePipeline(
 
408
  )
409
 
410
  elif pipeline_type == "timbre":
411
+ # Use already downloaded Content-Style Tokenizer
412
+ if downloaded_resources["tokenizer_vq8192"]:
413
+ content_style_tokenizer_ckpt_path = os.path.join(
414
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
415
+ )
416
+ else:
417
+ # Fallback to direct download
418
  local_dir = snapshot_download(
419
  repo_id="amphion/Vevo",
420
  repo_type="model",
 
422
  allow_patterns=["tokenizer/vq8192/*"],
423
  )
424
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
 
 
 
 
 
 
 
425
 
426
+ # Use already downloaded Flow Matching Transformer
427
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
428
+ if downloaded_resources["fmt_Vq8192ToMels"]:
429
+ fmt_ckpt_path = os.path.join(
430
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
431
+ )
432
+ else:
433
+ # Fallback to direct download
434
  local_dir = snapshot_download(
435
  repo_id="amphion/Vevo",
436
  repo_type="model",
437
  cache_dir="./ckpts/Vevo",
438
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
439
  )
 
440
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
 
 
 
 
 
 
 
 
441
 
442
+ # Use already downloaded Vocoder
443
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
444
+ if downloaded_resources["vocoder"]:
445
+ vocoder_ckpt_path = os.path.join(
446
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
447
+ )
448
+ else:
449
+ # Fallback to direct download
450
  local_dir = snapshot_download(
451
  repo_id="amphion/Vevo",
452
  repo_type="model",
453
  cache_dir="./ckpts/Vevo",
454
  allow_patterns=["acoustic_modeling/Vocoder/*"],
455
  )
 
456
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
 
 
 
 
 
 
 
 
457
 
458
  # Initialize pipeline
459
  inference_pipeline = VevoInferencePipeline(
 
466
  )
467
 
468
  elif pipeline_type == "tts":
469
+ # Use already downloaded Content-Style Tokenizer
470
+ if downloaded_resources["tokenizer_vq8192"]:
471
+ content_style_tokenizer_ckpt_path = os.path.join(
472
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
473
+ )
474
+ else:
475
+ # Fallback to direct download
476
  local_dir = snapshot_download(
477
  repo_id="amphion/Vevo",
478
  repo_type="model",
 
480
  allow_patterns=["tokenizer/vq8192/*"],
481
  )
482
  content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
 
 
 
 
 
 
 
483
 
484
+ # Use already downloaded Autoregressive Transformer (TTS specific)
485
+ ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
486
+ if downloaded_resources["ar_PhoneToVq8192"]:
487
+ ar_ckpt_path = os.path.join(
488
+ downloaded_ar_phone_path, "contentstyle_modeling/PhoneToVq8192"
489
+ )
490
+ else:
491
+ # Fallback to direct download
492
  local_dir = snapshot_download(
493
  repo_id="amphion/Vevo",
494
  repo_type="model",
495
  cache_dir="./ckpts/Vevo",
496
  allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
497
  )
 
498
  ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
 
 
 
 
 
 
 
 
499
 
500
+ # Use already downloaded Flow Matching Transformer
501
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
502
+ if downloaded_resources["fmt_Vq8192ToMels"]:
503
+ fmt_ckpt_path = os.path.join(
504
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
505
+ )
506
+ else:
507
+ # Fallback to direct download
508
  local_dir = snapshot_download(
509
  repo_id="amphion/Vevo",
510
  repo_type="model",
511
  cache_dir="./ckpts/Vevo",
512
  allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
513
  )
 
514
  fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
 
 
 
 
 
 
 
 
515
 
516
+ # Use already downloaded Vocoder
517
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
518
+ if downloaded_resources["vocoder"]:
519
+ vocoder_ckpt_path = os.path.join(
520
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
521
+ )
522
+ else:
523
+ # Fallback to direct download
524
  local_dir = snapshot_download(
525
  repo_id="amphion/Vevo",
526
  repo_type="model",
527
  cache_dir="./ckpts/Vevo",
528
  allow_patterns=["acoustic_modeling/Vocoder/*"],
529
  )
 
530
  vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
 
 
 
 
 
 
 
 
531
 
532
  # Initialize pipeline
533
  inference_pipeline = VevoInferencePipeline(
 
956
  traceback.print_exc()
957
  raise e
958
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
  # Create Gradio interface
960
  with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
961
  gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")