Text-to-Speech
index-tts
index-tts committed
Commit 359633a · verified · 1 Parent(s): 1ede383

Upload 6 files

Files changed (6)
  1. bigvgan_generator.pth +3 -0
  2. bpe.model +3 -0
  3. config.yaml +112 -0
  4. dvae.pth +3 -0
  5. gpt.pth +3 -0
  6. unigram_12000.vocab +0 -0
bigvgan_generator.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ec77084929fad053355669c8b5986e32542f13afeff78ad93389a8f06ce62b0
+ size 525166944
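
The .pth and .model files in this commit are stored as Git LFS pointers, so a clone without LFS support only yields the short pointer text shown in these hunks, not the weights themselves. Below is a minimal sketch of fetching the actual files with the huggingface_hub client; the repo_id is a placeholder assumption, not something stated on this page.

# Sketch only: pull the LFS-backed files from this commit via huggingface_hub.
# Assumes `pip install huggingface_hub`; repo_id below is a guessed placeholder.
from huggingface_hub import hf_hub_download

repo_id = "index-tts/index-tts"  # assumption: replace with the actual repo id
for filename in ["bigvgan_generator.pth", "bpe.model", "config.yaml",
                 "dvae.pth", "gpt.pth", "unigram_12000.vocab"]:
    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
    print(filename, "->", local_path)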
bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf30028855ff4a89f6663325c88b44a69f74f97990dd410a4b35414c4db31779
+ size 476049
config.yaml ADDED
@@ -0,0 +1,112 @@
+ dataset:
+   bpe_model: bpe.model
+   sample_rate: 24000
+   squeeze: false
+   mel:
+     sample_rate: 24000
+     n_fft: 1024
+     hop_length: 256
+     win_length: 1024
+     n_mels: 100
+     mel_fmin: 0
+     normalize: false
+
+ gpt:
+   model_dim: 1024
+   max_mel_tokens: 605
+   max_text_tokens: 402
+   heads: 16
+   use_mel_codes_as_input: true
+   mel_length_compression: 1024
+   layers: 20
+   number_text_tokens: 12000
+   number_mel_codes: 8194
+   start_mel_token: 8192
+   stop_mel_token: 8193
+   start_text_token: 0
+   stop_text_token: 1
+   train_solo_embeddings: false
+   condition_type: "conformer_perceiver"
+   condition_module:
+     output_size: 512
+     linear_units: 2048
+     attention_heads: 8
+     num_blocks: 6
+     input_layer: "conv2d2"
+     perceiver_mult: 2
+
+ vqvae:
+   channels: 100
+   num_tokens: 8192
+   hidden_dim: 512
+   num_resnet_blocks: 3
+   codebook_dim: 512
+   num_layers: 2
+   positional_dims: 1
+   kernel_size: 3
+   smooth_l1_loss: true
+   use_transposed_convs: false
+
+ bigvgan:
+   adam_b1: 0.8
+   adam_b2: 0.99
+   lr_decay: 0.999998
+   seed: 1234
+
+   resblock: "1"
+   upsample_rates: [4,4,4,4,2,2]
+   upsample_kernel_sizes: [8,8,4,4,4,4]
+   upsample_initial_channel: 1536
+   resblock_kernel_sizes: [3,7,11]
+   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+   feat_upsample: false
+   speaker_embedding_dim: 512
+   cond_d_vector_in_each_upsampling_layer: true
+
+   gpt_dim: 1024
+
+   activation: "snakebeta"
+   snake_logscale: true
+
+   use_cqtd_instead_of_mrd: true
+   cqtd_filters: 128
+   cqtd_max_filters: 1024
+   cqtd_filters_scale: 1
+   cqtd_dilations: [1, 2, 4]
+   cqtd_hop_lengths: [512, 256, 256]
+   cqtd_n_octaves: [9, 9, 9]
+   cqtd_bins_per_octaves: [24, 36, 48]
+
+   resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
+   mpd_reshapes: [2, 3, 5, 7, 11]
+   use_spectral_norm: false
+   discriminator_channel_mult: 1
+
+   use_multiscale_melloss: true
+   lambda_melloss: 15
+
+   clip_grad_norm: 1000
+
+   segment_size: 16384
+   num_mels: 100
+   num_freq: 1025
+   n_fft: 1024
+   hop_size: 256
+   win_size: 1024
+
+   sampling_rate: 24000
+
+   fmin: 0
+   fmax: null
+   fmax_for_loss: null
+   mel_type: "pytorch"
+
+   num_workers: 2
+   dist_config:
+     dist_backend: "nccl"
+     dist_url: "tcp://localhost:54321"
+     world_size: 1
+
+ dvae_checkpoint: dvae.pth
+ gpt_checkpoint: gpt.pth
+ bigvgan_checkpoint: bigvgan_generator.pth
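
config.yaml ties the pieces together: a 24 kHz mel front end under dataset, the GPT acoustic model, the VQ-VAE mel codec, the BigVGAN vocoder (plus its training and discriminator settings), and the three checkpoint paths at the bottom. A minimal sketch of reading it and deriving a couple of sanity-check quantities, assuming PyYAML (the upstream code may use a different loader):

# Sketch only: parse config.yaml with PyYAML and derive a few quantities.
# Assumes `pip install pyyaml`; upstream may load this file differently.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

mel = cfg["dataset"]["mel"]
gpt = cfg["gpt"]

# 256-sample hop at 24000 Hz -> roughly 10.7 ms per mel frame.
frame_ms = 1000 * mel["hop_length"] / mel["sample_rate"]

# If mel_length_compression counts waveform samples per GPT mel code (an
# assumption), each code spans 1024 / 256 = 4 mel frames.
frames_per_code = gpt["mel_length_compression"] // mel["hop_length"]

print(f"mel frame: {frame_ms:.2f} ms; mel frames per GPT code: {frames_per_code}")
print("checkpoints:", cfg["dvae_checkpoint"], cfg["gpt_checkpoint"], cfg["bigvgan_checkpoint"])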
dvae.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c112404dfe25d8d88084b507b0637037a419b4a5a0d9160516d9398a8f2b52c8
+ size 243316270
gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7797ed691d9c0295fd30af153d9ff04501e353a4c67c3f898e4b0840a5ef10dd
+ size 696529044
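
Because each pointer follows the Git LFS spec linked above, its oid is the SHA-256 of the stored file, so a downloaded checkpoint can be verified against the hashes in this commit. A minimal sketch, using the gpt.pth oid from the hunk above:

# Sketch only: check a downloaded file against the sha256 oid in its LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the gpt.pth pointer in this commit
expected = "7797ed691d9c0295fd30af153d9ff04501e353a4c67c3f898e4b0840a5ef10dd"
assert sha256_of("gpt.pth") == expected, "gpt.pth does not match its LFS pointer"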
unigram_12000.vocab ADDED
The diff for this file is too large to render. See raw diff