Upload 6 files
Browse files- bigvgan_generator.pth +3 -0
- bpe.model +3 -0
- config.yaml +112 -0
- dvae.pth +3 -0
- gpt.pth +3 -0
- unigram_12000.vocab +0 -0
bigvgan_generator.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ec77084929fad053355669c8b5986e32542f13afeff78ad93389a8f06ce62b0
|
3 |
+
size 525166944
|
bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf30028855ff4a89f6663325c88b44a69f74f97990dd410a4b35414c4db31779
|
3 |
+
size 476049
|
config.yaml
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset:
|
2 |
+
bpe_model: bpe.model
|
3 |
+
sample_rate: 24000
|
4 |
+
squeeze: false
|
5 |
+
mel:
|
6 |
+
sample_rate: 24000
|
7 |
+
n_fft: 1024
|
8 |
+
hop_length: 256
|
9 |
+
win_length: 1024
|
10 |
+
n_mels: 100
|
11 |
+
mel_fmin: 0
|
12 |
+
normalize: false
|
13 |
+
|
14 |
+
gpt:
|
15 |
+
model_dim: 1024
|
16 |
+
max_mel_tokens: 605
|
17 |
+
max_text_tokens: 402
|
18 |
+
heads: 16
|
19 |
+
use_mel_codes_as_input: true
|
20 |
+
mel_length_compression: 1024
|
21 |
+
layers: 20
|
22 |
+
number_text_tokens: 12000
|
23 |
+
number_mel_codes: 8194
|
24 |
+
start_mel_token: 8192
|
25 |
+
stop_mel_token: 8193
|
26 |
+
start_text_token: 0
|
27 |
+
stop_text_token: 1
|
28 |
+
train_solo_embeddings: false
|
29 |
+
condition_type: "conformer_perceiver"
|
30 |
+
condition_module:
|
31 |
+
output_size: 512
|
32 |
+
linear_units: 2048
|
33 |
+
attention_heads: 8
|
34 |
+
num_blocks: 6
|
35 |
+
input_layer: "conv2d2"
|
36 |
+
perceiver_mult: 2
|
37 |
+
|
38 |
+
vqvae:
|
39 |
+
channels: 100
|
40 |
+
num_tokens: 8192
|
41 |
+
hidden_dim: 512
|
42 |
+
num_resnet_blocks: 3
|
43 |
+
codebook_dim: 512
|
44 |
+
num_layers: 2
|
45 |
+
positional_dims: 1
|
46 |
+
kernel_size: 3
|
47 |
+
smooth_l1_loss: true
|
48 |
+
use_transposed_convs: false
|
49 |
+
|
50 |
+
bigvgan:
|
51 |
+
adam_b1: 0.8
|
52 |
+
adam_b2: 0.99
|
53 |
+
lr_decay: 0.999998
|
54 |
+
seed: 1234
|
55 |
+
|
56 |
+
resblock: "1"
|
57 |
+
upsample_rates: [4,4,4,4,2,2]
|
58 |
+
upsample_kernel_sizes: [8,8,4,4,4,4]
|
59 |
+
upsample_initial_channel: 1536
|
60 |
+
resblock_kernel_sizes: [3,7,11]
|
61 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
62 |
+
feat_upsample: false
|
63 |
+
speaker_embedding_dim: 512
|
64 |
+
cond_d_vector_in_each_upsampling_layer: true
|
65 |
+
|
66 |
+
gpt_dim: 1024
|
67 |
+
|
68 |
+
activation: "snakebeta"
|
69 |
+
snake_logscale: true
|
70 |
+
|
71 |
+
use_cqtd_instead_of_mrd: true
|
72 |
+
cqtd_filters: 128
|
73 |
+
cqtd_max_filters: 1024
|
74 |
+
cqtd_filters_scale: 1
|
75 |
+
cqtd_dilations: [1, 2, 4]
|
76 |
+
cqtd_hop_lengths: [512, 256, 256]
|
77 |
+
cqtd_n_octaves: [9, 9, 9]
|
78 |
+
cqtd_bins_per_octaves: [24, 36, 48]
|
79 |
+
|
80 |
+
resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
|
81 |
+
mpd_reshapes: [2, 3, 5, 7, 11]
|
82 |
+
use_spectral_norm: false
|
83 |
+
discriminator_channel_mult: 1
|
84 |
+
|
85 |
+
use_multiscale_melloss: true
|
86 |
+
lambda_melloss: 15
|
87 |
+
|
88 |
+
clip_grad_norm: 1000
|
89 |
+
|
90 |
+
segment_size: 16384
|
91 |
+
num_mels: 100
|
92 |
+
num_freq: 1025
|
93 |
+
n_fft: 1024
|
94 |
+
hop_size: 256
|
95 |
+
win_size: 1024
|
96 |
+
|
97 |
+
sampling_rate: 24000
|
98 |
+
|
99 |
+
fmin: 0
|
100 |
+
fmax: null
|
101 |
+
fmax_for_loss: null
|
102 |
+
mel_type: "pytorch"
|
103 |
+
|
104 |
+
num_workers: 2
|
105 |
+
dist_config:
|
106 |
+
dist_backend: "nccl"
|
107 |
+
dist_url: "tcp://localhost:54321"
|
108 |
+
world_size: 1
|
109 |
+
|
110 |
+
dvae_checkpoint: dvae.pth
|
111 |
+
gpt_checkpoint: gpt.pth
|
112 |
+
bigvgan_checkpoint: bigvgan_generator.pth
|
dvae.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c112404dfe25d8d88084b507b0637037a419b4a5a0d9160516d9398a8f2b52c8
|
3 |
+
size 243316270
|
gpt.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7797ed691d9c0295fd30af153d9ff04501e353a4c67c3f898e4b0840a5ef10dd
|
3 |
+
size 696529044
|
unigram_12000.vocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|