# Viewer metadata for the original file: 224 lines, 5.7 KiB, YAML.
# Version and split identifiers, followed by checkpoint-path and training-run
# fields for the ttl, dp and ae components (all "unknown" in this file).
# NOTE(review): bare "|" lines that followed every entry were extraction
# residue, not YAML content, and have been removed.
tts_version: "v1.5.0"
split: "opensource-en"

ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"

ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"
# ttl: hyperparameters for the component made of a text encoder, a style
# encoder, a speech-prompted text encoder, an unconditional masker and a
# flow-matching vector field.
# NOTE(review): indentation was lost in extraction; the nesting below is
# reconstructed from key semantics and from repeated keys (e.g. `convnext`,
# `idim`, `n_heads`) that cannot share one mapping. Confirm against the
# original file before relying on it.
ttl:
  latent_dim: 24
  chunk_compress_factor: 6
  batch_expander:
    n_batch_expand: 6
  normalizer:
    scale: 0.25
  text_encoder:
    # Same dictionary file is referenced again by text_embedder below;
    # keep the two paths in sync.
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      # List length matches num_layers.
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 256
      filter_channels: 1024
      n_heads: 4
      n_layers: 4
      p_dropout: 0.0
    proj_out:
      idim: 256
      odim: 256
  flow_matching:
    sig_min: 0
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      # List length matches num_layers.
      dilation_lst: [1, 1, 1, 1, 1, 1]
    style_token_layer:
      input_dim: 256
      n_style: 50
      style_key_dim: 256
      style_value_dim: 256
      prototype_dim: 256
      n_units: 256
      n_heads: 2
  speech_prompted_text_encoder:
    text_dim: 256
    style_dim: 256
    n_units: 256
    n_heads: 2
  uncond_masker:
    prob_both_uncond: 0.04
    prob_text_uncond: 0.01
    std: 0.1
    text_dim: 256
    # n_style / style_key_dim / style_value_dim repeat the values given in
    # style_encoder.style_token_layer above.
    n_style: 50
    style_key_dim: 256
    style_value_dim: 256
  vector_field:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 512
    time_encoder:
      time_dim: 64
      hdim: 256
    # NOTE(review): the convnext_0..2 entries are placed inside main_blocks
    # on the assumption that they describe each repeated block — confirm.
    main_blocks:
      n_blocks: 4
      time_cond_layer:
        idim: 512
        time_dim: 64
      style_cond_layer:
        idim: 512
        style_dim: 256
      text_cond_layer:
        idim: 512
        text_dim: 256
        n_heads: 4
        # Canonical lowercase boolean (was `True`).
        use_residual: true
        rotary_base: 10000
        rotary_scale: 10
      convnext_0:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 4
        dilation_lst: [1, 2, 4, 8]
      convnext_1:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
      convnext_2:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
    last_convnext:
      idim: 512
      ksz: 5
      intermediate_dim: 1024
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    proj_out:
      idim: 512
      chunk_compress_factor: 6
      ldim: 24
# ae: hyperparameters for the component with a spectrogram-fed encoder and a
# decoder with an output head.
# NOTE(review): indentation was lost in extraction; the nesting below is
# reconstructed from key semantics and repeated keys (`ksz_init`, `idim`,
# `hdim`, ...). In particular, `head` is assumed to belong to `decoder` and
# the `ksz_init`..`odim` run to `encoder` rather than `spec_processor` —
# confirm against the original file.
ae:
  sample_rate: 44100
  n_delay: 0
  base_chunk_size: 512
  chunk_compress_factor: 1
  ldim: 24
  encoder:
    spec_processor:
      n_fft: 2048
      win_length: 2048
      # Same value as ae.base_chunk_size.
      hop_length: 512
      n_mels: 228
      # Repeats ae.sample_rate; keep the two in sync.
      sample_rate: 44100
      # Written with an explicit ".0" (was `1e-05`): YAML 1.1 loaders such
      # as PyYAML only resolve exponent notation as a float when the
      # mantissa contains a dot; otherwise the value loads as a string.
      eps: 1.0e-05
      norm_mean: 0.0
      norm_std: 1.0
    ksz_init: 7
    ksz: 7
    num_layers: 10
    # List length matches num_layers.
    dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    intermediate_dim: 2048
    # NOTE(review): 1253 = (n_fft / 2 + 1) + n_mels = 1025 + 228 —
    # presumably linear and mel features concatenated; confirm.
    idim: 1253
    hdim: 512
    odim: 24
  decoder:
    ksz_init: 7
    ksz: 7
    num_layers: 10
    # List length matches num_layers.
    dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 24
    hdim: 512
    head:
      idim: 512
      hdim: 2048
      odim: 512
      ksz: 3
# dp: hyperparameters for the component made of a sentence encoder, a style
# encoder and a predictor.
# NOTE(review): indentation was lost in extraction; the nesting below is
# reconstructed from key semantics and repeated keys (`char_dict_path`,
# `convnext`, `n_heads`, ...). Confirm against the original file.
dp:
  latent_dim: 24
  chunk_compress_factor: 6
  normalizer:
    scale: 1.0
  sentence_encoder:
    char_emb_dim: 64
    # Same dictionary file is referenced again by text_embedder below;
    # keep the two paths in sync.
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 6
      # List length matches num_layers.
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 64
      filter_channels: 256
      n_heads: 2
      n_layers: 2
      p_dropout: 0.0
    proj_out:
      idim: 64
      odim: 64
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 4
      # List length matches num_layers.
      dilation_lst: [1, 1, 1, 1]
    style_token_layer:
      input_dim: 64
      n_style: 8
      # NOTE(review): key dimension is 0 here (the ttl section uses 256) —
      # presumably style keys are unused for dp; confirm this is intentional.
      style_key_dim: 0
      style_value_dim: 16
      prototype_dim: 64
      n_units: 64
      n_heads: 2
  predictor:
    sentence_dim: 64
    # n_style and style_dim repeat style_token_layer's n_style and
    # style_value_dim above.
    n_style: 8
    style_dim: 16
    hdim: 128
    n_layer: 2
# Absolute paths to the unicode indexer, window and filter-bank resources.
# NOTE(review): treated as top-level keys because a blank line separated them
# from the `dp` section in the original layout — confirm. The paths are
# absolute (/data/public/...), so they presumably only resolve on the machine
# that exported this config.
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"