{
  "tts_version": "v1.5.0",
  "split": "opensource-en",
  "ttl_ckpt_path": "unknown.pt",
  "dp_ckpt_path": "unknown.pt",
  "ae_ckpt_path": "unknown.pt",
  "ttl_train": "unknown",
  "dp_train": "unknown",
  "ae_train": "unknown",
  "ttl": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "batch_expander": {
      "n_batch_expand": 6
    },
    "normalizer": {
      "scale": 0.25
    },
    "text_encoder": {
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [
          1,
          1,
          1,
          1,
          1,
          1
        ]
      },
      "attn_encoder": {
        "hidden_channels": 256,
        "filter_channels": 1024,
        "n_heads": 4,
        "n_layers": 4,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 256,
        "odim": 256
      }
    },
    "flow_matching": {
      "sig_min": 0
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [
          1,
          1,
          1,
          1,
          1,
          1
        ]
      },
      "style_token_layer": {
        "input_dim": 256,
        "n_style": 50,
        "style_key_dim": 256,
        "style_value_dim": 256,
        "prototype_dim": 256,
        "n_units": 256,
        "n_heads": 2
      }
    },
    "speech_prompted_text_encoder": {
      "text_dim": 256,
      "style_dim": 256,
      "n_units": 256,
      "n_heads": 2
    },
    "uncond_masker": {
      "prob_both_uncond": 0.04,
      "prob_text_uncond": 0.01,
      "std": 0.1,
      "text_dim": 256,
      "n_style": 50,
      "style_key_dim": 256,
      "style_value_dim": 256
    },
    "vector_field": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 512
      },
      "time_encoder": {
        "time_dim": 64,
        "hdim": 256
      },
      "main_blocks": {
        "n_blocks": 4,
        "time_cond_layer": {
          "idim": 512,
          "time_dim": 64
        },
        "style_cond_layer": {
          "idim": 512,
          "style_dim": 256
        },
        "text_cond_layer": {
          "idim": 512,
          "text_dim": 256,
          "n_heads": 4,
          "use_residual": true,
          "rotary_base": 10000,
          "rotary_scale": 10
        },
        "convnext_0": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 4,
          "dilation_lst": [
            1,
            2,
            4,
            8
          ]
        },
        "convnext_1": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [
            1
          ]
        },
        "convnext_2": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [
            1
          ]
        }
      },
      "last_convnext": {
        "idim": 512,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 4,
        "dilation_lst": [
          1,
          1,
          1,
          1
        ]
      },
      "proj_out": {
        "idim": 512,
        "chunk_compress_factor": 6,
        "ldim": 24
      }
    }
  },
  "ae": {
    "sample_rate": 44100,
    "n_delay": 0,
    "base_chunk_size": 512,
    "chunk_compress_factor": 1,
    "ldim": 24,
    "encoder": {
      "spec_processor": {
        "n_fft": 2048,
        "win_length": 2048,
        "hop_length": 512,
        "n_mels": 228,
        "sample_rate": 44100,
        "eps": 1e-05,
        "norm_mean": 0.0,
        "norm_std": 1.0
      },
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1
      ],
      "intermediate_dim": 2048,
      "idim": 1253,
      "hdim": 512,
      "odim": 24
    },
    "decoder": {
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [
        1,
        2,
        4,
        1,
        2,
        4,
        1,
        1,
        1,
        1
      ],
      "intermediate_dim": 2048,
      "idim": 24,
      "hdim": 512,
      "head": {
        "idim": 512,
        "hdim": 2048,
        "odim": 512,
        "ksz": 3
      }
    }
  },
  "dp": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "normalizer": {
      "scale": 1.0
    },
    "sentence_encoder": {
      "char_emb_dim": 64,
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 6,
        "dilation_lst": [
          1,
          1,
          1,
          1,
          1,
          1
        ]
      },
      "attn_encoder": {
        "hidden_channels": 64,
        "filter_channels": 256,
        "n_heads": 2,
        "n_layers": 2,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 64,
        "odim": 64
      }
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 4,
        "dilation_lst": [
          1,
          1,
          1,
          1
        ]
      },
      "style_token_layer": {
        "input_dim": 64,
        "n_style": 8,
        "style_key_dim": 0,
        "style_value_dim": 16,
        "prototype_dim": 64,
        "n_units": 64,
        "n_heads": 2
      }
    },
    "predictor": {
      "sentence_dim": 64,
      "n_style": 8,
      "style_dim": 16,
      "hdim": 128,
      "n_layer": 2
    }
  }
}