add supertonic

This commit is contained in:
backuppc
2025-12-10 11:39:15 +09:00
parent 3695ab0044
commit 868fa2deec
22 changed files with 54223 additions and 1 deletions

View File

@@ -0,0 +1,316 @@
{
"tts_version": "v1.5.0",
"split": "opensource-en",
"ttl_ckpt_path": "unknown.pt",
"dp_ckpt_path": "unknown.pt",
"ae_ckpt_path": "unknown.pt",
"ttl_train": "unknown",
"dp_train": "unknown",
"ae_train": "unknown",
"ttl": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"batch_expander": {
"n_batch_expand": 6
},
"normalizer": {
"scale": 0.25
},
"text_encoder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 256,
"filter_channels": 1024,
"n_heads": 4,
"n_layers": 4,
"p_dropout": 0.0
},
"proj_out": {
"idim": 256,
"odim": 256
}
},
"flow_matching": {
"sig_min": 0
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256,
"prototype_dim": 256,
"n_units": 256,
"n_heads": 2
}
},
"speech_prompted_text_encoder": {
"text_dim": 256,
"style_dim": 256,
"n_units": 256,
"n_heads": 2
},
"uncond_masker": {
"prob_both_uncond": 0.04,
"prob_text_uncond": 0.01,
"std": 0.1,
"text_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256
},
"vector_field": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 512
},
"time_encoder": {
"time_dim": 64,
"hdim": 256
},
"main_blocks": {
"n_blocks": 4,
"time_cond_layer": {
"idim": 512,
"time_dim": 64
},
"style_cond_layer": {
"idim": 512,
"style_dim": 256
},
"text_cond_layer": {
"idim": 512,
"text_dim": 256,
"n_heads": 4,
"use_residual": true,
"rotary_base": 10000,
"rotary_scale": 10
},
"convnext_0": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
2,
4,
8
]
},
"convnext_1": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
},
"convnext_2": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
}
},
"last_convnext": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"proj_out": {
"idim": 512,
"chunk_compress_factor": 6,
"ldim": 24
}
}
},
"ae": {
"sample_rate": 44100,
"n_delay": 0,
"base_chunk_size": 512,
"chunk_compress_factor": 1,
"ldim": 24,
"encoder": {
"spec_processor": {
"n_fft": 2048,
"win_length": 2048,
"hop_length": 512,
"n_mels": 228,
"sample_rate": 44100,
"eps": 1e-05,
"norm_mean": 0.0,
"norm_std": 1.0
},
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 1253,
"hdim": 512,
"odim": 24
},
"decoder": {
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
2,
4,
1,
2,
4,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 24,
"hdim": 512,
"head": {
"idim": 512,
"hdim": 2048,
"odim": 512,
"ksz": 3
}
}
},
"dp": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"normalizer": {
"scale": 1.0
},
"sentence_encoder": {
"char_emb_dim": 64,
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 64,
"filter_channels": 256,
"n_heads": 2,
"n_layers": 2,
"p_dropout": 0.0
},
"proj_out": {
"idim": 64,
"odim": 64
}
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 64,
"n_style": 8,
"style_key_dim": 0,
"style_value_dim": 16,
"prototype_dim": 64,
"n_units": 64,
"n_heads": 2
}
},
"predictor": {
"sentence_dim": 64,
"n_style": 8,
"style_dim": 16,
"hdim": 128,
"n_layer": 2
}
}
}

View File

@@ -0,0 +1,223 @@
tts_version: "v1.5.0"
split: "opensource-en"
ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"
ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"
ttl:
latent_dim: 24
chunk_compress_factor: 6
batch_expander:
n_batch_expand: 6
normalizer:
scale: 0.25
text_encoder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 256
filter_channels: 1024
n_heads: 4
n_layers: 4
p_dropout: 0.0
proj_out:
idim: 256
odim: 256
flow_matching:
sig_min: 0
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
style_token_layer:
input_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
prototype_dim: 256
n_units: 256
n_heads: 2
speech_prompted_text_encoder:
text_dim: 256
style_dim: 256
n_units: 256
n_heads: 2
uncond_masker:
prob_both_uncond: 0.04
prob_text_uncond: 0.01
std: 0.1
text_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
vector_field:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 512
time_encoder:
time_dim: 64
hdim: 256
main_blocks:
n_blocks: 4
time_cond_layer:
idim: 512
time_dim: 64
style_cond_layer:
idim: 512
style_dim: 256
text_cond_layer:
idim: 512
text_dim: 256
n_heads: 4
use_residual: True
rotary_base: 10000
rotary_scale: 10
convnext_0:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 2, 4, 8]
convnext_1:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
convnext_2:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
last_convnext:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 1, 1, 1]
proj_out:
idim: 512
chunk_compress_factor: 6
ldim: 24
ae:
sample_rate: 44100
n_delay: 0
base_chunk_size: 512
chunk_compress_factor: 1
ldim: 24
encoder:
spec_processor:
n_fft: 2048
win_length: 2048
hop_length: 512
n_mels: 228
sample_rate: 44100
eps: 1e-05
norm_mean: 0.0
norm_std: 1.0
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 1253
hdim: 512
odim: 24
decoder:
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 24
hdim: 512
head:
idim: 512
hdim: 2048
odim: 512
ksz: 3
dp:
latent_dim: 24
chunk_compress_factor: 6
normalizer:
scale: 1.0
sentence_encoder:
char_emb_dim: 64
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 64
filter_channels: 256
n_heads: 2
n_layers: 2
p_dropout: 0.0
proj_out:
idim: 64
odim: 64
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 4
dilation_lst: [1, 1, 1, 1]
style_token_layer:
input_dim: 64
n_style: 8
style_key_dim: 0
style_value_dim: 16
prototype_dim: 64
n_units: 64
n_heads: 2
predictor:
sentence_dim: 64
n_style: 8
style_dim: 16
hdim: 128
n_layer: 2
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"

File diff suppressed because one or more lines are too long

Binary file not shown.