add supertonic

This commit is contained in:
backuppc
2025-12-10 11:39:15 +09:00
parent 3695ab0044
commit 868fa2deec
22 changed files with 54223 additions and 1 deletions

View File

@@ -0,0 +1,316 @@
{
"tts_version": "v1.5.0",
"split": "opensource-en",
"ttl_ckpt_path": "unknown.pt",
"dp_ckpt_path": "unknown.pt",
"ae_ckpt_path": "unknown.pt",
"ttl_train": "unknown",
"dp_train": "unknown",
"ae_train": "unknown",
"ttl": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"batch_expander": {
"n_batch_expand": 6
},
"normalizer": {
"scale": 0.25
},
"text_encoder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 256,
"filter_channels": 1024,
"n_heads": 4,
"n_layers": 4,
"p_dropout": 0.0
},
"proj_out": {
"idim": 256,
"odim": 256
}
},
"flow_matching": {
"sig_min": 0
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256,
"prototype_dim": 256,
"n_units": 256,
"n_heads": 2
}
},
"speech_prompted_text_encoder": {
"text_dim": 256,
"style_dim": 256,
"n_units": 256,
"n_heads": 2
},
"uncond_masker": {
"prob_both_uncond": 0.04,
"prob_text_uncond": 0.01,
"std": 0.1,
"text_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256
},
"vector_field": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 512
},
"time_encoder": {
"time_dim": 64,
"hdim": 256
},
"main_blocks": {
"n_blocks": 4,
"time_cond_layer": {
"idim": 512,
"time_dim": 64
},
"style_cond_layer": {
"idim": 512,
"style_dim": 256
},
"text_cond_layer": {
"idim": 512,
"text_dim": 256,
"n_heads": 4,
"use_residual": true,
"rotary_base": 10000,
"rotary_scale": 10
},
"convnext_0": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
2,
4,
8
]
},
"convnext_1": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
},
"convnext_2": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
}
},
"last_convnext": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"proj_out": {
"idim": 512,
"chunk_compress_factor": 6,
"ldim": 24
}
}
},
"ae": {
"sample_rate": 44100,
"n_delay": 0,
"base_chunk_size": 512,
"chunk_compress_factor": 1,
"ldim": 24,
"encoder": {
"spec_processor": {
"n_fft": 2048,
"win_length": 2048,
"hop_length": 512,
"n_mels": 228,
"sample_rate": 44100,
"eps": 1e-05,
"norm_mean": 0.0,
"norm_std": 1.0
},
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 1253,
"hdim": 512,
"odim": 24
},
"decoder": {
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
2,
4,
1,
2,
4,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 24,
"hdim": 512,
"head": {
"idim": 512,
"hdim": 2048,
"odim": 512,
"ksz": 3
}
}
},
"dp": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"normalizer": {
"scale": 1.0
},
"sentence_encoder": {
"char_emb_dim": 64,
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 64,
"filter_channels": 256,
"n_heads": 2,
"n_layers": 2,
"p_dropout": 0.0
},
"proj_out": {
"idim": 64,
"odim": 64
}
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 64,
"n_style": 8,
"style_key_dim": 0,
"style_value_dim": 16,
"prototype_dim": 64,
"n_units": 64,
"n_heads": 2
}
},
"predictor": {
"sentence_dim": 64,
"n_style": 8,
"style_dim": 16,
"hdim": 128,
"n_layer": 2
}
}
}

View File

@@ -0,0 +1,223 @@
tts_version: "v1.5.0"
split: "opensource-en"
ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"
ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"
ttl:
latent_dim: 24
chunk_compress_factor: 6
batch_expander:
n_batch_expand: 6
normalizer:
scale: 0.25
text_encoder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 256
filter_channels: 1024
n_heads: 4
n_layers: 4
p_dropout: 0.0
proj_out:
idim: 256
odim: 256
flow_matching:
sig_min: 0
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
style_token_layer:
input_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
prototype_dim: 256
n_units: 256
n_heads: 2
speech_prompted_text_encoder:
text_dim: 256
style_dim: 256
n_units: 256
n_heads: 2
uncond_masker:
prob_both_uncond: 0.04
prob_text_uncond: 0.01
std: 0.1
text_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
vector_field:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 512
time_encoder:
time_dim: 64
hdim: 256
main_blocks:
n_blocks: 4
time_cond_layer:
idim: 512
time_dim: 64
style_cond_layer:
idim: 512
style_dim: 256
text_cond_layer:
idim: 512
text_dim: 256
n_heads: 4
use_residual: True
rotary_base: 10000
rotary_scale: 10
convnext_0:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 2, 4, 8]
convnext_1:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
convnext_2:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
last_convnext:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 1, 1, 1]
proj_out:
idim: 512
chunk_compress_factor: 6
ldim: 24
ae:
sample_rate: 44100
n_delay: 0
base_chunk_size: 512
chunk_compress_factor: 1
ldim: 24
encoder:
spec_processor:
n_fft: 2048
win_length: 2048
hop_length: 512
n_mels: 228
sample_rate: 44100
eps: 1.0e-05
norm_mean: 0.0
norm_std: 1.0
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 1253
hdim: 512
odim: 24
decoder:
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 24
hdim: 512
head:
idim: 512
hdim: 2048
odim: 512
ksz: 3
dp:
latent_dim: 24
chunk_compress_factor: 6
normalizer:
scale: 1.0
sentence_encoder:
char_emb_dim: 64
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 64
filter_channels: 256
n_heads: 2
n_layers: 2
p_dropout: 0.0
proj_out:
idim: 64
odim: 64
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 4
dilation_lst: [1, 1, 1, 1]
style_token_layer:
input_dim: 64
n_style: 8
style_key_dim: 0
style_value_dim: 16
prototype_dim: 64
n_units: 64
n_heads: 2
predictor:
sentence_dim: 64
n_style: 8
style_dim: 16
hdim: 128
n_layer: 2
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff