add supertonic
This commit is contained in:
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/duration_predictor.onnx
Normal file
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/duration_predictor.onnx
Normal file
Binary file not shown.
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/text_encoder.onnx
Normal file
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/text_encoder.onnx
Normal file
Binary file not shown.
316
Cs_HMI/SubProject/SuperTonic/assets/onnx/tts.json
Normal file
316
Cs_HMI/SubProject/SuperTonic/assets/onnx/tts.json
Normal file
@@ -0,0 +1,316 @@
|
||||
{
|
||||
"tts_version": "v1.5.0",
|
||||
"split": "opensource-en",
|
||||
"ttl_ckpt_path": "unknown.pt",
|
||||
"dp_ckpt_path": "unknown.pt",
|
||||
"ae_ckpt_path": "unknown.pt",
|
||||
"ttl_train": "unknown",
|
||||
"dp_train": "unknown",
|
||||
"ae_train": "unknown",
|
||||
"ttl": {
|
||||
"latent_dim": 24,
|
||||
"chunk_compress_factor": 6,
|
||||
"batch_expander": {
|
||||
"n_batch_expand": 6
|
||||
},
|
||||
"normalizer": {
|
||||
"scale": 0.25
|
||||
},
|
||||
"text_encoder": {
|
||||
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
|
||||
"text_embedder": {
|
||||
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
|
||||
"char_emb_dim": 256
|
||||
},
|
||||
"convnext": {
|
||||
"idim": 256,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 6,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
]
|
||||
},
|
||||
"attn_encoder": {
|
||||
"hidden_channels": 256,
|
||||
"filter_channels": 1024,
|
||||
"n_heads": 4,
|
||||
"n_layers": 4,
|
||||
"p_dropout": 0.0
|
||||
},
|
||||
"proj_out": {
|
||||
"idim": 256,
|
||||
"odim": 256
|
||||
}
|
||||
},
|
||||
"flow_matching": {
|
||||
"sig_min": 0
|
||||
},
|
||||
"style_encoder": {
|
||||
"proj_in": {
|
||||
"ldim": 24,
|
||||
"chunk_compress_factor": 6,
|
||||
"odim": 256
|
||||
},
|
||||
"convnext": {
|
||||
"idim": 256,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 6,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
]
|
||||
},
|
||||
"style_token_layer": {
|
||||
"input_dim": 256,
|
||||
"n_style": 50,
|
||||
"style_key_dim": 256,
|
||||
"style_value_dim": 256,
|
||||
"prototype_dim": 256,
|
||||
"n_units": 256,
|
||||
"n_heads": 2
|
||||
}
|
||||
},
|
||||
"speech_prompted_text_encoder": {
|
||||
"text_dim": 256,
|
||||
"style_dim": 256,
|
||||
"n_units": 256,
|
||||
"n_heads": 2
|
||||
},
|
||||
"uncond_masker": {
|
||||
"prob_both_uncond": 0.04,
|
||||
"prob_text_uncond": 0.01,
|
||||
"std": 0.1,
|
||||
"text_dim": 256,
|
||||
"n_style": 50,
|
||||
"style_key_dim": 256,
|
||||
"style_value_dim": 256
|
||||
},
|
||||
"vector_field": {
|
||||
"proj_in": {
|
||||
"ldim": 24,
|
||||
"chunk_compress_factor": 6,
|
||||
"odim": 512
|
||||
},
|
||||
"time_encoder": {
|
||||
"time_dim": 64,
|
||||
"hdim": 256
|
||||
},
|
||||
"main_blocks": {
|
||||
"n_blocks": 4,
|
||||
"time_cond_layer": {
|
||||
"idim": 512,
|
||||
"time_dim": 64
|
||||
},
|
||||
"style_cond_layer": {
|
||||
"idim": 512,
|
||||
"style_dim": 256
|
||||
},
|
||||
"text_cond_layer": {
|
||||
"idim": 512,
|
||||
"text_dim": 256,
|
||||
"n_heads": 4,
|
||||
"use_residual": true,
|
||||
"rotary_base": 10000,
|
||||
"rotary_scale": 10
|
||||
},
|
||||
"convnext_0": {
|
||||
"idim": 512,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 4,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
8
|
||||
]
|
||||
},
|
||||
"convnext_1": {
|
||||
"idim": 512,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 1,
|
||||
"dilation_lst": [
|
||||
1
|
||||
]
|
||||
},
|
||||
"convnext_2": {
|
||||
"idim": 512,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 1,
|
||||
"dilation_lst": [
|
||||
1
|
||||
]
|
||||
}
|
||||
},
|
||||
"last_convnext": {
|
||||
"idim": 512,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 1024,
|
||||
"num_layers": 4,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
]
|
||||
},
|
||||
"proj_out": {
|
||||
"idim": 512,
|
||||
"chunk_compress_factor": 6,
|
||||
"ldim": 24
|
||||
}
|
||||
}
|
||||
},
|
||||
"ae": {
|
||||
"sample_rate": 44100,
|
||||
"n_delay": 0,
|
||||
"base_chunk_size": 512,
|
||||
"chunk_compress_factor": 1,
|
||||
"ldim": 24,
|
||||
"encoder": {
|
||||
"spec_processor": {
|
||||
"n_fft": 2048,
|
||||
"win_length": 2048,
|
||||
"hop_length": 512,
|
||||
"n_mels": 228,
|
||||
"sample_rate": 44100,
|
||||
"eps": 1e-05,
|
||||
"norm_mean": 0.0,
|
||||
"norm_std": 1.0
|
||||
},
|
||||
"ksz_init": 7,
|
||||
"ksz": 7,
|
||||
"num_layers": 10,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
],
|
||||
"intermediate_dim": 2048,
|
||||
"idim": 1253,
|
||||
"hdim": 512,
|
||||
"odim": 24
|
||||
},
|
||||
"decoder": {
|
||||
"ksz_init": 7,
|
||||
"ksz": 7,
|
||||
"num_layers": 10,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
1,
|
||||
2,
|
||||
4,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
],
|
||||
"intermediate_dim": 2048,
|
||||
"idim": 24,
|
||||
"hdim": 512,
|
||||
"head": {
|
||||
"idim": 512,
|
||||
"hdim": 2048,
|
||||
"odim": 512,
|
||||
"ksz": 3
|
||||
}
|
||||
}
|
||||
},
|
||||
"dp": {
|
||||
"latent_dim": 24,
|
||||
"chunk_compress_factor": 6,
|
||||
"normalizer": {
|
||||
"scale": 1.0
|
||||
},
|
||||
"sentence_encoder": {
|
||||
"char_emb_dim": 64,
|
||||
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
|
||||
"text_embedder": {
|
||||
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
|
||||
"char_emb_dim": 64
|
||||
},
|
||||
"convnext": {
|
||||
"idim": 64,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 256,
|
||||
"num_layers": 6,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
]
|
||||
},
|
||||
"attn_encoder": {
|
||||
"hidden_channels": 64,
|
||||
"filter_channels": 256,
|
||||
"n_heads": 2,
|
||||
"n_layers": 2,
|
||||
"p_dropout": 0.0
|
||||
},
|
||||
"proj_out": {
|
||||
"idim": 64,
|
||||
"odim": 64
|
||||
}
|
||||
},
|
||||
"style_encoder": {
|
||||
"proj_in": {
|
||||
"ldim": 24,
|
||||
"chunk_compress_factor": 6,
|
||||
"odim": 64
|
||||
},
|
||||
"convnext": {
|
||||
"idim": 64,
|
||||
"ksz": 5,
|
||||
"intermediate_dim": 256,
|
||||
"num_layers": 4,
|
||||
"dilation_lst": [
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1
|
||||
]
|
||||
},
|
||||
"style_token_layer": {
|
||||
"input_dim": 64,
|
||||
"n_style": 8,
|
||||
"style_key_dim": 0,
|
||||
"style_value_dim": 16,
|
||||
"prototype_dim": 64,
|
||||
"n_units": 64,
|
||||
"n_heads": 2
|
||||
}
|
||||
},
|
||||
"predictor": {
|
||||
"sentence_dim": 64,
|
||||
"n_style": 8,
|
||||
"style_dim": 16,
|
||||
"hdim": 128,
|
||||
"n_layer": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
223
Cs_HMI/SubProject/SuperTonic/assets/onnx/tts.yml
Normal file
223
Cs_HMI/SubProject/SuperTonic/assets/onnx/tts.yml
Normal file
@@ -0,0 +1,223 @@
|
||||
tts_version: "v1.5.0"
|
||||
|
||||
split: "opensource-en"
|
||||
|
||||
ttl_ckpt_path: "unknown.pt"
|
||||
|
||||
dp_ckpt_path: "unknown.pt"
|
||||
|
||||
ae_ckpt_path: "unknown.pt"
|
||||
|
||||
ttl_train: "unknown"
|
||||
|
||||
dp_train: "unknown"
|
||||
|
||||
ae_train: "unknown"
|
||||
|
||||
ttl:
|
||||
latent_dim: 24
|
||||
chunk_compress_factor: 6
|
||||
batch_expander:
|
||||
n_batch_expand: 6
|
||||
normalizer:
|
||||
scale: 0.25
|
||||
text_encoder:
|
||||
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
|
||||
text_embedder:
|
||||
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
|
||||
char_emb_dim: 256
|
||||
convnext:
|
||||
idim: 256
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 6
|
||||
dilation_lst: [1, 1, 1, 1, 1, 1]
|
||||
attn_encoder:
|
||||
hidden_channels: 256
|
||||
filter_channels: 1024
|
||||
n_heads: 4
|
||||
n_layers: 4
|
||||
p_dropout: 0.0
|
||||
proj_out:
|
||||
idim: 256
|
||||
odim: 256
|
||||
flow_matching:
|
||||
sig_min: 0
|
||||
style_encoder:
|
||||
proj_in:
|
||||
ldim: 24
|
||||
chunk_compress_factor: 6
|
||||
odim: 256
|
||||
convnext:
|
||||
idim: 256
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 6
|
||||
dilation_lst: [1, 1, 1, 1, 1, 1]
|
||||
style_token_layer:
|
||||
input_dim: 256
|
||||
n_style: 50
|
||||
style_key_dim: 256
|
||||
style_value_dim: 256
|
||||
prototype_dim: 256
|
||||
n_units: 256
|
||||
n_heads: 2
|
||||
speech_prompted_text_encoder:
|
||||
text_dim: 256
|
||||
style_dim: 256
|
||||
n_units: 256
|
||||
n_heads: 2
|
||||
uncond_masker:
|
||||
prob_both_uncond: 0.04
|
||||
prob_text_uncond: 0.01
|
||||
std: 0.1
|
||||
text_dim: 256
|
||||
n_style: 50
|
||||
style_key_dim: 256
|
||||
style_value_dim: 256
|
||||
vector_field:
|
||||
proj_in:
|
||||
ldim: 24
|
||||
chunk_compress_factor: 6
|
||||
odim: 512
|
||||
time_encoder:
|
||||
time_dim: 64
|
||||
hdim: 256
|
||||
main_blocks:
|
||||
n_blocks: 4
|
||||
time_cond_layer:
|
||||
idim: 512
|
||||
time_dim: 64
|
||||
style_cond_layer:
|
||||
idim: 512
|
||||
style_dim: 256
|
||||
text_cond_layer:
|
||||
idim: 512
|
||||
text_dim: 256
|
||||
n_heads: 4
|
||||
use_residual: True
|
||||
rotary_base: 10000
|
||||
rotary_scale: 10
|
||||
convnext_0:
|
||||
idim: 512
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 4
|
||||
dilation_lst: [1, 2, 4, 8]
|
||||
convnext_1:
|
||||
idim: 512
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 1
|
||||
dilation_lst: [1]
|
||||
convnext_2:
|
||||
idim: 512
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 1
|
||||
dilation_lst: [1]
|
||||
last_convnext:
|
||||
idim: 512
|
||||
ksz: 5
|
||||
intermediate_dim: 1024
|
||||
num_layers: 4
|
||||
dilation_lst: [1, 1, 1, 1]
|
||||
proj_out:
|
||||
idim: 512
|
||||
chunk_compress_factor: 6
|
||||
ldim: 24
|
||||
|
||||
ae:
|
||||
sample_rate: 44100
|
||||
n_delay: 0
|
||||
base_chunk_size: 512
|
||||
chunk_compress_factor: 1
|
||||
ldim: 24
|
||||
encoder:
|
||||
spec_processor:
|
||||
n_fft: 2048
|
||||
win_length: 2048
|
||||
hop_length: 512
|
||||
n_mels: 228
|
||||
sample_rate: 44100
|
||||
eps: 1e-05
|
||||
norm_mean: 0.0
|
||||
norm_std: 1.0
|
||||
ksz_init: 7
|
||||
ksz: 7
|
||||
num_layers: 10
|
||||
dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
|
||||
intermediate_dim: 2048
|
||||
idim: 1253
|
||||
hdim: 512
|
||||
odim: 24
|
||||
decoder:
|
||||
ksz_init: 7
|
||||
ksz: 7
|
||||
num_layers: 10
|
||||
dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
|
||||
intermediate_dim: 2048
|
||||
idim: 24
|
||||
hdim: 512
|
||||
head:
|
||||
idim: 512
|
||||
hdim: 2048
|
||||
odim: 512
|
||||
ksz: 3
|
||||
|
||||
dp:
|
||||
latent_dim: 24
|
||||
chunk_compress_factor: 6
|
||||
normalizer:
|
||||
scale: 1.0
|
||||
sentence_encoder:
|
||||
char_emb_dim: 64
|
||||
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
|
||||
text_embedder:
|
||||
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
|
||||
char_emb_dim: 64
|
||||
convnext:
|
||||
idim: 64
|
||||
ksz: 5
|
||||
intermediate_dim: 256
|
||||
num_layers: 6
|
||||
dilation_lst: [1, 1, 1, 1, 1, 1]
|
||||
attn_encoder:
|
||||
hidden_channels: 64
|
||||
filter_channels: 256
|
||||
n_heads: 2
|
||||
n_layers: 2
|
||||
p_dropout: 0.0
|
||||
proj_out:
|
||||
idim: 64
|
||||
odim: 64
|
||||
style_encoder:
|
||||
proj_in:
|
||||
ldim: 24
|
||||
chunk_compress_factor: 6
|
||||
odim: 64
|
||||
convnext:
|
||||
idim: 64
|
||||
ksz: 5
|
||||
intermediate_dim: 256
|
||||
num_layers: 4
|
||||
dilation_lst: [1, 1, 1, 1]
|
||||
style_token_layer:
|
||||
input_dim: 64
|
||||
n_style: 8
|
||||
style_key_dim: 0
|
||||
style_value_dim: 16
|
||||
prototype_dim: 64
|
||||
n_units: 64
|
||||
n_heads: 2
|
||||
predictor:
|
||||
sentence_dim: 64
|
||||
n_style: 8
|
||||
style_dim: 16
|
||||
hdim: 128
|
||||
n_layer: 2
|
||||
|
||||
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
|
||||
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
|
||||
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
|
||||
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
|
||||
File diff suppressed because one or more lines are too long
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/vector_estimator.onnx
Normal file
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/vector_estimator.onnx
Normal file
Binary file not shown.
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/vocoder.onnx
Normal file
BIN
Cs_HMI/SubProject/SuperTonic/assets/onnx/vocoder.onnx
Normal file
Binary file not shown.
Reference in New Issue
Block a user