---
# Text-to-speech model configuration.
#
# Top-level layout:
#   - version / split / checkpoint provenance fields
#   - ttl: settings with text_encoder, style_encoder, flow_matching and
#          vector_field sub-sections
#   - ae:  settings with encoder / decoder sub-sections
#   - dp:  settings with sentence_encoder, style_encoder and predictor
#          sub-sections
#   - paths to exported resource files
#
# NOTE(review): the block structure below was reconstructed from key order
# and naming; confirm the nesting against the code that loads this file.

tts_version: "v1.5.0"
split: "opensource-en"

# Checkpoint / training-run provenance; all recorded as unknown here.
ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"
ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"

ttl:
  # latent_dim / chunk_compress_factor repeat the ldim / chunk_compress_factor
  # values used by the proj_in and proj_out entries further down.
  latent_dim: 24
  chunk_compress_factor: 6
  batch_expander:
    n_batch_expand: 6
  normalizer:
    scale: 0.25
  text_encoder:
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      # One dilation entry per layer (6 entries for num_layers: 6).
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 256
      filter_channels: 1024
      n_heads: 4
      n_layers: 4
      p_dropout: 0.0
    proj_out:
      idim: 256
      odim: 256
  flow_matching:
    sig_min: 0
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    style_token_layer:
      input_dim: 256
      n_style: 50
      style_key_dim: 256
      style_value_dim: 256
      prototype_dim: 256
      n_units: 256
      n_heads: 2
  speech_prompted_text_encoder:
    text_dim: 256
    style_dim: 256
    n_units: 256
    n_heads: 2
  uncond_masker:
    prob_both_uncond: 0.04
    prob_text_uncond: 0.01
    std: 0.1
    text_dim: 256
    # n_style / style_*_dim repeat the style_token_layer values above.
    n_style: 50
    style_key_dim: 256
    style_value_dim: 256
  vector_field:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 512
    time_encoder:
      time_dim: 64
      hdim: 256
    main_blocks:
      n_blocks: 4
      time_cond_layer:
        idim: 512
        time_dim: 64
      style_cond_layer:
        idim: 512
        style_dim: 256
      text_cond_layer:
        idim: 512
        text_dim: 256
        n_heads: 4
        # Canonical lowercase boolean (was `True`).
        use_residual: true
        rotary_base: 10000
        rotary_scale: 10
      convnext_0:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 4
        dilation_lst: [1, 2, 4, 8]
      convnext_1:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
      convnext_2:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
    last_convnext:
      idim: 512
      ksz: 5
      intermediate_dim: 1024
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    proj_out:
      idim: 512
      chunk_compress_factor: 6
      ldim: 24

ae:
  sample_rate: 44100
  n_delay: 0
  base_chunk_size: 512
  chunk_compress_factor: 1
  ldim: 24
  encoder:
    spec_processor:
      n_fft: 2048
      win_length: 2048
      hop_length: 512
      n_mels: 228
      sample_rate: 44100
      # Written with an explicit decimal point: YAML 1.1 resolvers (e.g.
      # PyYAML) load a bare `1e-05` as a string, not a float.
      eps: 1.0e-05
      norm_mean: 0.0
      norm_std: 1.0
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    intermediate_dim: 2048
    # NOTE(review): 1253 equals (n_fft / 2 + 1) + n_mels = 1025 + 228;
    # presumably the two feature sets are concatenated -- confirm.
    idim: 1253
    hdim: 512
    odim: 24
  decoder:
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 24
    hdim: 512
    head:
      idim: 512
      hdim: 2048
      # odim matches ae.base_chunk_size (512).
      odim: 512
      ksz: 3

dp:
  latent_dim: 24
  chunk_compress_factor: 6
  normalizer:
    scale: 1.0
  sentence_encoder:
    char_emb_dim: 64
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 64
      filter_channels: 256
      n_heads: 2
      n_layers: 2
      p_dropout: 0.0
    proj_out:
      idim: 64
      odim: 64
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    style_token_layer:
      input_dim: 64
      n_style: 8
      # NOTE(review): 0 here, whereas ttl.style_encoder uses 256 -- confirm
      # that a zero key dimension is intentional for this model.
      style_key_dim: 0
      style_value_dim: 16
      prototype_dim: 64
      n_units: 64
      n_heads: 2
  predictor:
    sentence_dim: 64
    n_style: 8
    style_dim: 16
    hdim: 128
    n_layer: 2

# Exported resource files.
# NOTE(review): these are absolute, machine-specific paths, unlike the
# relative char_dict_path entries above.
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"