# Version tag of this TTS configuration. The same string appears as a path
# component in the asset paths at the end of this file.
tts_version: "v1.5.0"

# Split / variant name. It also appears as a path component in the
# char_dict_path entries and in the asset paths at the end of this file.
split: "opensource-en"

# Checkpoint paths, one per model section below (ttl, dp, ae).
# NOTE(review): "unknown.pt" looks like a placeholder — presumably the real
# checkpoint locations were not recorded; confirm before loading weights.
ttl_ckpt_path: "unknown.pt"

dp_ckpt_path: "unknown.pt"

ae_ckpt_path: "unknown.pt"

# Training-run identifiers, one per model section below (ttl, dp, ae).
# NOTE(review): "unknown" looks like a placeholder — confirm.
ttl_train: "unknown"

dp_train: "unknown"

ae_train: "unknown"

# Hyperparameters for the "ttl" model (presumably text-to-latent — confirm).
ttl:
    # Latent width and chunk compression factor. The proj_in / proj_out
    # entries further down repeat the same two values (24 and 6).
    latent_dim: 24
    chunk_compress_factor: 6
    batch_expander:
        n_batch_expand: 6
    normalizer:
        scale: 0.25
    # Text branch: every stage below is 256 wide (char_emb_dim, idim,
    # hidden_channels, proj_out idim/odim).
    text_encoder:
        # Same dictionary file is referenced here and in text_embedder.
        char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
        text_embedder:
            char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
            char_emb_dim: 256
        convnext:
            idim: 256
            ksz: 5
            intermediate_dim: 1024
            num_layers: 6
            # One entry per layer: list length equals num_layers.
            dilation_lst: [1, 1, 1, 1, 1, 1]
        attn_encoder:
            hidden_channels: 256
            filter_channels: 1024
            n_heads: 4
            n_layers: 4
            p_dropout: 0.0
        proj_out:
            idim: 256
            odim: 256
    flow_matching:
        sig_min: 0
    # Style branch: projects the latent (ldim 24, chunk factor 6) to 256.
    style_encoder:
        proj_in:
            ldim: 24
            chunk_compress_factor: 6
            odim: 256
        convnext:
            idim: 256
            ksz: 5
            intermediate_dim: 1024
            num_layers: 6
            # One entry per layer: list length equals num_layers.
            dilation_lst: [1, 1, 1, 1, 1, 1]
        style_token_layer:
            input_dim: 256
            # n_style / style_key_dim / style_value_dim are repeated with the
            # same values under uncond_masker below.
            n_style: 50
            style_key_dim: 256
            style_value_dim: 256
            prototype_dim: 256
            n_units: 256
            n_heads: 2
    speech_prompted_text_encoder:
        text_dim: 256
        style_dim: 256
        n_units: 256
        n_heads: 2
    uncond_masker:
        prob_both_uncond: 0.04
        prob_text_uncond: 0.01
        std: 0.1
        # NOTE(review): these four repeat values set in text_encoder and
        # style_token_layer above; presumably they must stay in sync — confirm.
        text_dim: 256
        n_style: 50
        style_key_dim: 256
        style_value_dim: 256
    # Vector-field network: 512 wide internally (proj_in odim, every idim
    # below), conditioned on time (64), style (256) and text (256).
    vector_field:
        proj_in:
            ldim: 24
            chunk_compress_factor: 6
            odim: 512
        time_encoder:
            time_dim: 64
            hdim: 256
        main_blocks:
            n_blocks: 4
            time_cond_layer:
                idim: 512
                time_dim: 64
            style_cond_layer:
                idim: 512
                style_dim: 256
            text_cond_layer:
                idim: 512
                text_dim: 256
                n_heads: 4
                # Lowercase `true` is the canonical YAML boolean spelling.
                use_residual: true
                rotary_base: 10000
                rotary_scale: 10
            # Three convnext stacks per main block; only convnext_0 uses
            # growing dilations.
            convnext_0:
                idim: 512
                ksz: 5
                intermediate_dim: 1024
                num_layers: 4
                dilation_lst: [1, 2, 4, 8]
            convnext_1:
                idim: 512
                ksz: 5
                intermediate_dim: 1024
                num_layers: 1
                dilation_lst: [1]
            convnext_2:
                idim: 512
                ksz: 5
                intermediate_dim: 1024
                num_layers: 1
                dilation_lst: [1]
        last_convnext:
            idim: 512
            ksz: 5
            intermediate_dim: 1024
            num_layers: 4
            dilation_lst: [1, 1, 1, 1]
        # Mirrors proj_in: 512 back to ldim 24 with chunk factor 6.
        proj_out:
            idim: 512
            chunk_compress_factor: 6
            ldim: 24

# Hyperparameters for the "ae" model (presumably the audio autoencoder that
# maps between waveform/spectrogram and the 24-dim latent — confirm).
ae:
    sample_rate: 44100
    n_delay: 0
    # Same value as spec_processor.hop_length below.
    base_chunk_size: 512
    chunk_compress_factor: 1
    # Same value as encoder.odim and decoder.idim below.
    ldim: 24
    encoder:
        spec_processor:
            n_fft: 2048
            win_length: 2048
            hop_length: 512
            n_mels: 228
            sample_rate: 44100
            # Written with an explicit mantissa dot so that YAML 1.1 loaders
            # (e.g. PyYAML) resolve it as a float; a bare `1e-05` is read as
            # a string by those loaders.
            eps: 1.0e-05
            norm_mean: 0.0
            norm_std: 1.0
        ksz_init: 7
        ksz: 7
        num_layers: 10
        # One entry per layer: list length equals num_layers.
        dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        intermediate_dim: 2048
        # NOTE(review): 1253 = 1025 + 228, i.e. (n_fft / 2 + 1) + n_mels;
        # presumably linear and mel features are concatenated — confirm.
        idim: 1253
        hdim: 512
        odim: 24
    decoder:
        ksz_init: 7
        ksz: 7
        num_layers: 10
        # One entry per layer: list length equals num_layers.
        dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
        intermediate_dim: 2048
        idim: 24
        hdim: 512
        head:
            idim: 512
            hdim: 2048
            # Same value as base_chunk_size above.
            odim: 512
            ksz: 3

# Hyperparameters for the "dp" model (presumably the duration predictor —
# confirm). Same latent settings as ttl (24 / 6) but much narrower layers.
dp:
    latent_dim: 24
    chunk_compress_factor: 6
    normalizer:
        scale: 1.0
    # Text branch: every stage below is 64 wide.
    sentence_encoder:
        char_emb_dim: 64
        # Same dictionary file is referenced here and in text_embedder.
        char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
        text_embedder:
            char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
            char_emb_dim: 64
        convnext:
            idim: 64
            ksz: 5
            intermediate_dim: 256
            num_layers: 6
            # One entry per layer: list length equals num_layers.
            dilation_lst: [1, 1, 1, 1, 1, 1]
        attn_encoder:
            hidden_channels: 64
            filter_channels: 256
            n_heads: 2
            n_layers: 2
            p_dropout: 0.0
        proj_out:
            idim: 64
            odim: 64
    # Style branch: projects the latent (ldim 24, chunk factor 6) to 64.
    style_encoder:
        proj_in:
            ldim: 24
            chunk_compress_factor: 6
            odim: 64
        convnext:
            idim: 64
            ksz: 5
            intermediate_dim: 256
            num_layers: 4
            # One entry per layer: list length equals num_layers.
            dilation_lst: [1, 1, 1, 1]
        style_token_layer:
            input_dim: 64
            n_style: 8
            # NOTE(review): 0 here, whereas ttl.style_encoder.style_token_layer
            # uses 256 — confirm that a zero key dimension is intentional.
            style_key_dim: 0
            style_value_dim: 16
            prototype_dim: 64
            n_units: 64
            n_heads: 2
    predictor:
        # sentence_dim equals sentence_encoder.proj_out.odim (64); n_style and
        # style_dim equal style_token_layer n_style (8) and style_value_dim (16).
        sentence_dim: 64
        n_style: 8
        style_dim: 16
        hdim: 128
        n_layer: 2
        n_layer: 2

# Asset files located in an "onnx" directory; each path embeds tts_version
# ("v1.5.0") and split ("opensource-en") as path components.
# NOTE(review): these are absolute, machine-specific paths — presumably they
# must be adjusted when the config is used on another host; confirm.
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"