initial commit

This commit is contained in:
2026-01-25 18:58:40 +09:00
commit 77af47274c
101 changed files with 16247 additions and 0 deletions

122
cpp/CMakeLists.txt Normal file
View File

@@ -0,0 +1,122 @@
cmake_minimum_required(VERSION 3.15)
# Only C++ sources exist (helper.cpp, example_onnx.cpp), so enable CXX alone.
project(Supertonic_CPP LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Default to an optimized build, but only for single-config generators:
# CMAKE_BUILD_TYPE is ignored by multi-config generators (VS, Xcode,
# Ninja Multi-Config), so forcing it there would be misleading.
get_property(_supertonic_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _supertonic_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Aggressive Release-only optimization flags, gated to compilers that accept
# them (-O3/-ffast-math are not MSVC flags). -DNDEBUG is already part of
# CMake's default Release flags, so it is not repeated here.
# NOTE(review): -ffast-math relaxes IEEE float semantics; confirm the ONNX
# pre/post-processing in helper.cpp tolerates it.
set(SUPERTONIC_OPT_FLAGS
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:GNU,Clang,AppleClang>>:-O3;-ffast-math>"
)

# Find required packages
find_package(PkgConfig REQUIRED)
find_package(OpenMP)

# ONNX Runtime - try multiple discovery methods.
# Method 1: CMake package config (provides the onnxruntime::onnxruntime target)
find_package(onnxruntime QUIET CONFIG)
if(NOT onnxruntime_FOUND)
  # Method 2: pkg-config
  pkg_check_modules(ONNXRUNTIME QUIET libonnxruntime)
  if(ONNXRUNTIME_FOUND)
    set(ONNXRUNTIME_INCLUDE_DIR ${ONNXRUNTIME_INCLUDE_DIRS})
    set(ONNXRUNTIME_LIB ${ONNXRUNTIME_LIBRARIES})
  else()
    # Method 3: manual search in common install locations
    find_path(ONNXRUNTIME_INCLUDE_DIR
      NAMES onnxruntime_cxx_api.h
      PATHS
        /usr/local/include
        /opt/homebrew/include
        /usr/include
        ${CMAKE_PREFIX_PATH}/include
      PATH_SUFFIXES onnxruntime
    )
    find_library(ONNXRUNTIME_LIB
      NAMES onnxruntime libonnxruntime
      PATHS
        /usr/local/lib
        /opt/homebrew/lib
        /usr/lib
        ${CMAKE_PREFIX_PATH}/lib
    )
  endif()
  if(NOT ONNXRUNTIME_INCLUDE_DIR OR NOT ONNXRUNTIME_LIB)
    message(FATAL_ERROR "ONNX Runtime not found. Please install it:\n"
                        "  macOS: brew install onnxruntime\n"
                        "  Ubuntu: See README.md for installation instructions")
  endif()
  message(STATUS "Found ONNX Runtime:")
  message(STATUS "  Include: ${ONNXRUNTIME_INCLUDE_DIR}")
  message(STATUS "  Library: ${ONNXRUNTIME_LIB}")
endif()

# nlohmann/json
find_package(nlohmann_json REQUIRED)

# Helper library.  ONNX Runtime and nlohmann/json are PUBLIC requirements
# because helper.h includes <onnxruntime_cxx_api.h>, so consumers need the
# headers (and library) too.
add_library(tts_helper STATIC
  helper.cpp
  helper.h
)
target_compile_options(tts_helper PRIVATE ${SUPERTONIC_OPT_FLAGS})
if(onnxruntime_FOUND)
  target_link_libraries(tts_helper
    PUBLIC
      onnxruntime::onnxruntime
      nlohmann_json::nlohmann_json
  )
else()
  target_include_directories(tts_helper PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
  target_link_libraries(tts_helper
    PUBLIC
      ${ONNXRUNTIME_LIB}
      nlohmann_json::nlohmann_json
  )
endif()

# Enable OpenMP if available (used inside helper.cpp only, hence PRIVATE)
if(OpenMP_CXX_FOUND)
  target_link_libraries(tts_helper PRIVATE OpenMP::OpenMP_CXX)
  message(STATUS "OpenMP enabled for parallel processing")
else()
  message(WARNING "OpenMP not found - parallel processing will be disabled")
endif()

# Example executable.  tts_helper's PUBLIC usage requirements transitively
# provide the ONNX Runtime and nlohmann/json includes and libraries.
add_executable(example_onnx
  example_onnx.cpp
)
target_compile_options(example_onnx PRIVATE ${SUPERTONIC_OPT_FLAGS})
target_link_libraries(example_onnx PRIVATE tts_helper)

# Installation
install(TARGETS example_onnx DESTINATION bin)
install(TARGETS tts_helper DESTINATION lib)
install(FILES helper.h DESTINATION include)

139
cpp/README.md Normal file
View File

@@ -0,0 +1,139 @@
# Supertonic C++ Implementation
High-performance text-to-speech inference using ONNX Runtime.
## 📰 Update News
**2026.01.06** - 🎉 **Supertonic 2** released with multilingual support! Now supports English (`en`), Korean (`ko`), Spanish (`es`), Portuguese (`pt`), and French (`fr`). [Demo](https://huggingface.co/spaces/Supertone/supertonic-2) | [Models](https://huggingface.co/Supertone/supertonic-2)
**2025.12.10** - Added [6 new voice styles](https://huggingface.co/Supertone/supertonic/tree/b10dbaf18b316159be75b34d24f740008fddd381) (M3, M4, M5, F3, F4, F5). See [Voices](https://supertone-inc.github.io/supertonic-py/voices/) for details
**2025.12.08** - Optimized ONNX models via [OnnxSlim](https://github.com/inisis/OnnxSlim) now available on [Hugging Face Models](https://huggingface.co/Supertone/supertonic)
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
- C++17 compiler, CMake 3.15+
- Libraries: ONNX Runtime, nlohmann/json
## Installation
**Ubuntu/Debian:**
> ⚠️ **Note:** Installation instructions not yet verified.
```bash
sudo apt-get install -y cmake g++ nlohmann-json3-dev
wget https://github.com/microsoft/onnxruntime/releases/download/v1.16.3/onnxruntime-linux-x64-1.16.3.tgz
tar -xzf onnxruntime-linux-x64-1.16.3.tgz
sudo cp -r onnxruntime-linux-x64-1.16.3/include/* /usr/local/include/
sudo cp -r onnxruntime-linux-x64-1.16.3/lib/* /usr/local/lib/
sudo ldconfig
```
**macOS:**
```bash
brew install cmake nlohmann-json onnxruntime
```
**Windows (vcpkg):**
> ⚠️ **Note:** Installation instructions not yet verified.
```powershell
vcpkg install nlohmann-json:x64-windows onnxruntime:x64-windows
vcpkg integrate install
```
## Building
```bash
cd cpp && mkdir build && cd build
cmake .. && cmake --build . --config Release
./example_onnx
```
## Basic Usage
### Example 1: Default Inference
Run inference with default settings:
```bash
./example_onnx
```
This will use:
- Voice style: `../assets/voice_styles/M1.json`
- Text: "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
- Output directory: `results/`
- Total steps: 5
- Number of generations: 4
### Example 2: Batch Inference
Process multiple voice styles and texts at once:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json,../assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|오늘 아침에 공원을 산책했는데, 새소리와 바람 소리가 너무 좋아서 한참을 멈춰 서서 들었어요." \
--lang en,ko \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first English text
- Use female voice style (F1.json) for the second Korean text
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
```bash
./example_onnx \
--total-step 10 \
--voice-style ../assets/voice_styles/M1.json \
--text "Increasing the number of denoising steps improves the output's fidelity and overall quality."
```
This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `--onnx-dir` | str | `../assets/onnx` | Path to ONNX model directory |
| `--total-step` | int | 5 | Number of denoising steps (higher = better quality, slower) |
| `--speed` | float | 1.05 | Speech speed factor (higher = faster, lower = slower) |
| `--n-test` | int | 4 | Number of times to generate each sample |
| `--voice-style` | str | `../assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated for batch) |
| `--text` | str | (long default text) | Text(s) to synthesize (pipe-separated for batch) |
| `--lang` | str | `en` | Language(s) for text(s): `en`, `ko`, `es`, `pt`, `fr` (comma-separated for batch) |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Multilingual Support**: Use `--lang` to specify language(s). Available: `en` (English), `ko` (Korean), `es` (Spanish), `pt` (Portuguese), `fr` (French)
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer

121
cpp/example_onnx.cpp Normal file
View File

@@ -0,0 +1,121 @@
#include "helper.h"
#include <iostream>
#include <filesystem>
#include <algorithm>
#include <string>
#include <vector>
namespace fs = std::filesystem;
// Command-line options with their defaults (see README "Available Arguments").
struct Args {
    std::string onnx_dir = "../assets/onnx";  // --onnx-dir: ONNX model directory
    int total_step = 5;                       // --total-step: denoising steps (higher = better, slower)
    float speed = 1.05f;                      // --speed: speech speed factor
    int n_test = 4;                           // --n-test: generations per sample
    // --voice-style: one style file per text (comma-separated on the CLI)
    std::vector<std::string> voice_style = {"../assets/voice_styles/M1.json"};
    // --text: text(s) to synthesize (pipe-separated on the CLI)
    std::vector<std::string> text = {
        "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    };
    // --lang: language code(s), comma-separated: en, ko, es, pt, fr
    std::vector<std::string> lang = {"en"};
    std::string save_dir = "results";         // --save-dir: output directory
    bool batch = false;                       // --batch: batch mode (disables auto chunking)
};
// Split `str` on every occurrence of `delim`.
// Always yields at least one element; empty segments — including a trailing
// one — are preserved, e.g. "a,," -> {"a", "", ""}.
auto splitString = [](const std::string& str, char delim) {
    std::vector<std::string> pieces;
    for (size_t begin = 0;;) {
        const size_t end = str.find(delim, begin);
        if (end == std::string::npos) {
            // Last (or only) segment: everything after the final delimiter.
            pieces.push_back(str.substr(begin));
            return pieces;
        }
        pieces.push_back(str.substr(begin, end - begin));
        begin = end + 1;
    }
};
// Parse command-line flags into an Args struct.
// Unknown flags, and value-taking flags that appear last with no value,
// are silently ignored, leaving the corresponding defaults in place.
Args parseArgs(int argc, char* argv[]) {
    Args args;
    int i = 1;
    // Consume the next token as the current flag's value.
    auto next = [&]() -> const char* { return argv[++i]; };
    for (; i < argc; ++i) {
        const std::string flag = argv[i];
        if (flag == "--batch") {
            args.batch = true;
        } else if (i + 1 >= argc) {
            continue;  // value-taking flag with nothing after it
        } else if (flag == "--onnx-dir") {
            args.onnx_dir = next();
        } else if (flag == "--total-step") {
            args.total_step = std::stoi(next());
        } else if (flag == "--speed") {
            args.speed = std::stof(next());
        } else if (flag == "--n-test") {
            args.n_test = std::stoi(next());
        } else if (flag == "--voice-style") {
            args.voice_style = splitString(next(), ',');
        } else if (flag == "--text") {
            args.text = splitString(next(), '|');
        } else if (flag == "--lang") {
            args.lang = splitString(next(), ',');
        } else if (flag == "--save-dir") {
            args.save_dir = next();
        }
    }
    return args;
}
/**
 * TTS inference driver: parses CLI arguments, loads the ONNX models and
 * voice style(s), then runs `--n-test` rounds of synthesis and writes one
 * WAV file per input text into `--save-dir`.
 *
 * Returns 0 on success, 1 on invalid argument combinations.
 */
int main(int argc, char* argv[]) {
    std::cout << "=== TTS Inference with ONNX Runtime (C++) ===\n\n";
    // --- 1. Parse arguments --- //
    Args args = parseArgs(argc, argv);
    int total_step = args.total_step;
    float speed = args.speed;
    int n_test = args.n_test;
    std::string save_dir = args.save_dir;
    std::vector<std::string> voice_style_paths = args.voice_style;
    std::vector<std::string> text_list = args.text;
    std::vector<std::string> lang_list = args.lang;
    bool batch = args.batch;
    // Each text needs a matching voice style.
    if (voice_style_paths.size() != text_list.size()) {
        std::cerr << "Error: Number of voice styles (" << voice_style_paths.size()
                  << ") must match number of texts (" << text_list.size() << ")\n";
        return 1;
    }
    // In batch mode each text also needs a matching language; without this
    // check a short --lang list would be indexed out of bounds in batch().
    if (batch && lang_list.size() != text_list.size()) {
        std::cerr << "Error: Number of languages (" << lang_list.size()
                  << ") must match number of texts (" << text_list.size() << ")\n";
        return 1;
    }
    int bsz = voice_style_paths.size();
    // --- 2. Load Text to Speech --- //
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "TTS");
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault
    );
    auto text_to_speech = loadTextToSpeech(env, args.onnx_dir, false);
    std::cout << std::endl;
    // --- 3. Load Voice Style --- //
    auto style = loadVoiceStyle(voice_style_paths, true);
    // --- 4. Synthesize speech --- //
    fs::create_directories(save_dir);
    for (int n = 0; n < n_test; n++) {
        std::cout << "\n[" << (n + 1) << "/" << n_test << "] Starting synthesis...\n";
        auto result = timer("Generating speech from text", [&]() {
            if (batch) {
                return text_to_speech->batch(memory_info, text_list, lang_list, style, total_step, speed);
            } else {
                // NOTE(review): non-batch mode synthesizes only text_list[0],
                // yet the save loop below slices `bsz` segments — confirm the
                // intent when several styles are given without --batch.
                return text_to_speech->call(memory_info, text_list[0], lang_list[0], style, total_step, speed);
            }
        });
        int sample_rate = text_to_speech->getSampleRate();
        // result.wav holds bsz equal-length segments laid out back to back.
        int wav_shape_1 = result.wav.size() / bsz;
        for (int b = 0; b < bsz; b++) {
            std::string fname = sanitizeFilename(text_list[b], 20) + "_" + std::to_string(n + 1) + ".wav";
            // Keep only the first duration[b] seconds of segment b
            // (the rest of the segment is padding).
            int wav_len = static_cast<int>(sample_rate * result.duration[b]);
            std::vector<float> wav_out(
                result.wav.begin() + b * wav_shape_1,
                result.wav.begin() + b * wav_shape_1 + wav_len
            );
            std::string output_path = save_dir + "/" + fname;
            writeWavFile(output_path, wav_out, sample_rate);
            std::cout << "Saved: " << output_path << "\n";
        }
        clearTensorBuffers();
    }
    std::cout << "\n=== Synthesis completed successfully! ===\n";
    return 0;
}

1186
cpp/helper.cpp Normal file

File diff suppressed because it is too large Load Diff

229
cpp/helper.h Normal file
View File

@@ -0,0 +1,229 @@
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <onnxruntime_cxx_api.h>
// Available languages for multilingual TTS
extern const std::vector<std::string> AVAILABLE_LANGS;
/**
 * Model configuration loaded from the ONNX asset directory (see loadCfgs).
 */
struct Config {
    // "ae" section — audio/waveform side settings.
    struct AEConfig {
        int sample_rate;      // output audio sample rate in Hz (exposed via TextToSpeech::getSampleRate)
        int base_chunk_size;  // base wav chunk size, used with chunk_compress_factor for latent masks
    } ae;
    // "ttl" section — text-to-latent side settings.
    struct TTLConfig {
        int chunk_compress_factor;  // wav-chunk -> latent-frame compression factor
        int latent_dim;             // latent feature dimensionality
    } ttl;
};
/**
 * Unicode text processor.
 *
 * Converts raw input text into model token IDs using a unicode-to-index
 * table loaded from JSON.
 */
class UnicodeProcessor {
public:
    // `unicode_indexer_json_path`: JSON file holding the unicode-to-token-id table.
    explicit UnicodeProcessor(const std::string& unicode_indexer_json_path);
    // Process a batch of texts (paired 1:1 with language tags) into token
    // IDs and a float mask; `text_ids` and `text_mask` are output parameters.
    void call(
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        std::vector<std::vector<int64_t>>& text_ids,
        std::vector<std::vector<std::vector<float>>>& text_mask
    );
private:
    // Lookup table indexed by unicode value (loaded in the constructor).
    std::vector<int64_t> indexer_;
    // Language-aware text normalization applied before indexing.
    std::string preprocessText(const std::string& text, const std::string& lang);
    // Map text to a sequence of 16-bit unicode values.
    // NOTE(review): uint16_t elements suggest UTF-16 code units, so code
    // points above the BMP presumably become surrogate pairs — confirm in helper.cpp.
    std::vector<uint16_t> textToUnicodeValues(const std::string& text);
    // Build the float mask matching the per-text ID lengths.
    std::vector<std::vector<std::vector<float>>> getTextMask(
        const std::vector<int64_t>& text_ids_lengths
    );
};
/**
 * Voice style embeddings loaded from JSON (see loadVoiceStyle): one flat
 * float tensor for the text-to-latent model ("ttl") and one for the "dp"
 * model, each paired with its dimensions.
 */
class Style {
public:
    Style(const std::vector<float>& ttl_data, const std::vector<int64_t>& ttl_shape,
          const std::vector<float>& dp_data, const std::vector<int64_t>& dp_shape);
    const std::vector<float>& getTtlData() const { return ttl_data_; }
    const std::vector<float>& getDpData() const { return dp_data_; }
    const std::vector<int64_t>& getTtlShape() const { return ttl_shape_; }
    const std::vector<int64_t>& getDpShape() const { return dp_shape_; }
private:
    std::vector<float> ttl_data_;     // flattened ttl style tensor
    std::vector<float> dp_data_;      // flattened dp style tensor
    std::vector<int64_t> ttl_shape_;  // dims of ttl_data_
    std::vector<int64_t> dp_shape_;   // dims of dp_data_
};
/**
 * TextToSpeech: end-to-end synthesis over four ONNX sessions — "dp"
 * (presumably duration prediction, matching Style's dp tensors — confirm in
 * helper.cpp), text encoder, vector estimator, and vocoder.
 * Stores raw pointers only; the sessions and text processor are owned
 * elsewhere and must outlive this object.
 */
class TextToSpeech {
public:
    TextToSpeech(
        const Config& cfgs,
        UnicodeProcessor* text_processor,
        Ort::Session* dp_ort,
        Ort::Session* text_enc_ort,
        Ort::Session* vector_est_ort,
        Ort::Session* vocoder_ort
    );
    // Synthesis output: float samples plus per-item durations in seconds
    // (callers derive sample counts as sample_rate * duration[i]).
    struct SynthesisResult {
        std::vector<float> wav;
        std::vector<float> duration;
    };
    // Synthesize a single (possibly long) text. `silence_duration` is the
    // pause, in seconds, inserted between automatically chunked segments.
    SynthesisResult call(
        Ort::MemoryInfo& memory_info,
        const std::string& text,
        const std::string& lang,
        const Style& style,
        int total_step,
        float speed = 1.05f,
        float silence_duration = 0.3f
    );
    // Synthesize several texts in one batch (no automatic chunking).
    // `text_list` and `lang_list` are paired 1:1.
    SynthesisResult batch(
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        const Style& style,
        int total_step,
        float speed = 1.05f
    );
    int getSampleRate() const { return sample_rate_; }
private:
    // Shared inference path behind both call() and batch().
    SynthesisResult _infer(
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        const Style& style,
        int total_step,
        float speed = 1.05f
    );
    Config cfgs_;
    UnicodeProcessor* text_processor_;  // not owned
    Ort::Session* dp_ort_;              // not owned
    Ort::Session* text_enc_ort_;        // not owned
    Ort::Session* vector_est_ort_;      // not owned
    Ort::Session* vocoder_ort_;         // not owned
    int sample_rate_;            // presumably cached from cfgs_.ae.sample_rate — confirm in helper.cpp
    int base_chunk_size_;
    int chunk_compress_factor_;
    int ldim_;                   // latent dimension
    // Produce initial noisy latents and their mask, sized from `duration`;
    // both trailing arguments are output parameters.
    void sampleNoisyLatent(
        const std::vector<float>& duration,
        std::vector<std::vector<std::vector<float>>>& noisy_latent,
        std::vector<std::vector<std::vector<float>>>& latent_mask
    );
};
// --- Utility functions ---

// Convert per-item lengths into a float mask.
// NOTE(review): max_len = -1 presumably means "use the largest length" — confirm in helper.cpp.
std::vector<std::vector<std::vector<float>>> lengthToMask(
    const std::vector<int64_t>& lengths, int max_len = -1
);
// Build the latent-frame mask corresponding to waveform lengths.
std::vector<std::vector<std::vector<float>>> getLatentMask(
    const std::vector<int64_t>& wav_lengths,
    int base_chunk_size,
    int chunk_compress_factor
);

// --- ONNX model loading ---

// Owning bundle of the four model sessions consumed by TextToSpeech.
struct OnnxModels {
    std::unique_ptr<Ort::Session> dp;
    std::unique_ptr<Ort::Session> text_enc;
    std::unique_ptr<Ort::Session> vector_est;
    std::unique_ptr<Ort::Session> vocoder;
};
// Load a single ONNX model from `onnx_path`.
std::unique_ptr<Ort::Session> loadOnnx(
    Ort::Env& env,
    const std::string& onnx_path,
    const Ort::SessionOptions& opts
);
// Load all four models from `onnx_dir`.
OnnxModels loadOnnxAll(
    Ort::Env& env,
    const std::string& onnx_dir,
    const Ort::SessionOptions& opts
);

// --- Configuration and processor loading ---

// Read the Config (ae/ttl sections) shipped alongside the models in `onnx_dir`.
Config loadCfgs(const std::string& onnx_dir);
// Build the UnicodeProcessor from the indexer JSON in `onnx_dir`.
std::unique_ptr<UnicodeProcessor> loadTextProcessor(const std::string& onnx_dir);

// --- Voice style loading ---

// Load voice style JSON file(s) into a single Style; `verbose` controls logging.
Style loadVoiceStyle(const std::vector<std::string>& voice_style_paths, bool verbose = false);

// --- TextToSpeech loading ---

// Convenience factory: loads config, text processor, and all ONNX sessions.
// NOTE(review): TextToSpeech holds raw session pointers; confirm where
// helper.cpp keeps the owning objects alive.
std::unique_ptr<TextToSpeech> loadTextToSpeech(
    Ort::Env& env,
    const std::string& onnx_dir,
    bool use_gpu = false
);

// Write `audio_data` (float samples) to `filename` as a WAV file.
void writeWavFile(
    const std::string& filename,
    const std::vector<float>& audio_data,
    int sample_rate
);

// --- Tensor conversion utilities ---

// Presumably frees buffers retained by the tensor conversion helpers below
// (called once per synthesis round in example_onnx.cpp) — confirm in helper.cpp.
void clearTensorBuffers();
// Wrap a nested float array as an Ort::Value tensor with the given dims.
Ort::Value arrayToTensor(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::vector<std::vector<float>>>& array,
    const std::vector<int64_t>& dims
);
// Wrap a nested int64 array as an Ort::Value tensor with the given dims.
Ort::Value intArrayToTensor(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::vector<int64_t>>& array,
    const std::vector<int64_t>& dims
);

// --- JSON loading helpers ---

// Load a flat array of int64 values from a JSON file.
std::vector<int64_t> loadJsonInt64(const std::string& file_path);
// Run `func` and report its wall-clock time: prints "<name>..." up front and
// "<name> completed in X.XX sec" afterwards; returns func's result.
// Requires `func` to return a non-void value.
template<typename Func>
auto timer(const std::string& name, Func&& func) -> decltype(func()) {
    using clock = std::chrono::high_resolution_clock;
    const auto t0 = clock::now();  // started before the banner, as before
    std::cout << name << "..." << std::endl;
    auto result = func();
    const std::chrono::duration<double> secs = clock::now() - t0;
    std::cout << " -> " << name << " completed in "
              << std::fixed << std::setprecision(2) << secs.count() << " sec" << std::endl;
    return result;
}
// Derive a filesystem-safe name from `text`, limited to `max_len` characters
// (exact replacement rules live in helper.cpp).
std::string sanitizeFilename(const std::string& text, int max_len);
// Split long-form `text` into segments of at most `max_len` characters for
// sequential synthesis (see README "Long-Form Inference").
std::vector<std::string> chunkText(const std::string& text, int max_len = 300);