initial commit

This commit is contained in:
2026-01-25 18:58:40 +09:00
commit 77af47274c
101 changed files with 16247 additions and 0 deletions

122
cpp/CMakeLists.txt Normal file
View File

@@ -0,0 +1,122 @@
cmake_minimum_required(VERSION 3.15)
# Only C++ sources exist (helper.cpp, example_onnx.cpp), so enable CXX alone.
project(Supertonic_CPP LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Default to an optimized build, but only for single-config generators:
# CMAKE_BUILD_TYPE is ignored by multi-config generators (VS, Xcode,
# Ninja Multi-Config), so forcing it there would be misleading.
get_property(_supertonic_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _supertonic_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Aggressive Release-only optimization flags, gated to compilers that accept
# them (-O3/-ffast-math are not MSVC flags). -DNDEBUG is already part of
# CMake's default Release flags, so it is not repeated here.
# NOTE(review): -ffast-math relaxes IEEE float semantics; confirm the ONNX
# pre/post-processing in helper.cpp tolerates it.
set(SUPERTONIC_OPT_FLAGS
  "$<$<AND:$<CONFIG:Release>,$<CXX_COMPILER_ID:GNU,Clang,AppleClang>>:-O3;-ffast-math>"
)

# Find required packages
find_package(PkgConfig REQUIRED)
find_package(OpenMP)

# ONNX Runtime - try multiple discovery methods.
# Method 1: CMake package config (provides the onnxruntime::onnxruntime target)
find_package(onnxruntime QUIET CONFIG)
if(NOT onnxruntime_FOUND)
  # Method 2: pkg-config
  pkg_check_modules(ONNXRUNTIME QUIET libonnxruntime)
  if(ONNXRUNTIME_FOUND)
    set(ONNXRUNTIME_INCLUDE_DIR ${ONNXRUNTIME_INCLUDE_DIRS})
    set(ONNXRUNTIME_LIB ${ONNXRUNTIME_LIBRARIES})
  else()
    # Method 3: manual search in common install locations
    find_path(ONNXRUNTIME_INCLUDE_DIR
      NAMES onnxruntime_cxx_api.h
      PATHS
        /usr/local/include
        /opt/homebrew/include
        /usr/include
        ${CMAKE_PREFIX_PATH}/include
      PATH_SUFFIXES onnxruntime
    )
    find_library(ONNXRUNTIME_LIB
      NAMES onnxruntime libonnxruntime
      PATHS
        /usr/local/lib
        /opt/homebrew/lib
        /usr/lib
        ${CMAKE_PREFIX_PATH}/lib
    )
  endif()
  if(NOT ONNXRUNTIME_INCLUDE_DIR OR NOT ONNXRUNTIME_LIB)
    message(FATAL_ERROR "ONNX Runtime not found. Please install it:\n"
                        "  macOS: brew install onnxruntime\n"
                        "  Ubuntu: See README.md for installation instructions")
  endif()
  message(STATUS "Found ONNX Runtime:")
  message(STATUS "  Include: ${ONNXRUNTIME_INCLUDE_DIR}")
  message(STATUS "  Library: ${ONNXRUNTIME_LIB}")
endif()

# nlohmann/json
find_package(nlohmann_json REQUIRED)

# Helper library.  ONNX Runtime and nlohmann/json are PUBLIC requirements
# because helper.h includes <onnxruntime_cxx_api.h>, so consumers need the
# headers (and library) too.
add_library(tts_helper STATIC
  helper.cpp
  helper.h
)
target_compile_options(tts_helper PRIVATE ${SUPERTONIC_OPT_FLAGS})
if(onnxruntime_FOUND)
  target_link_libraries(tts_helper
    PUBLIC
      onnxruntime::onnxruntime
      nlohmann_json::nlohmann_json
  )
else()
  target_include_directories(tts_helper PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
  target_link_libraries(tts_helper
    PUBLIC
      ${ONNXRUNTIME_LIB}
      nlohmann_json::nlohmann_json
  )
endif()

# Enable OpenMP if available (used inside helper.cpp only, hence PRIVATE)
if(OpenMP_CXX_FOUND)
  target_link_libraries(tts_helper PRIVATE OpenMP::OpenMP_CXX)
  message(STATUS "OpenMP enabled for parallel processing")
else()
  message(WARNING "OpenMP not found - parallel processing will be disabled")
endif()

# Example executable.  tts_helper's PUBLIC usage requirements transitively
# provide the ONNX Runtime and nlohmann/json includes and libraries.
add_executable(example_onnx
  example_onnx.cpp
)
target_compile_options(example_onnx PRIVATE ${SUPERTONIC_OPT_FLAGS})
target_link_libraries(example_onnx PRIVATE tts_helper)

# Installation
install(TARGETS example_onnx DESTINATION bin)
install(TARGETS tts_helper DESTINATION lib)
install(FILES helper.h DESTINATION include)

139
cpp/README.md Normal file
View File

@@ -0,0 +1,139 @@
# Supertonic C++ Implementation
High-performance text-to-speech inference using ONNX Runtime.
## 📰 Update News
**2026.01.06** - 🎉 **Supertonic 2** released with multilingual support! Now supports English (`en`), Korean (`ko`), Spanish (`es`), Portuguese (`pt`), and French (`fr`). [Demo](https://huggingface.co/spaces/Supertone/supertonic-2) | [Models](https://huggingface.co/Supertone/supertonic-2)
**2025.12.10** - Added [6 new voice styles](https://huggingface.co/Supertone/supertonic/tree/b10dbaf18b316159be75b34d24f740008fddd381) (M3, M4, M5, F3, F4, F5). See [Voices](https://supertone-inc.github.io/supertonic-py/voices/) for details
**2025.12.08** - Optimized ONNX models via [OnnxSlim](https://github.com/inisis/OnnxSlim) now available on [Hugging Face Models](https://huggingface.co/Supertone/supertonic)
**2025.11.23** - Enhanced text preprocessing with comprehensive normalization, emoji removal, symbol replacement, and punctuation handling for improved synthesis quality.
**2025.11.19** - Added `--speed` parameter to control speech synthesis speed (default: 1.05, recommended range: 0.9-1.5).
**2025.11.19** - Added automatic text chunking for long-form inference. Long texts are split into chunks and synthesized with natural pauses.
## Requirements
- C++17 compiler, CMake 3.15+
- Libraries: ONNX Runtime, nlohmann/json
## Installation
**Ubuntu/Debian:**
> ⚠️ **Note:** Installation instructions not yet verified.
```bash
sudo apt-get install -y cmake g++ nlohmann-json3-dev
wget https://github.com/microsoft/onnxruntime/releases/download/v1.16.3/onnxruntime-linux-x64-1.16.3.tgz
tar -xzf onnxruntime-linux-x64-1.16.3.tgz
sudo cp -r onnxruntime-linux-x64-1.16.3/include/* /usr/local/include/
sudo cp -r onnxruntime-linux-x64-1.16.3/lib/* /usr/local/lib/
sudo ldconfig
```
**macOS:**
```bash
brew install cmake nlohmann-json onnxruntime
```
**Windows (vcpkg):**
> ⚠️ **Note:** Installation instructions not yet verified.
```powershell
vcpkg install nlohmann-json:x64-windows onnxruntime:x64-windows
vcpkg integrate install
```
## Building
```bash
cd cpp && mkdir build && cd build
cmake .. && cmake --build . --config Release
./example_onnx
```
## Basic Usage
### Example 1: Default Inference
Run inference with default settings:
```bash
./example_onnx
```
This will use:
- Voice style: `../assets/voice_styles/M1.json`
- Text: "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
- Output directory: `results/`
- Total steps: 5
- Number of generations: 4
### Example 2: Batch Inference
Process multiple voice styles and texts at once:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json,../assets/voice_styles/F1.json \
--text "The sun sets behind the mountains, painting the sky in shades of pink and orange.|오늘 아침에 공원을 산책했는데, 새소리와 바람 소리가 너무 좋아서 한참을 멈춰 서서 들었어요." \
--lang en,ko \
--batch
```
This will:
- Use `--batch` flag to enable batch processing mode
- Generate speech for 2 different voice-text pairs
- Use male voice style (M1.json) for the first English text
- Use female voice style (F1.json) for the second Korean text
- Process both samples in a single batch (automatic text chunking disabled)
### Example 3: High Quality Inference
Increase denoising steps for better quality:
```bash
./example_onnx \
--total-step 10 \
--voice-style ../assets/voice_styles/M1.json \
--text "Increasing the number of denoising steps improves the output's fidelity and overall quality."
```
This will:
- Use 10 denoising steps instead of the default 5
- Produce higher quality output at the cost of slower inference
### Example 4: Long-Form Inference
For long texts, the system automatically chunks the text into manageable segments and generates a single audio file:
```bash
./example_onnx \
--voice-style ../assets/voice_styles/M1.json \
--text "Once upon a time, in a small village nestled between rolling hills, there lived a young artist named Clara. Every morning, she would wake up before dawn to capture the first light of day. The golden rays streaming through her window inspired countless paintings. Her work was known throughout the region for its vibrant colors and emotional depth. People from far and wide came to see her gallery, and many said her paintings could tell stories that words never could."
```
This will:
- Automatically split the long text into smaller chunks (max 300 characters by default)
- Process each chunk separately while maintaining natural speech flow
- Insert brief silences (0.3 seconds) between chunks for natural pacing
- Combine all chunks into a single output audio file
**Note**: When using batch mode (`--batch`), automatic text chunking is disabled. Use non-batch mode for long-form text synthesis.
## Available Arguments
| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `--onnx-dir` | str | `../assets/onnx` | Path to ONNX model directory |
| `--total-step` | int | 5 | Number of denoising steps (higher = better quality, slower) |
| `--speed` | float | 1.05 | Speech speed factor (higher = faster, lower = slower) |
| `--n-test` | int | 4 | Number of times to generate each sample |
| `--voice-style` | str | `../assets/voice_styles/M1.json` | Voice style file path(s) (comma-separated for batch) |
| `--text` | str | (long default text) | Text(s) to synthesize (pipe-separated for batch) |
| `--lang` | str | `en` | Language(s) for text(s): `en`, `ko`, `es`, `pt`, `fr` (comma-separated for batch) |
| `--save-dir` | str | `results` | Output directory |
| `--batch` | flag | False | Enable batch mode (disables automatic text chunking) |
## Notes
- **Batch Processing**: The number of `--voice-style` files must match the number of `--text` entries
- **Multilingual Support**: Use `--lang` to specify language(s). Available: `en` (English), `ko` (Korean), `es` (Spanish), `pt` (Portuguese), `fr` (French)
- **Long-Form Inference**: Without `--batch` flag, long texts are automatically chunked and combined into a single audio file with natural pauses
- **Quality vs Speed**: Higher `--total-step` values produce better quality but take longer

121
cpp/example_onnx.cpp Normal file
View File

@@ -0,0 +1,121 @@
#include "helper.h"
#include <iostream>
#include <filesystem>
#include <algorithm>
#include <string>
#include <vector>
namespace fs = std::filesystem;
// Command-line options with their defaults (see README "Available Arguments").
struct Args {
    std::string onnx_dir = "../assets/onnx";  // --onnx-dir: ONNX model directory
    int total_step = 5;                       // --total-step: denoising steps (higher = better, slower)
    float speed = 1.05f;                      // --speed: speech speed factor
    int n_test = 4;                           // --n-test: generations per sample
    // --voice-style: one style file per text (comma-separated on the CLI)
    std::vector<std::string> voice_style = {"../assets/voice_styles/M1.json"};
    // --text: text(s) to synthesize (pipe-separated on the CLI)
    std::vector<std::string> text = {
        "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    };
    // --lang: language code(s), comma-separated: en, ko, es, pt, fr
    std::vector<std::string> lang = {"en"};
    std::string save_dir = "results";         // --save-dir: output directory
    bool batch = false;                       // --batch: batch mode (disables auto chunking)
};
// Split `str` on every occurrence of `delim`.
// Always yields at least one element; empty segments — including a trailing
// one — are preserved, e.g. "a,," -> {"a", "", ""}.
auto splitString = [](const std::string& str, char delim) {
    std::vector<std::string> pieces;
    for (size_t begin = 0;;) {
        const size_t end = str.find(delim, begin);
        if (end == std::string::npos) {
            // Last (or only) segment: everything after the final delimiter.
            pieces.push_back(str.substr(begin));
            return pieces;
        }
        pieces.push_back(str.substr(begin, end - begin));
        begin = end + 1;
    }
};
// Parse command-line flags into an Args struct.
// Unknown flags, and value-taking flags that appear last with no value,
// are silently ignored, leaving the corresponding defaults in place.
Args parseArgs(int argc, char* argv[]) {
    Args args;
    int i = 1;
    // Consume the next token as the current flag's value.
    auto next = [&]() -> const char* { return argv[++i]; };
    for (; i < argc; ++i) {
        const std::string flag = argv[i];
        if (flag == "--batch") {
            args.batch = true;
        } else if (i + 1 >= argc) {
            continue;  // value-taking flag with nothing after it
        } else if (flag == "--onnx-dir") {
            args.onnx_dir = next();
        } else if (flag == "--total-step") {
            args.total_step = std::stoi(next());
        } else if (flag == "--speed") {
            args.speed = std::stof(next());
        } else if (flag == "--n-test") {
            args.n_test = std::stoi(next());
        } else if (flag == "--voice-style") {
            args.voice_style = splitString(next(), ',');
        } else if (flag == "--text") {
            args.text = splitString(next(), '|');
        } else if (flag == "--lang") {
            args.lang = splitString(next(), ',');
        } else if (flag == "--save-dir") {
            args.save_dir = next();
        }
    }
    return args;
}
/**
 * TTS inference driver: parses CLI arguments, loads the ONNX models and
 * voice style(s), then runs `--n-test` rounds of synthesis and writes one
 * WAV file per input text into `--save-dir`.
 *
 * Returns 0 on success, 1 on invalid argument combinations.
 */
int main(int argc, char* argv[]) {
    std::cout << "=== TTS Inference with ONNX Runtime (C++) ===\n\n";
    // --- 1. Parse arguments --- //
    Args args = parseArgs(argc, argv);
    int total_step = args.total_step;
    float speed = args.speed;
    int n_test = args.n_test;
    std::string save_dir = args.save_dir;
    std::vector<std::string> voice_style_paths = args.voice_style;
    std::vector<std::string> text_list = args.text;
    std::vector<std::string> lang_list = args.lang;
    bool batch = args.batch;
    // Each text needs a matching voice style.
    if (voice_style_paths.size() != text_list.size()) {
        std::cerr << "Error: Number of voice styles (" << voice_style_paths.size()
                  << ") must match number of texts (" << text_list.size() << ")\n";
        return 1;
    }
    // In batch mode each text also needs a matching language; without this
    // check a short --lang list would be indexed out of bounds in batch().
    if (batch && lang_list.size() != text_list.size()) {
        std::cerr << "Error: Number of languages (" << lang_list.size()
                  << ") must match number of texts (" << text_list.size() << ")\n";
        return 1;
    }
    int bsz = voice_style_paths.size();
    // --- 2. Load Text to Speech --- //
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "TTS");
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(
        OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault
    );
    auto text_to_speech = loadTextToSpeech(env, args.onnx_dir, false);
    std::cout << std::endl;
    // --- 3. Load Voice Style --- //
    auto style = loadVoiceStyle(voice_style_paths, true);
    // --- 4. Synthesize speech --- //
    fs::create_directories(save_dir);
    for (int n = 0; n < n_test; n++) {
        std::cout << "\n[" << (n + 1) << "/" << n_test << "] Starting synthesis...\n";
        auto result = timer("Generating speech from text", [&]() {
            if (batch) {
                return text_to_speech->batch(memory_info, text_list, lang_list, style, total_step, speed);
            } else {
                // NOTE(review): non-batch mode synthesizes only text_list[0],
                // yet the save loop below slices `bsz` segments — confirm the
                // intent when several styles are given without --batch.
                return text_to_speech->call(memory_info, text_list[0], lang_list[0], style, total_step, speed);
            }
        });
        int sample_rate = text_to_speech->getSampleRate();
        // result.wav holds bsz equal-length segments laid out back to back.
        int wav_shape_1 = result.wav.size() / bsz;
        for (int b = 0; b < bsz; b++) {
            std::string fname = sanitizeFilename(text_list[b], 20) + "_" + std::to_string(n + 1) + ".wav";
            // Keep only the first duration[b] seconds of segment b
            // (the rest of the segment is padding).
            int wav_len = static_cast<int>(sample_rate * result.duration[b]);
            std::vector<float> wav_out(
                result.wav.begin() + b * wav_shape_1,
                result.wav.begin() + b * wav_shape_1 + wav_len
            );
            std::string output_path = save_dir + "/" + fname;
            writeWavFile(output_path, wav_out, sample_rate);
            std::cout << "Saved: " << output_path << "\n";
        }
        clearTensorBuffers();
    }
    std::cout << "\n=== Synthesis completed successfully! ===\n";
    return 0;
}

1186
cpp/helper.cpp Normal file

File diff suppressed because it is too large Load Diff

229
cpp/helper.h Normal file
View File

@@ -0,0 +1,229 @@
#pragma once
#include <string>
#include <vector>
#include <memory>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <onnxruntime_cxx_api.h>
// Available languages for multilingual TTS
extern const std::vector<std::string> AVAILABLE_LANGS;
/**
 * Model configuration loaded from the ONNX asset directory (see loadCfgs).
 */
struct Config {
    // "ae" section — audio/waveform side settings.
    struct AEConfig {
        int sample_rate;      // output audio sample rate in Hz (exposed via TextToSpeech::getSampleRate)
        int base_chunk_size;  // base wav chunk size, used with chunk_compress_factor for latent masks
    } ae;
    // "ttl" section — text-to-latent side settings.
    struct TTLConfig {
        int chunk_compress_factor;  // wav-chunk -> latent-frame compression factor
        int latent_dim;             // latent feature dimensionality
    } ttl;
};
/**
 * Unicode text processor.
 *
 * Converts raw input text into model token IDs using a unicode-to-index
 * table loaded from JSON.
 */
class UnicodeProcessor {
public:
    // `unicode_indexer_json_path`: JSON file holding the unicode-to-token-id table.
    explicit UnicodeProcessor(const std::string& unicode_indexer_json_path);
    // Process a batch of texts (paired 1:1 with language tags) into token
    // IDs and a float mask; `text_ids` and `text_mask` are output parameters.
    void call(
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        std::vector<std::vector<int64_t>>& text_ids,
        std::vector<std::vector<std::vector<float>>>& text_mask
    );
private:
    // Lookup table indexed by unicode value (loaded in the constructor).
    std::vector<int64_t> indexer_;
    // Language-aware text normalization applied before indexing.
    std::string preprocessText(const std::string& text, const std::string& lang);
    // Map text to a sequence of 16-bit unicode values.
    // NOTE(review): uint16_t elements suggest UTF-16 code units, so code
    // points above the BMP presumably become surrogate pairs — confirm in helper.cpp.
    std::vector<uint16_t> textToUnicodeValues(const std::string& text);
    // Build the float mask matching the per-text ID lengths.
    std::vector<std::vector<std::vector<float>>> getTextMask(
        const std::vector<int64_t>& text_ids_lengths
    );
};
/**
 * Voice style embeddings loaded from JSON (see loadVoiceStyle): one flat
 * float tensor for the text-to-latent model ("ttl") and one for the "dp"
 * model, each paired with its dimensions.
 */
class Style {
public:
    Style(const std::vector<float>& ttl_data, const std::vector<int64_t>& ttl_shape,
          const std::vector<float>& dp_data, const std::vector<int64_t>& dp_shape);
    const std::vector<float>& getTtlData() const { return ttl_data_; }
    const std::vector<float>& getDpData() const { return dp_data_; }
    const std::vector<int64_t>& getTtlShape() const { return ttl_shape_; }
    const std::vector<int64_t>& getDpShape() const { return dp_shape_; }
private:
    std::vector<float> ttl_data_;     // flattened ttl style tensor
    std::vector<float> dp_data_;      // flattened dp style tensor
    std::vector<int64_t> ttl_shape_;  // dims of ttl_data_
    std::vector<int64_t> dp_shape_;   // dims of dp_data_
};
/**
 * TextToSpeech: end-to-end synthesis over four ONNX sessions — "dp"
 * (presumably duration prediction, matching Style's dp tensors — confirm in
 * helper.cpp), text encoder, vector estimator, and vocoder.
 * Stores raw pointers only; the sessions and text processor are owned
 * elsewhere and must outlive this object.
 */
class TextToSpeech {
public:
    TextToSpeech(
        const Config& cfgs,
        UnicodeProcessor* text_processor,
        Ort::Session* dp_ort,
        Ort::Session* text_enc_ort,
        Ort::Session* vector_est_ort,
        Ort::Session* vocoder_ort
    );
    // Synthesis output: float samples plus per-item durations in seconds
    // (callers derive sample counts as sample_rate * duration[i]).
    struct SynthesisResult {
        std::vector<float> wav;
        std::vector<float> duration;
    };
    // Synthesize a single (possibly long) text. `silence_duration` is the
    // pause, in seconds, inserted between automatically chunked segments.
    SynthesisResult call(
        Ort::MemoryInfo& memory_info,
        const std::string& text,
        const std::string& lang,
        const Style& style,
        int total_step,
        float speed = 1.05f,
        float silence_duration = 0.3f
    );
    // Synthesize several texts in one batch (no automatic chunking).
    // `text_list` and `lang_list` are paired 1:1.
    SynthesisResult batch(
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        const Style& style,
        int total_step,
        float speed = 1.05f
    );
    int getSampleRate() const { return sample_rate_; }
private:
    // Shared inference path behind both call() and batch().
    SynthesisResult _infer(
        Ort::MemoryInfo& memory_info,
        const std::vector<std::string>& text_list,
        const std::vector<std::string>& lang_list,
        const Style& style,
        int total_step,
        float speed = 1.05f
    );
    Config cfgs_;
    UnicodeProcessor* text_processor_;  // not owned
    Ort::Session* dp_ort_;              // not owned
    Ort::Session* text_enc_ort_;        // not owned
    Ort::Session* vector_est_ort_;      // not owned
    Ort::Session* vocoder_ort_;         // not owned
    int sample_rate_;            // presumably cached from cfgs_.ae.sample_rate — confirm in helper.cpp
    int base_chunk_size_;
    int chunk_compress_factor_;
    int ldim_;                   // latent dimension
    // Produce initial noisy latents and their mask, sized from `duration`;
    // both trailing arguments are output parameters.
    void sampleNoisyLatent(
        const std::vector<float>& duration,
        std::vector<std::vector<std::vector<float>>>& noisy_latent,
        std::vector<std::vector<std::vector<float>>>& latent_mask
    );
};
// --- Utility functions ---

// Convert per-item lengths into a float mask.
// NOTE(review): max_len = -1 presumably means "use the largest length" — confirm in helper.cpp.
std::vector<std::vector<std::vector<float>>> lengthToMask(
    const std::vector<int64_t>& lengths, int max_len = -1
);
// Build the latent-frame mask corresponding to waveform lengths.
std::vector<std::vector<std::vector<float>>> getLatentMask(
    const std::vector<int64_t>& wav_lengths,
    int base_chunk_size,
    int chunk_compress_factor
);

// --- ONNX model loading ---

// Owning bundle of the four model sessions consumed by TextToSpeech.
struct OnnxModels {
    std::unique_ptr<Ort::Session> dp;
    std::unique_ptr<Ort::Session> text_enc;
    std::unique_ptr<Ort::Session> vector_est;
    std::unique_ptr<Ort::Session> vocoder;
};
// Load a single ONNX model from `onnx_path`.
std::unique_ptr<Ort::Session> loadOnnx(
    Ort::Env& env,
    const std::string& onnx_path,
    const Ort::SessionOptions& opts
);
// Load all four models from `onnx_dir`.
OnnxModels loadOnnxAll(
    Ort::Env& env,
    const std::string& onnx_dir,
    const Ort::SessionOptions& opts
);

// --- Configuration and processor loading ---

// Read the Config (ae/ttl sections) shipped alongside the models in `onnx_dir`.
Config loadCfgs(const std::string& onnx_dir);
// Build the UnicodeProcessor from the indexer JSON in `onnx_dir`.
std::unique_ptr<UnicodeProcessor> loadTextProcessor(const std::string& onnx_dir);

// --- Voice style loading ---

// Load voice style JSON file(s) into a single Style; `verbose` controls logging.
Style loadVoiceStyle(const std::vector<std::string>& voice_style_paths, bool verbose = false);

// --- TextToSpeech loading ---

// Convenience factory: loads config, text processor, and all ONNX sessions.
// NOTE(review): TextToSpeech holds raw session pointers; confirm where
// helper.cpp keeps the owning objects alive.
std::unique_ptr<TextToSpeech> loadTextToSpeech(
    Ort::Env& env,
    const std::string& onnx_dir,
    bool use_gpu = false
);

// Write `audio_data` (float samples) to `filename` as a WAV file.
void writeWavFile(
    const std::string& filename,
    const std::vector<float>& audio_data,
    int sample_rate
);

// --- Tensor conversion utilities ---

// Presumably frees buffers retained by the tensor conversion helpers below
// (called once per synthesis round in example_onnx.cpp) — confirm in helper.cpp.
void clearTensorBuffers();
// Wrap a nested float array as an Ort::Value tensor with the given dims.
Ort::Value arrayToTensor(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::vector<std::vector<float>>>& array,
    const std::vector<int64_t>& dims
);
// Wrap a nested int64 array as an Ort::Value tensor with the given dims.
Ort::Value intArrayToTensor(
    Ort::MemoryInfo& memory_info,
    const std::vector<std::vector<int64_t>>& array,
    const std::vector<int64_t>& dims
);

// --- JSON loading helpers ---

// Load a flat array of int64 values from a JSON file.
std::vector<int64_t> loadJsonInt64(const std::string& file_path);
// Run `func` and report its wall-clock time: prints "<name>..." up front and
// "<name> completed in X.XX sec" afterwards; returns func's result.
// Requires `func` to return a non-void value.
template<typename Func>
auto timer(const std::string& name, Func&& func) -> decltype(func()) {
    using clock = std::chrono::high_resolution_clock;
    const auto t0 = clock::now();  // started before the banner, as before
    std::cout << name << "..." << std::endl;
    auto result = func();
    const std::chrono::duration<double> secs = clock::now() - t0;
    std::cout << " -> " << name << " completed in "
              << std::fixed << std::setprecision(2) << secs.count() << " sec" << std::endl;
    return result;
}
// Derive a filesystem-safe name from `text`, limited to `max_len` characters
// (exact replacement rules live in helper.cpp).
std::string sanitizeFilename(const std::string& text, int max_len);
// Split long-form `text` into segments of at most `max_len` characters for
// sequential synthesis (see README "Long-Form Inference").
std::vector<std::string> chunkText(const std::string& text, int max_len = 300);