# Supertonic/py/example_onnx.py
# 2026-01-25 18:58:40 +09:00
#
# 117 lines
# 3.5 KiB
# Python

import argparse
import os
import soundfile as sf
from helper import load_text_to_speech, timer, sanitize_filename, load_voice_style
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for the ONNX TTS example.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads ``sys.argv[1:]``, which is the
            original behaviour of this function.

    Returns:
        The populated ``argparse.Namespace`` with attributes ``use_gpu``,
        ``onnx_dir``, ``total_step``, ``speed``, ``n_test``, ``batch``,
        ``voice_style``, ``text``, ``lang`` and ``save_dir``.
    """
    parser = argparse.ArgumentParser(description="TTS Inference with ONNX")

    # Device settings
    parser.add_argument(
        "--use-gpu", action="store_true", help="Use GPU for inference (default: CPU)"
    )

    # Model settings
    parser.add_argument(
        "--onnx-dir",
        type=str,
        default="assets/onnx",
        help="Path to ONNX model directory",
    )

    # Synthesis parameters
    parser.add_argument(
        "--total-step", type=int, default=5, help="Number of denoising steps"
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.05,
        help="Speech speed (default: 1.05, higher = faster)",
    )
    parser.add_argument(
        "--n-test", type=int, default=4, help="Number of times to generate"
    )

    # Batch processing
    parser.add_argument("--batch", action="store_true", help="Batch processing")

    # Input/Output
    # --voice-style, --text and --lang all use nargs="+", so each is always
    # delivered as a list, even when a single value is given.
    parser.add_argument(
        "--voice-style",
        type=str,
        nargs="+",
        default=["assets/voice_styles/M1.json"],
        help="Voice style file path(s). Can specify multiple files for batch processing",
    )
    parser.add_argument(
        "--text",
        type=str,
        nargs="+",
        default=[
            "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
        ],
        help="Text(s) to synthesize. Can specify multiple texts for batch processing",
    )
    parser.add_argument(
        "--lang",
        type=str,
        nargs="+",
        default=["en"],
        help="Language(s) of the text(s). Can specify multiple languages for batch processing",
    )
    parser.add_argument(
        "--save-dir", type=str, default="results", help="Output directory"
    )
    return parser.parse_args(argv)
# Script body: parse the CLI options, load the ONNX text-to-speech pipeline and
# the voice style(s), synthesize the requested text `n_test` times and write
# each result to `save_dir` as a WAV file.
print("=== TTS Inference with ONNX Runtime (Python) ===\n")

# --- 1. Parse arguments --- #
args = parse_args()
total_step = args.total_step
speed = args.speed
n_test = args.n_test
save_dir = args.save_dir
voice_style_paths = args.voice_style
text_list = args.text
lang_list = args.lang
batch = args.batch

# Every text is paired with exactly one voice style. Raise explicitly instead
# of using `assert`, which is stripped when Python runs with -O.
if len(voice_style_paths) != len(text_list):
    raise ValueError(
        f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
    )
bsz = len(voice_style_paths)

# --- 2. Load Text to Speech --- #
text_to_speech = load_text_to_speech(args.onnx_dir, args.use_gpu)

# --- 3. Load Voice Style --- #
style = load_voice_style(voice_style_paths, verbose=True)

# --- 4. Synthesize Speech --- #
# Create the output directory once up front; exist_ok=True replaces the
# check-then-create pair that used to run on every iteration.
os.makedirs(save_dir, exist_ok=True)

for n in range(n_test):
    print(f"\n[{n+1}/{n_test}] Starting synthesis...")
    with timer("Generating speech from text"):
        if batch:
            wav, duration = text_to_speech.batch(
                text_list, lang_list, style, total_step, speed
            )
        else:
            # NOTE(review): only the first text/language is synthesized here,
            # yet the save loop below still iterates over all `bsz` entries.
            # Presumably non-batch mode is only meant for a single text and
            # style -- confirm against the helper's behaviour.
            wav, duration = text_to_speech(
                text_list[0], lang_list[0], style, total_step, speed
            )

    # bsz == len(text_list) is guaranteed by the validation above.
    for b, text in enumerate(text_list):
        fname = f"{sanitize_filename(text, 20)}_{n+1}.wav"
        # Keep only the leading sample_rate * duration[b] samples of row b
        # (duration is presumably in seconds -- TODO confirm).
        n_samples = int(text_to_speech.sample_rate * duration[b].item())
        w = wav[b, :n_samples]  # [T_trim]
        sf.write(os.path.join(save_dir, fname), w, text_to_speech.sample_rate)
        print(f"Saved: {save_dir}/{fname}")

print("\n=== Synthesis completed successfully! ===")