initial commit

2026-01-25 18:58:40 +09:00
commit 77af47274c
101 changed files with 16247 additions and 0 deletions
--- a/py/example_onnx.py
+++ b/py/example_onnx.py
@@ -0,0 +1,116 @@
+import argparse
+import os
+
+import soundfile as sf
+
+from helper import load_text_to_speech, timer, sanitize_filename, load_voice_style
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="TTS Inference with ONNX")
+
+    # Device settings
+    parser.add_argument(
+        "--use-gpu", action="store_true", help="Use GPU for inference (default: CPU)"
+    )
+
+    # Model settings
+    parser.add_argument(
+        "--onnx-dir",
+        type=str,
+        default="assets/onnx",
+        help="Path to ONNX model directory",
+    )
+
+    # Synthesis parameters
+    parser.add_argument(
+        "--total-step", type=int, default=5, help="Number of denoising steps"
+    )
+    parser.add_argument(
+        "--speed",
+        type=float,
+        default=1.05,
+        help="Speech speed (default: 1.05, higher = faster)",
+    )
+    parser.add_argument(
+        "--n-test", type=int, default=4, help="Number of times to generate"
+    )
+
+    # Batch processing
+    parser.add_argument("--batch", action="store_true", help="Batch processing")
+
+    # Input/Output
+    parser.add_argument(
+        "--voice-style",
+        type=str,
+        nargs="+",
+        default=["assets/voice_styles/M1.json"],
+        help="Voice style file path(s). Can specify multiple files for batch processing",
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        nargs="+",
+        default=[
+            "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
+        ],
+        help="Text(s) to synthesize. Can specify multiple texts for batch processing",
+    )
+    parser.add_argument(
+        "--lang",
+        type=str,
+        nargs="+",
+        default=["en"],
+        help="Language(s) of the text(s). Can specify multiple languages for batch processing",
+    )
+    parser.add_argument(
+        "--save-dir", type=str, default="results", help="Output directory"
+    )
+
+    return parser.parse_args()
+
+
+print("=== TTS Inference with ONNX Runtime (Python) ===\n")
+
+# --- 1. Parse arguments --- #
+args = parse_args()
+total_step = args.total_step
+speed = args.speed
+n_test = args.n_test
+save_dir = args.save_dir
+voice_style_paths = args.voice_style
+text_list = args.text
+lang_list = args.lang
+batch = args.batch
+
+assert len(voice_style_paths) == len(
+    text_list
+), f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
+bsz = len(voice_style_paths)
+
+# --- 2. Load Text to Speech --- #
+text_to_speech = load_text_to_speech(args.onnx_dir, args.use_gpu)
+
+# --- 3. Load Voice Style --- #
+style = load_voice_style(voice_style_paths, verbose=True)
+
+# --- 4. Synthesize Speech --- #
+for n in range(n_test):
+    print(f"\n[{n+1}/{n_test}] Starting synthesis...")
+    with timer("Generating speech from text"):
+        if batch:
+            wav, duration = text_to_speech.batch(
+                text_list, lang_list, style, total_step, speed
+            )
+        else:
+            wav, duration = text_to_speech(
+                text_list[0], lang_list[0], style, total_step, speed
+            )
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    for b in range(bsz):
+        fname = f"{sanitize_filename(text_list[b], 20)}_{n+1}.wav"
+        w = wav[b, : int(text_to_speech.sample_rate * duration[b].item())]  # [T_trim]
+        sf.write(os.path.join(save_dir, fname), w, text_to_speech.sample_rate)
+        print(f"Saved: {save_dir}/{fname}")
+print("\n=== Synthesis completed successfully! ===")