import argparse
import os

import soundfile as sf

from helper import load_text_to_speech, timer, sanitize_filename, load_voice_style


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the ONNX TTS inference example.

    Returns:
        The parsed namespace with the attributes ``use_gpu``, ``onnx_dir``,
        ``total_step``, ``speed``, ``n_test``, ``batch``, ``voice_style``,
        ``text``, ``lang`` and ``save_dir``. ``voice_style``, ``text`` and
        ``lang`` are always lists because they are declared with ``nargs="+"``.
    """
    parser = argparse.ArgumentParser(description="TTS Inference with ONNX")

    # Device settings
    parser.add_argument(
        "--use-gpu", action="store_true", help="Use GPU for inference (default: CPU)"
    )

    # Model settings
    parser.add_argument(
        "--onnx-dir",
        type=str,
        default="assets/onnx",
        help="Path to ONNX model directory",
    )

    # Synthesis parameters
    parser.add_argument(
        "--total-step", type=int, default=5, help="Number of denoising steps"
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.05,
        help="Speech speed (default: 1.05, higher = faster)",
    )
    parser.add_argument(
        "--n-test", type=int, default=4, help="Number of times to generate"
    )

    # Batch processing
    parser.add_argument("--batch", action="store_true", help="Batch processing")

    # Input/Output
    parser.add_argument(
        "--voice-style",
        type=str,
        nargs="+",
        default=["assets/voice_styles/M1.json"],
        help="Voice style file path(s). Can specify multiple files for batch processing",
    )
    parser.add_argument(
        "--text",
        type=str,
        nargs="+",
        default=[
            "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
        ],
        help="Text(s) to synthesize. Can specify multiple texts for batch processing",
    )
    parser.add_argument(
        "--lang",
        type=str,
        nargs="+",
        default=["en"],
        help="Language(s) of the text(s). Can specify multiple languages for batch processing",
    )
    parser.add_argument(
        "--save-dir", type=str, default="results", help="Output directory"
    )
    return parser.parse_args()


print("=== TTS Inference with ONNX Runtime (Python) ===\n")

# --- 1. Parse arguments --- #
args = parse_args()
total_step = args.total_step
speed = args.speed
n_test = args.n_test
save_dir = args.save_dir
voice_style_paths = args.voice_style
text_list = args.text
lang_list = args.lang
batch = args.batch

# Explicit check instead of `assert`: assertions are stripped under
# `python -O`, which would let mismatched inputs reach the model unchecked.
if len(voice_style_paths) != len(text_list):
    raise ValueError(
        f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
    )
# NOTE(review): the length of lang_list is not validated against text_list
# here; confirm how the helper handles a mismatch in batch mode.
bsz = len(voice_style_paths)

# --- 2. Load Text to Speech --- #
text_to_speech = load_text_to_speech(args.onnx_dir, args.use_gpu)

# --- 3. Load Voice Style --- #
style = load_voice_style(voice_style_paths, verbose=True)

# --- 4. Synthesize Speech --- #
for n in range(n_test):
    print(f"\n[{n+1}/{n_test}] Starting synthesis...")
    with timer("Generating speech from text"):
        if batch:
            wav, duration = text_to_speech.batch(
                text_list, lang_list, style, total_step, speed
            )
        else:
            # NOTE(review): only the first text/language is synthesized
            # here, yet the save loop below still runs once per text;
            # confirm that non-batch runs are always given a single text.
            wav, duration = text_to_speech(
                text_list[0], lang_list[0], style, total_step, speed
            )

    # exist_ok avoids the check-then-create race of an os.path.exists() test.
    os.makedirs(save_dir, exist_ok=True)

    # bsz == len(text_list) is enforced above, so enumerating the texts
    # visits exactly the indices 0..bsz-1.
    for b, text in enumerate(text_list):
        fname = f"{sanitize_filename(text, 20)}_{n+1}.wav"
        # Keep only the first sample_rate * duration[b] samples of row b
        # (presumably duration is in seconds — TODO confirm).
        w = wav[b, : int(text_to_speech.sample_rate * duration[b].item())]  # [T_trim]
        sf.write(os.path.join(save_dir, fname), w, text_to_speech.sample_rate)
        print(f"Saved: {save_dir}/{fname}")

print("\n=== Synthesis completed successfully! ===")