117 lines
3.5 KiB
Python
117 lines
3.5 KiB
Python
import argparse
|
|
import os
|
|
|
|
import soundfile as sf
|
|
|
|
from helper import load_text_to_speech, timer, sanitize_filename, load_voice_style
|
|
|
|
|
|
def parse_args(argv=None):
    """Build the command-line parser for the ONNX TTS demo and parse arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what a plain ``parse_args()`` call does.

    Returns:
        argparse.Namespace with the attributes ``use_gpu``, ``onnx_dir``,
        ``total_step``, ``speed``, ``n_test``, ``batch``, ``voice_style``,
        ``text``, ``lang`` and ``save_dir``. ``voice_style``, ``text`` and
        ``lang`` are lists because they are declared with ``nargs="+"``.
    """
    parser = argparse.ArgumentParser(description="TTS Inference with ONNX")

    # Device settings
    parser.add_argument(
        "--use-gpu", action="store_true", help="Use GPU for inference (default: CPU)"
    )

    # Model settings
    parser.add_argument(
        "--onnx-dir",
        type=str,
        default="assets/onnx",
        help="Path to ONNX model directory",
    )

    # Synthesis parameters
    parser.add_argument(
        "--total-step", type=int, default=5, help="Number of denoising steps"
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.05,
        help="Speech speed (default: 1.05, higher = faster)",
    )
    parser.add_argument(
        "--n-test", type=int, default=4, help="Number of times to generate"
    )

    # Batch processing
    parser.add_argument("--batch", action="store_true", help="Batch processing")

    # Input/Output
    parser.add_argument(
        "--voice-style",
        type=str,
        nargs="+",
        default=["assets/voice_styles/M1.json"],
        help="Voice style file path(s). Can specify multiple files for batch processing",
    )
    parser.add_argument(
        "--text",
        type=str,
        nargs="+",
        default=[
            "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
        ],
        help="Text(s) to synthesize. Can specify multiple texts for batch processing",
    )
    parser.add_argument(
        "--lang",
        type=str,
        nargs="+",
        default=["en"],
        help="Language(s) of the text(s). Can specify multiple languages for batch processing",
    )
    parser.add_argument(
        "--save-dir", type=str, default="results", help="Output directory"
    )

    # argparse falls back to sys.argv[1:] when argv is None.
    return parser.parse_args(argv)
|
|
|
|
|
|
# Script body: parse CLI options, load the ONNX TTS pipeline and voice
# style(s), synthesize `n_test` times and write every result as a WAV file
# into `save_dir`.
print("=== TTS Inference with ONNX Runtime (Python) ===\n")

# --- 1. Parse arguments --- #
args = parse_args()
total_step = args.total_step
speed = args.speed
n_test = args.n_test
save_dir = args.save_dir
voice_style_paths = args.voice_style
text_list = args.text
lang_list = args.lang
batch = args.batch

# Explicit check instead of `assert`: assertions are stripped under
# `python -O`, which would let mismatched inputs through to the model.
if len(voice_style_paths) != len(text_list):
    raise ValueError(
        f"Number of voice styles ({len(voice_style_paths)}) must match number of texts ({len(text_list)})"
    )
bsz = len(voice_style_paths)
# NOTE(review): the number of --lang values is not validated against the
# number of texts; confirm how text_to_speech.batch treats a mismatch.

# --- 2. Load Text to Speech --- #
text_to_speech = load_text_to_speech(args.onnx_dir, args.use_gpu)

# --- 3. Load Voice Style --- #
style = load_voice_style(voice_style_paths, verbose=True)

# Create the output directory once, up front. `exist_ok=True` avoids the
# check-then-create race of `os.path.exists` followed by `os.makedirs`.
os.makedirs(save_dir, exist_ok=True)

# --- 4. Synthesize Speech --- #
for n in range(n_test):
    print(f"\n[{n+1}/{n_test}] Starting synthesis...")
    with timer("Generating speech from text"):
        if batch:
            wav, duration = text_to_speech.batch(
                text_list, lang_list, style, total_step, speed
            )
        else:
            # Non-batch mode only synthesizes the first text/language.
            # NOTE(review): the save loop below still iterates over `bsz`
            # items; confirm the helper's output when several voice styles
            # are passed without --batch.
            wav, duration = text_to_speech(
                text_list[0], lang_list[0], style, total_step, speed
            )
    for b in range(bsz):
        # The run index is part of the file name so repeated runs of the
        # same text do not overwrite each other.
        fname = f"{sanitize_filename(text_list[b], 20)}_{n+1}.wav"
        # Keep only the first sample_rate * duration[b] samples of item b
        # (presumably duration is in seconds - TODO confirm).
        w = wav[b, : int(text_to_speech.sample_rate * duration[b].item())]  # [T_trim]
        sf.write(os.path.join(save_dir, fname), w, text_to_speech.sample_rate)
        print(f"Saved: {save_dir}/{fname}")
print("\n=== Synthesis completed successfully! ===")
|