initial commit
This commit is contained in:
171
csharp/ExampleONNX.cs
Normal file
171
csharp/ExampleONNX.cs
Normal file
@@ -0,0 +1,171 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Media;
|
||||
|
||||
namespace Supertonic
|
||||
{
|
||||
class Program
|
||||
{
|
||||
/// <summary>
/// Command-line options for the example, pre-populated with their default values.
/// </summary>
class Args
{
    // ---- Model loading ----

    /// <summary>Set by <c>--use-gpu</c>; forwarded to <c>Helper.LoadTextToSpeech</c>.</summary>
    public bool UseGpu { get; set; }

    /// <summary>Directory handed to <c>Helper.LoadTextToSpeech</c> (<c>--onnx-dir</c>).</summary>
    public string OnnxDir { get; set; } = "./assets/onnx";

    // ---- Synthesis inputs ----

    /// <summary>Voice style file paths; <c>--voice-style</c> takes a comma-separated list.</summary>
    public List<string> VoiceStyle { get; set; } = new List<string> { "assets/voice_styles/F2.json" };

    /// <summary>Texts to synthesize; <c>--text</c> takes a <c>|</c>-separated list.</summary>
    public List<string> Text { get; set; } = new List<string>
    {
        "동해물과 백두산이 마르고 닳도록 하느님이 보우하사. 우리 나라 만세~~"
    };

    /// <summary>Language codes; <c>--lang</c> takes a comma-separated list.</summary>
    public List<string> Lang { get; set; } = new List<string> { "ko" };

    // ---- Synthesis settings ----

    /// <summary>Step count forwarded to the synthesizer (<c>--total-step</c>).</summary>
    public int TotalStep { get; set; } = 5;

    /// <summary>Speed value forwarded to the synthesizer (<c>--speed</c>).</summary>
    public float Speed { get; set; } = 1.05f;

    /// <summary>When true (<c>--batch</c>), the batch synthesis path is used instead of the single-text one.</summary>
    public bool Batch { get; set; }

    /// <summary>Fixed seed (<c>--seed</c>); when left null, a random seed is drawn for every run.</summary>
    public int? Seed { get; set; }

    /// <summary>How many synthesis rounds to perform (<c>--n-test</c>).</summary>
    public int NTest { get; set; } = 4;

    // ---- Output ----

    /// <summary>Directory that receives the generated WAV files (<c>--save-dir</c>).</summary>
    public string SaveDir { get; set; } = "results";

    /// <summary>
    /// Amount of silence placed before the audio (<c>--pre-silence</c>); it is multiplied by the
    /// sample rate to obtain a sample count, so presumably expressed in seconds.
    /// </summary>
    public float PreSilence { get; set; } = 0.2f;
}
|
||||
|
||||
/// <summary>
/// Builds an <see cref="Args"/> instance from the command-line tokens: starts from the
/// defaults and overrides every option that is present.
/// </summary>
/// <param name="args">Raw command-line arguments as received by <c>Main</c>.</param>
/// <returns>The populated options; anything not supplied keeps its default value.</returns>
/// <exception cref="FormatException">A numeric option value is not a valid number.</exception>
/// <exception cref="OverflowException">An integer option value does not fit in an <see cref="int"/>.</exception>
static Args ParseArgs(string[] args)
{
    // Command-line numbers are machine-readable input: parse them with the invariant culture
    // so that "1.05" means the same thing regardless of the user's regional settings
    // (under a comma-decimal locale the culture-sensitive overload rejects or misreads it).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var result = new Args();

    for (int i = 0; i < args.Length; i++)
    {
        // Value-taking options are guarded by "i + 1 < args.Length" so that the value
        // lookup (args[++i]) can never run past the end of the array.
        switch (args[i])
        {
            case "--use-gpu":
                result.UseGpu = true;
                break;
            case "--batch":
                result.Batch = true;
                break;
            case "--onnx-dir" when i + 1 < args.Length:
                result.OnnxDir = args[++i];
                break;
            case "--total-step" when i + 1 < args.Length:
                result.TotalStep = int.Parse(args[++i], invariant);
                break;
            case "--speed" when i + 1 < args.Length:
                result.Speed = float.Parse(args[++i], invariant);
                break;
            case "--n-test" when i + 1 < args.Length:
                result.NTest = int.Parse(args[++i], invariant);
                break;
            case "--voice-style" when i + 1 < args.Length:
                // Comma-separated list of style files.
                result.VoiceStyle = args[++i].Split(',').ToList();
                break;
            case "--text" when i + 1 < args.Length:
                // '|' is the separator here because the texts themselves may contain commas.
                result.Text = args[++i].Split('|').ToList();
                break;
            case "--lang" when i + 1 < args.Length:
                // Comma-separated list of language codes.
                result.Lang = args[++i].Split(',').ToList();
                break;
            case "--save-dir" when i + 1 < args.Length:
                result.SaveDir = args[++i];
                break;
            case "--seed" when i + 1 < args.Length:
                result.Seed = int.Parse(args[++i], invariant);
                break;
            case "--pre-silence" when i + 1 < args.Length:
                result.PreSilence = float.Parse(args[++i], invariant);
                break;
            default:
                // Reached for unknown tokens and for value options given as the very last
                // token. The token is still skipped, but no longer silently, so typos such
                // as "--sped 1.2" do not go unnoticed.
                Console.Error.WriteLine(
                    $"Warning: ignoring unrecognized argument or option without a value: {args[i]}");
                break;
        }
    }

    return result;
}
|
||||
|
||||
/// <summary>
/// Program entry point: parses the options, loads the text-to-speech model and the voice
/// style(s), then runs the requested number of synthesis rounds. Every round writes one
/// WAV file per input text into the output directory and tries to play it back.
/// </summary>
/// <param name="args">Command-line arguments; see <c>ParseArgs</c> for the supported options.</param>
/// <exception cref="ArgumentException">
/// The number of voice styles differs from the number of texts.
/// </exception>
static void Main(string[] args)
{
    Console.WriteLine("=== TTS Inference with ONNX Runtime (C#) ===\n");
    Console.WriteLine("sample seed : 371279630");

    // --- 1. Parse arguments --- //
    var parsedArgs = ParseArgs(args);
    int totalStep = parsedArgs.TotalStep;
    float speed = parsedArgs.Speed;
    int nTest = parsedArgs.NTest;
    string saveDir = parsedArgs.SaveDir;
    var voiceStylePaths = parsedArgs.VoiceStyle;
    var textList = parsedArgs.Text;
    var langList = parsedArgs.Lang;
    bool batch = parsedArgs.Batch;

    if (voiceStylePaths.Count != textList.Count)
    {
        throw new ArgumentException(
            $"Number of voice styles ({voiceStylePaths.Count}) must match number of texts ({textList.Count})");
    }
    int bsz = voiceStylePaths.Count;

    // --- 2. Load Text to Speech --- //
    var textToSpeech = Helper.LoadTextToSpeech(parsedArgs.OnnxDir, parsedArgs.UseGpu);
    Console.WriteLine();

    // --- 3. Load Voice Style --- //
    var style = Helper.LoadVoiceStyle(voiceStylePaths, verbose: true);

    // Directory.CreateDirectory does nothing when the directory already exists, so a single
    // unconditional call before the loop is enough; it also surfaces an unusable output
    // path before any synthesis time is spent.
    Directory.CreateDirectory(saveDir);

    // --- 4. Synthesize speech --- //
    Random seedGenerator = new Random();
    for (int n = 0; n < nTest; n++)
    {
        // A user-supplied seed is reused for every round; otherwise each round draws its own.
        int currentSeed = parsedArgs.Seed ?? seedGenerator.Next();
        Console.WriteLine($"\n[{n + 1}/{nTest}] Starting synthesis (Seed: {currentSeed})...");

        var (wav, duration) = Helper.Timer("Generating speech from text", () =>
        {
            if (batch)
            {
                return textToSpeech.Batch(textList, langList, style, totalStep, speed, currentSeed);
            }
            else
            {
                return textToSpeech.Call(textList[0], langList[0], style, totalStep, speed, seed: currentSeed);
            }
        });

        for (int b = 0; b < bsz; b++)
        {
            string fname = $"{Helper.SanitizeFilename(textList[b], 20)}_{n + 1}_s{currentSeed}.wav";

            int wavLen = (int)(textToSpeech.SampleRate * duration[b]);

            // --- Add Pre-Silence (Delay) --- //
            // Clamped at zero: a negative --pre-silence would otherwise yield a negative
            // destination index and make Array.Copy throw.
            int silenceSamples = Math.Max(0, (int)(textToSpeech.SampleRate * parsedArgs.PreSilence));
            var wavOut = new float[wavLen + silenceSamples];

            // wav is treated as bsz equally sized segments laid out back to back. The new
            // array is zero-initialized, so only the audio has to be copied in after the
            // silence; the copy length is capped at the segment size.
            Array.Copy(wav, b * wav.Length / bsz, wavOut, silenceSamples, Math.Min(wavLen, wav.Length / bsz));

            string outputPath = Path.Combine(saveDir, fname);
            Helper.WriteWavFile(outputPath, wavOut, textToSpeech.SampleRate);
            Console.WriteLine($"Saved: {outputPath}");

            // --- Play the generated audio --- //
            // Playback is best-effort: any failure is reported as a warning and the run goes on.
            try
            {
                using (var player = new SoundPlayer(outputPath))
                {
                    Console.WriteLine("Playing audio...");
                    player.PlaySync();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Warning: Could not play audio. {ex.Message}");
            }
        }
    }

    Console.WriteLine("\n=== Synthesis completed successfully! ===");
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user