diff --git a/HMI/SubProject/tts/App.config b/HMI/SubProject/tts/App.config new file mode 100644 index 0000000..e9aa520 --- /dev/null +++ b/HMI/SubProject/tts/App.config @@ -0,0 +1,6 @@ + + + + + + diff --git a/HMI/SubProject/tts/Helper.cs b/HMI/SubProject/tts/Helper.cs new file mode 100644 index 0000000..d13293b --- /dev/null +++ b/HMI/SubProject/tts/Helper.cs @@ -0,0 +1,889 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Text.RegularExpressions; +using Microsoft.ML.OnnxRuntime; +using Microsoft.ML.OnnxRuntime.Tensors; + +namespace Supertonic.WinForms +{ + // Available languages for multilingual TTS + public static class Languages + { + public static readonly string[] Available = { "en", "ko", "es", "pt", "fr" }; + } + + // ============================================================================ + // Configuration classes + // ============================================================================ + + public class Config + { + public AEConfig AE { get; set; } = null; + public TTLConfig TTL { get; set; } = null; + + public class AEConfig + { + public int SampleRate { get; set; } + public int BaseChunkSize { get; set; } + } + + public class TTLConfig + { + public int ChunkCompressFactor { get; set; } + public int LatentDim { get; set; } + } + } + + // ============================================================================ + // Style class + // ============================================================================ + + public class Style + { + public float[] Ttl { get; set; } + public long[] TtlShape { get; set; } + public float[] Dp { get; set; } + public long[] DpShape { get; set; } + + public Style(float[] ttl, long[] ttlShape, float[] dp, long[] dpShape) + { + Ttl = ttl; + TtlShape = ttlShape; + Dp = dp; + DpShape = dpShape; + } + } + + // ============================================================================ + // Unicode text processor + 
// ============================================================================ + + public class UnicodeProcessor + { + private readonly Dictionary _indexer; + + public UnicodeProcessor(string unicodeIndexerPath) + { + var json = File.ReadAllText(unicodeIndexerPath); + var indexerArray = JsonSerializer.Deserialize(json) ?? throw new Exception("Failed to load indexer"); + _indexer = new Dictionary(); + for (int i = 0; i < indexerArray.Length; i++) + { + _indexer[i] = indexerArray[i]; + } + } + + private static string RemoveEmojis(string text) + { + var result = new StringBuilder(); + for (int i = 0; i < text.Length; i++) + { + int codePoint; + if (char.IsHighSurrogate(text[i]) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1])) + { + // Get the full code point from surrogate pair + codePoint = char.ConvertToUtf32(text[i], text[i + 1]); + i++; // Skip the low surrogate + } + else + { + codePoint = text[i]; + } + + // Check if code point is in emoji ranges + bool isEmoji = (codePoint >= 0x1F600 && codePoint <= 0x1F64F) || + (codePoint >= 0x1F300 && codePoint <= 0x1F5FF) || + (codePoint >= 0x1F680 && codePoint <= 0x1F6FF) || + (codePoint >= 0x1F700 && codePoint <= 0x1F77F) || + (codePoint >= 0x1F780 && codePoint <= 0x1F7FF) || + (codePoint >= 0x1F800 && codePoint <= 0x1F8FF) || + (codePoint >= 0x1F900 && codePoint <= 0x1F9FF) || + (codePoint >= 0x1FA00 && codePoint <= 0x1FA6F) || + (codePoint >= 0x1FA70 && codePoint <= 0x1FAFF) || + (codePoint >= 0x2600 && codePoint <= 0x26FF) || + (codePoint >= 0x2700 && codePoint <= 0x27BF) || + (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF); + + if (!isEmoji) + { + if (codePoint > 0xFFFF) + { + // Add back as surrogate pair + result.Append(char.ConvertFromUtf32(codePoint)); + } + else + { + result.Append((char)codePoint); + } + } + } + return result.ToString(); + } + + private string PreprocessText(string text, string lang) + { + // TODO: Need advanced normalizer for better performance + text = 
text.Normalize(NormalizationForm.FormKD); + + // Remove emojis (wide Unicode range) + // C# doesn't support \u{...} syntax in regex, so we use character filtering instead + text = RemoveEmojis(text); + + // Replace various dashes and symbols + var replacements = new Dictionary + { + {"–", "-"}, // en dash + {"‑", "-"}, // non-breaking hyphen + {"—", "-"}, // em dash + {"_", " "}, // underscore + {"\u201C", "\""}, // left double quote + {"\u201D", "\""}, // right double quote + {"\u2018", "'"}, // left single quote + {"\u2019", "'"}, // right single quote + {"´", "'"}, // acute accent + {"`", "'"}, // grave accent + {"[", " "}, // left bracket + {"]", " "}, // right bracket + {"|", " "}, // vertical bar + {"/", " "}, // slash + {"#", " "}, // hash + {"→", " "}, // right arrow + {"←", " "}, // left arrow + }; + + foreach (var kvp in replacements) + { + text = text.Replace(kvp.Key, kvp.Value); + } + + // Remove special symbols + text = Regex.Replace(text, @"[♥☆♡©\\]", ""); + + // Replace known expressions + var exprReplacements = new Dictionary + { + {"@", " at "}, + {"e.g.,", "for example, "}, + {"i.e.,", "that is, "}, + }; + + foreach (var kvp in exprReplacements) + { + text = text.Replace(kvp.Key, kvp.Value); + } + + // Fix spacing around punctuation + text = Regex.Replace(text, @" ,", ","); + text = Regex.Replace(text, @" \.", "."); + text = Regex.Replace(text, @" !", "!"); + text = Regex.Replace(text, @" \?", "?"); + text = Regex.Replace(text, @" ;", ";"); + text = Regex.Replace(text, @" :", ":"); + text = Regex.Replace(text, @" '", "'"); + + // Remove duplicate quotes + while (text.Contains("\"\"")) + { + text = text.Replace("\"\"", "\""); + } + while (text.Contains("''")) + { + text = text.Replace("''", "'"); + } + while (text.Contains("``")) + { + text = text.Replace("``", "`"); + } + + // Remove extra spaces + text = Regex.Replace(text, @"\s+", " ").Trim(); + + // If text doesn't end with punctuation, quotes, or closing brackets, add a period + if 
(!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$")) + { + text += "."; + } + + // Validate language + if (!Languages.Available.Contains(lang)) + { + throw new ArgumentException($"Invalid language: {lang}. Available: {string.Join(", ", Languages.Available)}"); + } + + // Wrap text with language tags + text = $"<{lang}>" + text + $""; + + return text; + } + + private int[] TextToUnicodeValues(string text) + { + return text.Select(c => (int)c).ToArray(); + } + + private float[][][] GetTextMask(long[] textIdsLengths) + { + return Helper.LengthToMask(textIdsLengths); + } + + public (long[][] textIds, float[][][] textMask) Call(List textList, List langList) + { + var processedTexts = textList.Select((t, i) => PreprocessText(t, langList[i])).ToList(); + var textIdsLengths = processedTexts.Select(t => (long)t.Length).ToArray(); + long maxLen = textIdsLengths.Max(); + + var textIds = new long[textList.Count][]; + for (int i = 0; i < processedTexts.Count; i++) + { + textIds[i] = new long[maxLen]; + var unicodeVals = TextToUnicodeValues(processedTexts[i]); + for (int j = 0; j < unicodeVals.Length; j++) + { + if (_indexer.TryGetValue(unicodeVals[j], out long val)) + { + textIds[i][j] = val; + } + } + } + + var textMask = GetTextMask(textIdsLengths); + return (textIds, textMask); + } + } + + // ============================================================================ + // TextToSpeech class + // ============================================================================ + + public class TextToSpeech + { + private readonly Config _cfgs; + private readonly UnicodeProcessor _textProcessor; + private readonly InferenceSession _dpOrt; + private readonly InferenceSession _textEncOrt; + private readonly InferenceSession _vectorEstOrt; + private readonly InferenceSession _vocoderOrt; + public readonly int SampleRate; + private readonly int _baseChunkSize; + private readonly int _chunkCompressFactor; + private readonly int _ldim; + + public TextToSpeech( 
+ Config cfgs, + UnicodeProcessor textProcessor, + InferenceSession dpOrt, + InferenceSession textEncOrt, + InferenceSession vectorEstOrt, + InferenceSession vocoderOrt) + { + _cfgs = cfgs; + _textProcessor = textProcessor; + _dpOrt = dpOrt; + _textEncOrt = textEncOrt; + _vectorEstOrt = vectorEstOrt; + _vocoderOrt = vocoderOrt; + SampleRate = cfgs.AE.SampleRate; + _baseChunkSize = cfgs.AE.BaseChunkSize; + _chunkCompressFactor = cfgs.TTL.ChunkCompressFactor; + _ldim = cfgs.TTL.LatentDim; + } + + private (float[][][] noisyLatent, float[][][] latentMask) SampleNoisyLatent(float[] duration) + { + int bsz = duration.Length; + float wavLenMax = duration.Max() * SampleRate; + var wavLengths = duration.Select(d => (long)(d * SampleRate)).ToArray(); + int chunkSize = _baseChunkSize * _chunkCompressFactor; + int latentLen = (int)((wavLenMax + chunkSize - 1) / chunkSize); + int latentDim = _ldim * _chunkCompressFactor; + + // Generate random noise + var random = new Random(); + var noisyLatent = new float[bsz][][]; + for (int b = 0; b < bsz; b++) + { + noisyLatent[b] = new float[latentDim][]; + for (int d = 0; d < latentDim; d++) + { + noisyLatent[b][d] = new float[latentLen]; + for (int t = 0; t < latentLen; t++) + { + // Box-Muller transform for normal distribution + double u1 = 1.0 - random.NextDouble(); + double u2 = 1.0 - random.NextDouble(); + noisyLatent[b][d][t] = (float)(Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2)); + } + } + } + + var latentMask = Helper.GetLatentMask(wavLengths, _baseChunkSize, _chunkCompressFactor); + + // Apply mask + for (int b = 0; b < bsz; b++) + { + for (int d = 0; d < latentDim; d++) + { + for (int t = 0; t < latentLen; t++) + { + noisyLatent[b][d][t] *= latentMask[b][0][t]; + } + } + } + + return (noisyLatent, latentMask); + } + + private (float[] wav, float[] duration) _Infer(List textList, List langList, Style style, int totalStep, float speed = 1.05f) + { + int bsz = textList.Count; + if (bsz != style.TtlShape[0]) + { + 
throw new ArgumentException("Number of texts must match number of style vectors"); + } + + // Process text + var (textIds, textMask) = _textProcessor.Call(textList, langList); + var textIdsShape = new long[] { bsz, textIds[0].Length }; + var textMaskShape = new long[] { bsz, 1, textMask[0][0].Length }; + + var textIdsTensor = Helper.IntArrayToTensor(textIds, textIdsShape); + var textMaskTensor = Helper.ArrayToTensor(textMask, textMaskShape); + + var styleTtlTensor = new DenseTensor(style.Ttl, style.TtlShape.Select(x => (int)x).ToArray()); + var styleDpTensor = new DenseTensor(style.Dp, style.DpShape.Select(x => (int)x).ToArray()); + + // Run duration predictor + var dpInputs = new List + { + NamedOnnxValue.CreateFromTensor("text_ids", textIdsTensor), + NamedOnnxValue.CreateFromTensor("style_dp", styleDpTensor), + NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor) + }; + using (var dpOutputs = _dpOrt.Run(dpInputs)) + { + var durOnnx = dpOutputs.First(o => o.Name == "duration").AsTensor().ToArray(); + + // Apply speed factor to duration + for (int i = 0; i < durOnnx.Length; i++) + { + durOnnx[i] /= speed; + } + + var textEncInputs = new List + { + NamedOnnxValue.CreateFromTensor("text_ids", textIdsTensor), + NamedOnnxValue.CreateFromTensor("style_ttl", styleTtlTensor), + NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor) + }; + using (var textEncOutputs = _textEncOrt.Run(textEncInputs)) + { + var textEmbTensor = textEncOutputs.First(o => o.Name == "text_emb").AsTensor(); + // Sample noisy latent + var (xt, latentMask) = SampleNoisyLatent(durOnnx); + var latentShape = new long[] { bsz, xt[0].Length, xt[0][0].Length }; + var latentMaskShape = new long[] { bsz, 1, latentMask[0][0].Length }; + + var totalStepArray = Enumerable.Repeat((float)totalStep, bsz).ToArray(); + + // Iterative denoising + for (int step = 0; step < totalStep; step++) + { + var currentStepArray = Enumerable.Repeat((float)step, bsz).ToArray(); + + var vectorEstInputs = new List + 
{ + NamedOnnxValue.CreateFromTensor("noisy_latent", Helper.ArrayToTensor(xt, latentShape)), + NamedOnnxValue.CreateFromTensor("text_emb", textEmbTensor), + NamedOnnxValue.CreateFromTensor("style_ttl", styleTtlTensor), + NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor), + NamedOnnxValue.CreateFromTensor("latent_mask", Helper.ArrayToTensor(latentMask, latentMaskShape)), + NamedOnnxValue.CreateFromTensor("total_step", new DenseTensor(totalStepArray, new int[] { bsz })), + NamedOnnxValue.CreateFromTensor("current_step", new DenseTensor(currentStepArray, new int[] { bsz })) + }; + + using (var vectorEstOutputs = _vectorEstOrt.Run(vectorEstInputs)) + { + var denoisedLatent = vectorEstOutputs.First(o => o.Name == "denoised_latent").AsTensor(); + + // Update xt + int idx = 0; + for (int b = 0; b < bsz; b++) + { + for (int d = 0; d < xt[b].Length; d++) + { + for (int t = 0; t < xt[b][d].Length; t++) + { + xt[b][d][t] = denoisedLatent.GetValue(idx++); + } + } + } + } + + } + + // Run vocoder + var vocoderInputs = new List + { + NamedOnnxValue.CreateFromTensor("latent", Helper.ArrayToTensor(xt, latentShape)) + }; + using (var vocoderOutputs = _vocoderOrt.Run(vocoderInputs)) + { + var wavTensor = vocoderOutputs.First(o => o.Name == "wav_tts").AsTensor(); + + return (wavTensor.ToArray(), durOnnx); + } + + + } + } + + + // Run text encoder + + + + + } + + public (float[] wav, float[] duration) Call(string text, string lang, Style style, int totalStep, float speed = 1.05f, float silenceDuration = 0.3f) + { + if (style.TtlShape[0] != 1) + { + throw new ArgumentException("Single speaker text to speech only supports single style"); + } + + int maxLen = lang == "ko" ? 
120 : 300; + var textList = Helper.ChunkText(text, maxLen); + var wavCat = new List(); + float durCat = 0.0f; + + foreach (var chunk in textList) + { + var (wav, duration) = _Infer(new List { chunk }, new List { lang }, style, totalStep, speed); + + if (wavCat.Count == 0) + { + wavCat.AddRange(wav); + durCat = duration[0]; + } + else + { + int silenceLen = (int)(silenceDuration * SampleRate); + var silence = new float[silenceLen]; + wavCat.AddRange(silence); + wavCat.AddRange(wav); + durCat += duration[0] + silenceDuration; + } + } + + return (wavCat.ToArray(), new float[] { durCat }); + } + + public (float[] wav, float[] duration) Batch(List textList, List langList, Style style, int totalStep, float speed = 1.05f) + { + return _Infer(textList, langList, style, totalStep, speed); + } + } + + // ============================================================================ + // Helper class with utility functions + // ============================================================================ + + public static class Helper + { + // ============================================================================ + // Utility functions + // ============================================================================ + + public static float[][][] LengthToMask(long[] lengths, long maxLen = -1) + { + if (maxLen == -1) + { + maxLen = lengths.Max(); + } + + var mask = new float[lengths.Length][][]; + for (int i = 0; i < lengths.Length; i++) + { + mask[i] = new float[1][]; + mask[i][0] = new float[maxLen]; + for (int j = 0; j < maxLen; j++) + { + mask[i][0][j] = j < lengths[i] ? 
1.0f : 0.0f; + } + } + return mask; + } + + public static float[][][] GetLatentMask(long[] wavLengths, int baseChunkSize, int chunkCompressFactor) + { + int latentSize = baseChunkSize * chunkCompressFactor; + var latentLengths = wavLengths.Select(len => (len + latentSize - 1) / latentSize).ToArray(); + return LengthToMask(latentLengths); + } + + // ============================================================================ + // ONNX model loading + // ============================================================================ + + public static InferenceSession LoadOnnx(string onnxPath, SessionOptions opts) + { + return new InferenceSession(onnxPath, opts); + } + + public static (InferenceSession dp, InferenceSession textEnc, InferenceSession vectorEst, InferenceSession vocoder) + LoadOnnxAll(string onnxDir, SessionOptions opts) + { + var dpPath = Path.Combine(onnxDir, "duration_predictor.onnx"); + var textEncPath = Path.Combine(onnxDir, "text_encoder.onnx"); + var vectorEstPath = Path.Combine(onnxDir, "vector_estimator.onnx"); + var vocoderPath = Path.Combine(onnxDir, "vocoder.onnx"); + + return ( + LoadOnnx(dpPath, opts), + LoadOnnx(textEncPath, opts), + LoadOnnx(vectorEstPath, opts), + LoadOnnx(vocoderPath, opts) + ); + } + + // ============================================================================ + // Configuration loading + // ============================================================================ + + public static Config LoadCfgs(string onnxDir) + { + var cfgPath = Path.Combine(onnxDir, "tts.json"); + var json = File.ReadAllText(cfgPath); + + using (var doc = JsonDocument.Parse(json)) + { + var root = doc.RootElement; + + return new Config + { + AE = new Config.AEConfig + { + SampleRate = root.GetProperty("ae").GetProperty("sample_rate").GetInt32(), + BaseChunkSize = root.GetProperty("ae").GetProperty("base_chunk_size").GetInt32() + }, + TTL = new Config.TTLConfig + { + ChunkCompressFactor = 
root.GetProperty("ttl").GetProperty("chunk_compress_factor").GetInt32(), + LatentDim = root.GetProperty("ttl").GetProperty("latent_dim").GetInt32() + } + }; + } + + } + + public static UnicodeProcessor LoadTextProcessor(string onnxDir) + { + var unicodeIndexerPath = Path.Combine(onnxDir, "unicode_indexer.json"); + return new UnicodeProcessor(unicodeIndexerPath); + } + + // ============================================================================ + // Voice style loading + // ============================================================================ + + public static Style LoadVoiceStyle(List voiceStylePaths, bool verbose = false) + { + int bsz = voiceStylePaths.Count; + + // Read first file to get dimensions + var firstJson = File.ReadAllText(voiceStylePaths[0]); + using (var firstDoc = JsonDocument.Parse(firstJson)) + { + var firstRoot = firstDoc.RootElement; + + var ttlDims = ParseInt64Array(firstRoot.GetProperty("style_ttl").GetProperty("dims")); + var dpDims = ParseInt64Array(firstRoot.GetProperty("style_dp").GetProperty("dims")); + + long ttlDim1 = ttlDims[1]; + long ttlDim2 = ttlDims[2]; + long dpDim1 = dpDims[1]; + long dpDim2 = dpDims[2]; + + // Pre-allocate arrays with full batch size + int ttlSize = (int)(bsz * ttlDim1 * ttlDim2); + int dpSize = (int)(bsz * dpDim1 * dpDim2); + var ttlFlat = new float[ttlSize]; + var dpFlat = new float[dpSize]; + + // Fill in the data + for (int i = 0; i < bsz; i++) + { + var json = File.ReadAllText(voiceStylePaths[i]); + using (var doc = JsonDocument.Parse(json)) + { + var root = doc.RootElement; + + // Flatten data + var ttlData3D = ParseFloat3DArray(root.GetProperty("style_ttl").GetProperty("data")); + var ttlDataFlat = new List(); + foreach (var batch in ttlData3D) + { + foreach (var row in batch) + { + ttlDataFlat.AddRange(row); + } + } + + var dpData3D = ParseFloat3DArray(root.GetProperty("style_dp").GetProperty("data")); + var dpDataFlat = new List(); + foreach (var batch in dpData3D) + { + foreach (var row in 
batch) + { + dpDataFlat.AddRange(row); + } + } + + // Copy to pre-allocated array + int ttlOffset = (int)(i * ttlDim1 * ttlDim2); + ttlDataFlat.CopyTo(ttlFlat, ttlOffset); + + int dpOffset = (int)(i * dpDim1 * dpDim2); + dpDataFlat.CopyTo(dpFlat, dpOffset); + } + + } + + var ttlShape = new long[] { bsz, ttlDim1, ttlDim2 }; + var dpShape = new long[] { bsz, dpDim1, dpDim2 }; + + if (verbose) + { + Console.WriteLine($"Loaded {bsz} voice styles"); + } + + return new Style(ttlFlat, ttlShape, dpFlat, dpShape); + } + + } + + private static float[][][] ParseFloat3DArray(JsonElement element) + { + var result = new List(); + foreach (var batch in element.EnumerateArray()) + { + var batch2D = new List(); + foreach (var row in batch.EnumerateArray()) + { + var rowData = new List(); + foreach (var val in row.EnumerateArray()) + { + rowData.Add(val.GetSingle()); + } + batch2D.Add(rowData.ToArray()); + } + result.Add(batch2D.ToArray()); + } + return result.ToArray(); + } + + private static long[] ParseInt64Array(JsonElement element) + { + var result = new List(); + foreach (var val in element.EnumerateArray()) + { + result.Add(val.GetInt64()); + } + return result.ToArray(); + } + + // ============================================================================ + // TextToSpeech loading + // ============================================================================ + + public static TextToSpeech LoadTextToSpeech(string onnxDir, bool useGpu = false) + { + var opts = new SessionOptions(); + if (useGpu) + { + throw new NotImplementedException("GPU mode is not supported yet"); + } + else + { + Console.WriteLine("Using CPU for inference"); + } + + var cfgs = LoadCfgs(onnxDir); + var (dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) = LoadOnnxAll(onnxDir, opts); + var textProcessor = LoadTextProcessor(onnxDir); + + return new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt); + } + + // 
============================================================================ + // WAV file writing + // ============================================================================ + + public static void WriteWavFile(string filename, float[] audioData, int sampleRate) + { + using (var writer = new BinaryWriter(File.Open(filename, FileMode.Create))) + { + int numChannels = 1; + int bitsPerSample = 16; + int byteRate = sampleRate * numChannels * bitsPerSample / 8; + short blockAlign = (short)(numChannels * bitsPerSample / 8); + int dataSize = audioData.Length * bitsPerSample / 8; + + // RIFF header + writer.Write(Encoding.ASCII.GetBytes("RIFF")); + writer.Write(36 + dataSize); + writer.Write(Encoding.ASCII.GetBytes("WAVE")); + + // fmt chunk + writer.Write(Encoding.ASCII.GetBytes("fmt ")); + writer.Write(16); // fmt chunk size + writer.Write((short)1); // audio format (PCM) + writer.Write((short)numChannels); + writer.Write(sampleRate); + writer.Write(byteRate); + writer.Write(blockAlign); + writer.Write((short)bitsPerSample); + + // data chunk + writer.Write(Encoding.ASCII.GetBytes("data")); + writer.Write(dataSize); + + // Write audio data + foreach (var sample in audioData) + { + float clamped = Math.Max(-1.0f, Math.Min(1.0f, sample)); + short intSample = (short)(clamped * 32767); + writer.Write(intSample); + } + } + + + } + + // ============================================================================ + // Tensor conversion utilities + // ============================================================================ + + public static DenseTensor ArrayToTensor(float[][][] array, long[] dims) + { + var flat = new List(); + foreach (var batch in array) + { + foreach (var row in batch) + { + flat.AddRange(row); + } + } + return new DenseTensor(flat.ToArray(), dims.Select(x => (int)x).ToArray()); + } + + public static DenseTensor IntArrayToTensor(long[][] array, long[] dims) + { + var flat = new List(); + foreach (var row in array) + { + flat.AddRange(row); + } + 
return new DenseTensor(flat.ToArray(), dims.Select(x => (int)x).ToArray()); + } + + // ============================================================================ + // Timer utility + // ============================================================================ + + public static T Timer(string name, Func func) + { + var start = DateTime.Now; + Console.WriteLine($"{name}..."); + var result = func(); + var elapsed = (DateTime.Now - start).TotalSeconds; + Console.WriteLine($" -> {name} completed in {elapsed:F2} sec"); + return result; + } + + public static string SanitizeFilename(string text, int maxLen) + { + var result = new StringBuilder(); + int count = 0; + foreach (char c in text) + { + if (count >= maxLen) break; + if (char.IsLetterOrDigit(c)) + { + result.Append(c); + } + else + { + result.Append('_'); + } + count++; + } + return result.ToString(); + } + + // ============================================================================ + // Chunk text + // ============================================================================ + + public static List ChunkText(string text, int maxLen = 300) + { + var chunks = new List(); + + // Split by paragraph (two or more newlines) + var paragraphRegex = new Regex(@"\n\s*\n+"); + var paragraphs = paragraphRegex.Split(text.Trim()) + .Select(p => p.Trim()) + .Where(p => !string.IsNullOrEmpty(p)) + .ToList(); + + // Split by sentence boundaries, excluding abbreviations + var sentenceRegex = new Regex(@"(? + /// The main entry point for the application. 
+ /// + [STAThread] + static void Main() + { + Application.EnableVisualStyles(); + Application.SetCompatibleTextRenderingDefault(false); + Application.Run(new fMain()); + } + } +} diff --git a/HMI/SubProject/tts/Properties/AssemblyInfo.cs b/HMI/SubProject/tts/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..10e3f4f --- /dev/null +++ b/HMI/SubProject/tts/Properties/AssemblyInfo.cs @@ -0,0 +1,17 @@ +using System.Reflection; +using System.Runtime.InteropServices; + +[assembly: AssemblyTitle("Supertonic.WinForms")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("")] +[assembly: AssemblyProduct("Supertonic.WinForms")] +[assembly: AssemblyCopyright("Copyright © 2026")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +[assembly: ComVisible(false)] +[assembly: Guid("bd3e8373-c40c-4f7f-aa18-6990f1cfd21a")] + +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/HMI/SubProject/tts/Properties/Resources.Designer.cs b/HMI/SubProject/tts/Properties/Resources.Designer.cs new file mode 100644 index 0000000..87457d8 --- /dev/null +++ b/HMI/SubProject/tts/Properties/Resources.Designer.cs @@ -0,0 +1,63 @@ +//------------------------------------------------------------------------------ +// +// 이 코드는 도구를 사용하여 생성되었습니다. +// 런타임 버전:4.0.30319.42000 +// +// 파일 내용을 변경하면 잘못된 동작이 발생할 수 있으며, 코드를 다시 생성하면 +// 이러한 변경 내용이 손실됩니다. +// +//------------------------------------------------------------------------------ + +namespace Supertonic.WinForms.Properties { + using System; + + + /// + /// 지역화된 문자열 등을 찾기 위한 강력한 형식의 리소스 클래스입니다. + /// + // 이 클래스는 ResGen 또는 Visual Studio와 같은 도구를 통해 StronglyTypedResourceBuilder + // 클래스에서 자동으로 생성되었습니다. + // 멤버를 추가하거나 제거하려면 .ResX 파일을 편집한 다음 /str 옵션을 사용하여 ResGen을 + // 다시 실행하거나 VS 프로젝트를 다시 빌드하십시오. 
+ [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")] + [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + internal class Resources { + + private static global::System.Resources.ResourceManager resourceMan; + + private static global::System.Globalization.CultureInfo resourceCulture; + + [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] + internal Resources() { + } + + /// + /// 이 클래스에서 사용하는 캐시된 ResourceManager 인스턴스를 반환합니다. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Resources.ResourceManager ResourceManager { + get { + if (object.ReferenceEquals(resourceMan, null)) { + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("Supertonic.WinForms.Properties.Resources", typeof(Resources).Assembly); + resourceMan = temp; + } + return resourceMan; + } + } + + /// + /// 이 강력한 형식의 리소스 클래스를 사용하여 모든 리소스 조회에 대한 현재 스레드의 CurrentUICulture + /// 속성을 재정의합니다. 
+ /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Globalization.CultureInfo Culture { + get { + return resourceCulture; + } + set { + resourceCulture = value; + } + } + } +} diff --git a/HMI/SubProject/tts/Properties/Resources.resx b/HMI/SubProject/tts/Properties/Resources.resx new file mode 100644 index 0000000..4fdb1b6 --- /dev/null +++ b/HMI/SubProject/tts/Properties/Resources.resx @@ -0,0 +1,101 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 1.3 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.3500.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.3500.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + \ No newline at end of file diff --git a/HMI/SubProject/tts/Properties/Settings.Designer.cs b/HMI/SubProject/tts/Properties/Settings.Designer.cs new file mode 100644 index 0000000..405de7e --- /dev/null +++ b/HMI/SubProject/tts/Properties/Settings.Designer.cs @@ -0,0 +1,26 @@ +//------------------------------------------------------------------------------ +// +// 이 코드는 도구를 사용하여 생성되었습니다. +// 런타임 버전:4.0.30319.42000 +// +// 파일 내용을 변경하면 잘못된 동작이 발생할 수 있으며, 코드를 다시 생성하면 +// 이러한 변경 내용이 손실됩니다. 
+// +//------------------------------------------------------------------------------ + +namespace Supertonic.WinForms.Properties { + + + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "15.9.0.0")] + internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase { + + private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings()))); + + public static Settings Default { + get { + return defaultInstance; + } + } + } +} diff --git a/HMI/SubProject/tts/Properties/Settings.settings b/HMI/SubProject/tts/Properties/Settings.settings new file mode 100644 index 0000000..049245f --- /dev/null +++ b/HMI/SubProject/tts/Properties/Settings.settings @@ -0,0 +1,6 @@ + + + + + + diff --git a/HMI/SubProject/tts/Supertonic.WinForms.csproj b/HMI/SubProject/tts/Supertonic.WinForms.csproj new file mode 100644 index 0000000..7329d16 --- /dev/null +++ b/HMI/SubProject/tts/Supertonic.WinForms.csproj @@ -0,0 +1,91 @@ + + + + + Debug + AnyCPU + {BD3E8373-C40C-4F7F-AA18-6990F1CFD21A} + WinExe + Supertonic.WinForms + Supertonic.WinForms + v4.8 + 512 + true + true + + + x64 + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + bin\Debug\ + + + + + + + + + + + + + + + + + Form + + + fMain.cs + + + + + + True + True + Resources.resx + + + True + True + Settings.settings + + + fMain.cs + + + ResXFileCodeGenerator + Resources.Designer.cs + + + + SettingsSingleFileGenerator + Settings.Designer.cs + + + 1.24.1 + + + 10.0.2 + + + + \ No newline at end of file diff --git a/HMI/SubProject/tts/Supertonic.WinForms.sln b/HMI/SubProject/tts/Supertonic.WinForms.sln new file mode 100644 index 0000000..8b7fc0d --- /dev/null +++ 
namespace Supertonic.WinForms
{
    // Designer half of the fMain form: declares the controls and builds the
    // layout. Event handlers referenced here (btnGenerate_Click, fMain_Load)
    // are expected to live in the other part of this partial class.
    partial class fMain
    {
        // Container for designer-managed components; stays null here because
        // InitializeComponent never assigns it.
        private System.ComponentModel.IContainer components = null;

        // Disposes the designer component container (only when disposing is true
        // and the container exists), then defers to the base implementation.
        protected override void Dispose(bool disposing)
        {
            if (disposing && (components != null))
            {
                components.Dispose();
            }
            base.Dispose(disposing);
        }

        #region Windows Form Designer generated code

        // Creates every control, sets its properties, wires the Click/Load
        // handlers and adds the controls to the form. This method is maintained
        // by the WinForms designer; keep its statement shape intact.
        private void InitializeComponent()
        {
            this.txtInput = new System.Windows.Forms.TextBox();
            this.btnGenerate = new System.Windows.Forms.Button();
            this.cmbLang = new System.Windows.Forms.ComboBox();
            this.txtStylePath = new System.Windows.Forms.TextBox();
            this.numSteps = new System.Windows.Forms.NumericUpDown();
            this.numSpeed = new System.Windows.Forms.NumericUpDown();
            this.lblText = new System.Windows.Forms.Label();
            this.lblLang = new System.Windows.Forms.Label();
            this.lblStyle = new System.Windows.Forms.Label();
            this.lblSteps = new System.Windows.Forms.Label();
            this.lblSpeed = new System.Windows.Forms.Label();
            this.txtLog = new System.Windows.Forms.TextBox();
            ((System.ComponentModel.ISupportInitialize)(this.numSteps)).BeginInit();
            ((System.ComponentModel.ISupportInitialize)(this.numSpeed)).BeginInit();
            this.SuspendLayout();
            // 
            // txtInput
            // 
            this.txtInput.Location = new System.Drawing.Point(12, 29);
            this.txtInput.Multiline = true;
            this.txtInput.Name = "txtInput";
            this.txtInput.Size = new System.Drawing.Size(460, 60);
            this.txtInput.TabIndex = 0;
            this.txtInput.Text = "This morning, I took a walk in the park.";
            // 
            // btnGenerate
            // 
            this.btnGenerate.Location = new System.Drawing.Point(372, 169);
            this.btnGenerate.Name = "btnGenerate";
            this.btnGenerate.Size = new System.Drawing.Size(100, 30);
            this.btnGenerate.TabIndex = 1;
            this.btnGenerate.Text = "Generate TTS";
            this.btnGenerate.UseVisualStyleBackColor = true;
            this.btnGenerate.Click += new System.EventHandler(this.btnGenerate_Click);
            // 
            // cmbLang
            // 
            this.cmbLang.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
            this.cmbLang.FormattingEnabled = true;
            this.cmbLang.Location = new System.Drawing.Point(12, 114);
            this.cmbLang.Name = "cmbLang";
            this.cmbLang.Size = new System.Drawing.Size(80, 21);
            this.cmbLang.TabIndex = 2;
            // 
            // txtStylePath
            // 
            this.txtStylePath.Location = new System.Drawing.Point(110, 114);
            this.txtStylePath.Name = "txtStylePath";
            this.txtStylePath.Size = new System.Drawing.Size(362, 20);
            this.txtStylePath.TabIndex = 3;
            this.txtStylePath.Text = "assets/voice_styles/M1.json";
            // 
            // numSteps
            // 
            // NOTE(review): Minimum/Maximum are not set here, so the control keeps
            // its defaults (0..100 per the NumericUpDown docs); 0 is selectable.
            this.numSteps.Location = new System.Drawing.Point(12, 169);
            this.numSteps.Name = "numSteps";
            this.numSteps.Size = new System.Drawing.Size(80, 20);
            this.numSteps.TabIndex = 4;
            // decimal bits { 5, 0, 0, 0 }: value 5 with scale 0.
            this.numSteps.Value = new decimal(new int[] { 5, 0, 0, 0 });
            // 
            // numSpeed
            // 
            // NOTE(review): Minimum is left at its default here as well, so a
            // speed of 0.00 can be entered.
            this.numSpeed.DecimalPlaces = 2;
            // 131072 == 0x20000, i.e. a decimal scale of 2: increment is 0.05.
            this.numSpeed.Increment = new decimal(new int[] { 5, 0, 0, 131072 });
            this.numSpeed.Location = new System.Drawing.Point(110, 169);
            this.numSpeed.Name = "numSpeed";
            this.numSpeed.Size = new System.Drawing.Size(80, 20);
            this.numSpeed.TabIndex = 5;
            // Same scale of 2: initial value is 1.05.
            this.numSpeed.Value = new decimal(new int[] { 105, 0, 0, 131072 });
            // 
            // lblText
            // 
            this.lblText.AutoSize = true;
            this.lblText.Location = new System.Drawing.Point(12, 13);
            this.lblText.Name = "lblText";
            this.lblText.Size = new System.Drawing.Size(28, 13);
            this.lblText.TabIndex = 6;
            this.lblText.Text = "Text";
            // 
            // lblLang
            // 
            this.lblLang.AutoSize = true;
            this.lblLang.Location = new System.Drawing.Point(12, 98);
            this.lblLang.Name = "lblLang";
            this.lblLang.Size = new System.Drawing.Size(55, 13);
            this.lblLang.TabIndex = 7;
            this.lblLang.Text = "Language";
            // 
            // lblStyle
            // 
            this.lblStyle.AutoSize = true;
            this.lblStyle.Location = new System.Drawing.Point(110, 98);
            this.lblStyle.Name = "lblStyle";
            this.lblStyle.Size = new System.Drawing.Size(87, 13);
            this.lblStyle.TabIndex = 8;
            this.lblStyle.Text = "Voice Style Path";
            // 
            // lblSteps
            // 
            this.lblSteps.AutoSize = true;
            this.lblSteps.Location = new System.Drawing.Point(12, 153);
            this.lblSteps.Name = "lblSteps";
            this.lblSteps.Size = new System.Drawing.Size(61, 13);
            this.lblSteps.TabIndex = 9;
            this.lblSteps.Text = "Total Steps";
            // 
            // lblSpeed
            // 
            this.lblSpeed.AutoSize = true;
            this.lblSpeed.Location = new System.Drawing.Point(110, 153);
            this.lblSpeed.Name = "lblSpeed";
            this.lblSpeed.Size = new System.Drawing.Size(38, 13);
            this.lblSpeed.TabIndex = 10;
            this.lblSpeed.Text = "Speed";
            // 
            // txtLog
            // 
            this.txtLog.Location = new System.Drawing.Point(12, 214);
            this.txtLog.Multiline = true;
            this.txtLog.Name = "txtLog";
            this.txtLog.ReadOnly = true;
            this.txtLog.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
            this.txtLog.Size = new System.Drawing.Size(460, 150);
            this.txtLog.TabIndex = 11;
            // 
            // fMain
            // 
            this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
            this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
            this.ClientSize = new System.Drawing.Size(484, 376);
            this.Controls.Add(this.txtLog);
            this.Controls.Add(this.lblSpeed);
            this.Controls.Add(this.lblSteps);
            this.Controls.Add(this.lblStyle);
            this.Controls.Add(this.lblLang);
            this.Controls.Add(this.lblText);
            this.Controls.Add(this.numSpeed);
            this.Controls.Add(this.numSteps);
            this.Controls.Add(this.txtStylePath);
            this.Controls.Add(this.cmbLang);
            this.Controls.Add(this.btnGenerate);
            this.Controls.Add(this.txtInput);
            this.Name = "fMain";
            this.Text = "Supertonic TTS (WinForms 4.8)";
            this.Load += new System.EventHandler(this.fMain_Load);
            ((System.ComponentModel.ISupportInitialize)(this.numSteps)).EndInit();
            ((System.ComponentModel.ISupportInitialize)(this.numSpeed)).EndInit();
            this.ResumeLayout(false);
            this.PerformLayout();

        }

        #endregion

        // Control fields assigned in InitializeComponent.
        private System.Windows.Forms.TextBox txtInput;
        private System.Windows.Forms.Button btnGenerate;
        private System.Windows.Forms.ComboBox cmbLang;
        private System.Windows.Forms.TextBox txtStylePath;
        private System.Windows.Forms.NumericUpDown numSteps;
        private System.Windows.Forms.NumericUpDown numSpeed;
        private System.Windows.Forms.Label lblText;
        private System.Windows.Forms.Label lblLang;
        private System.Windows.Forms.Label lblStyle;
        private System.Windows.Forms.Label lblSteps;
        private System.Windows.Forms.Label lblSpeed;
        private System.Windows.Forms.TextBox txtLog;
    }
}
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Windows.Forms;

namespace Supertonic.WinForms
{
    /// <summary>
    /// Main window of the TTS demo: collects the text, language, voice style,
    /// step count and speed from the form, runs synthesis on a background
    /// thread and writes the resulting audio to a WAV file under "results".
    /// </summary>
    public partial class fMain : Form
    {
        // Paths are relative to the process working directory.
        private const string OnnxDir = "assets/onnx"; // This should be updated if assets are moved
        private const string ResultsDir = "results";
        private const string DefaultStylePath = "assets/voice_styles/M1.json";

        // Loaded lazily on the first Generate click and reused afterwards.
        private TextToSpeech _tts;

        public fMain()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler for the Generate button. Validates the inputs, loads the
        /// model on first use, synthesizes speech off the UI thread and saves it.
        /// Any failure is logged and shown in a message box.
        /// </summary>
        private async void btnGenerate_Click(object sender, EventArgs e)
        {
            // Block re-entrancy: without this a second click during the awaits
            // below could load the model twice or start an overlapping synthesis.
            btnGenerate.Enabled = false;
            try
            {
                string text = txtInput.Text;
                string lang = cmbLang.SelectedItem?.ToString() ?? "en";
                string stylePath = txtStylePath.Text;
                int totalStep = (int)numSteps.Value;
                float speed = (float)numSpeed.Value;

                if (string.IsNullOrWhiteSpace(text))
                {
                    MessageBox.Show("Please enter text.");
                    return;
                }

                // The numeric inputs may permit 0 (the NumericUpDown default
                // minimum); a zero step count or zero speed is not a usable
                // synthesis setting, so reject it before touching the model.
                if (totalStep < 1)
                {
                    MessageBox.Show("Total steps must be at least 1.");
                    return;
                }

                if (speed <= 0f)
                {
                    MessageBox.Show("Speed must be greater than 0.");
                    return;
                }

                if (_tts == null)
                {
                    Log("Loading TTS model...");
                    _tts = await System.Threading.Tasks.Task.Run(() => Helper.LoadTextToSpeech(OnnxDir, false));
                    Log("TTS model loaded.");
                }

                Log($"Generating speech: \"{text}\" ({lang})");

                var style = Helper.LoadVoiceStyle(new List<string> { stylePath }, true);

                // Capture the model in a local so the background lambda does not
                // read the field from another thread.
                var tts = _tts;
                var result = await System.Threading.Tasks.Task.Run(() => tts.Call(text, lang, style, totalStep, speed));

                // CreateDirectory is a no-op when the directory already exists.
                Directory.CreateDirectory(ResultsDir);

                string fname = $"{Helper.SanitizeFilename(text, 20)}_{DateTime.Now:HHmmss}.wav";
                string outputPath = Path.Combine(ResultsDir, fname);

                Helper.WriteWavFile(outputPath, result.wav, tts.SampleRate);
                Log($"Saved: {outputPath}");

                // The form may have been closed while synthesis was running.
                if (!IsDisposed)
                {
                    MessageBox.Show($"Synthesis completed successfully!\nSaved to: {outputPath}");
                }
            }
            catch (Exception ex)
            {
                // Top-level UI handler: report instead of letting the exception
                // escape an async void method.
                Log($"Error: {ex.Message}");
                if (!IsDisposed)
                {
                    MessageBox.Show($"Error: {ex.Message}");
                }
            }
            finally
            {
                if (!IsDisposed)
                {
                    btnGenerate.Enabled = true;
                }
            }
        }

        /// <summary>
        /// Appends a timestamped line to the log box and scrolls to the end.
        /// Marshals to the control's thread when called from another thread and
        /// does nothing once the form or the log box has been disposed.
        /// </summary>
        private void Log(string msg)
        {
            // After the form is closed the text box is gone; touching it would
            // throw from inside the click handler's catch block.
            if (IsDisposed || txtLog.IsDisposed)
            {
                return;
            }

            if (txtLog.InvokeRequired)
            {
                txtLog.Invoke(new Action(() => Log(msg)));
                return;
            }

            txtLog.AppendText($"[{DateTime.Now:HH:mm:ss}] {msg}\r\n");
            txtLog.SelectionStart = txtLog.Text.Length;
            txtLog.ScrollToCaret();
        }

        /// <summary>
        /// Load handler: fills the language list, selects the first entry and
        /// pre-fills the voice style path when the default style file exists.
        /// </summary>
        private void fMain_Load(object sender, EventArgs e)
        {
            cmbLang.Items.AddRange(Languages.Available);
            cmbLang.SelectedIndex = 0;

            // Set default style path if exists
            if (File.Exists(DefaultStylePath))
            {
                txtStylePath.Text = DefaultStylePath;
            }
        }
    }
}