add supertonic

This commit is contained in:
backuppc
2025-12-10 11:39:15 +09:00
parent 3695ab0044
commit 868fa2deec
22 changed files with 54223 additions and 1 deletions

View File

@@ -83,6 +83,75 @@ namespace AGVNavigationCore.Controls
// UI 정보 그리기 (변환 없이) // UI 정보 그리기 (변환 없이)
if (_showGrid) if (_showGrid)
DrawUIInfo(g); DrawUIInfo(g);
// 동기화 화면 그리기 (변환 없이, 최상위)
if (_canvasMode == CanvasMode.Sync)
{
DrawSyncScreen(g);
}
}
/// <summary>
/// Paints the synchronization overlay: a dimmed backdrop, the headline message,
/// a progress bar with a percentage label, and an optional detail line.
/// </summary>
/// <param name="g">Graphics surface to draw on.</param>
private void DrawSyncScreen(Graphics g)
{
// Semi-transparent black backdrop (alpha 200 of 255) over the whole client area
using (var brush = new SolidBrush(Color.FromArgb(200, 0, 0, 0)))
{
g.FillRectangle(brush, this.ClientRectangle);
}
// Everything below is laid out relative to the control's center point
var center = new Point(Width / 2, Height / 2);
// Fonts and text brush are created per call and disposed when the using block ends
using (var fontTitle = new Font("Malgun Gothic", 24, FontStyle.Bold))
using (var fontDetail = new Font("Malgun Gothic", 14))
using (var brushText = new SolidBrush(Color.White))
{
// Headline message: horizontally centered, shifted 60 px above the vertical center
var sizeTitle = g.MeasureString(_syncMessage, fontTitle);
g.DrawString(_syncMessage, fontTitle, brushText,
center.X - sizeTitle.Width / 2,
center.Y - sizeTitle.Height / 2 - 60);
// Progress bar background: fixed 500x30 px, horizontally centered, 10 px below center
int barWidth = 500;
int barHeight = 30;
int barX = center.X - barWidth / 2;
int barY = center.Y + 10;
using (var brushBarBg = new SolidBrush(Color.FromArgb(64, 64, 64)))
{
g.FillRectangle(brushBarBg, barX, barY, barWidth, barHeight);
}
g.DrawRectangle(Pens.Gray, barX, barY, barWidth, barHeight);
// Progress bar fill, inset 2 px on every side of the background rectangle
if (_syncProgress > 0)
{
using (var brushProgress = new SolidBrush(Color.LimeGreen))
{
// _syncProgress is used as a fraction of the inner bar width
int fillWidth = (int)((barWidth - 4) * _syncProgress);
// Skip the draw when the fraction truncates to zero pixels
if (fillWidth > 0)
g.FillRectangle(brushProgress, barX + 2, barY + 2, fillWidth, barHeight - 4);
}
}
// Percentage label with no decimals, drawn over the bar and horizontally centered
string progressText = $"{(_syncProgress * 100):F0}%";
var sizeProgress = g.MeasureString(progressText, fontDetail);
g.DrawString(progressText, fontDetail, brushText,
center.X - sizeProgress.Width / 2,
barY + 5);
// Optional detail line, 20 px below the bar; omitted when null or empty
if (!string.IsNullOrEmpty(_syncDetail))
{
var sizeDetail = g.MeasureString(_syncDetail, fontDetail);
g.DrawString(_syncDetail, fontDetail, brushText,
center.X - sizeDetail.Width / 2,
barY + barHeight + 20);
}
}
} }
private void DrawGrid(Graphics g) private void DrawGrid(Graphics g)

View File

@@ -35,7 +35,8 @@ namespace AGVNavigationCore.Controls
/// </summary> /// </summary>
public enum CanvasMode public enum CanvasMode
{ {
Edit // 편집 가능 (맵 에디터) Edit, // 편집 가능 (맵 에디터)
Sync // 동기화 모드 (장비 설정 동기화)
} }
/// <summary> /// <summary>
@@ -116,6 +117,11 @@ namespace AGVNavigationCore.Controls
// RFID 중복 검사 // RFID 중복 검사
private HashSet<string> _duplicateRfidNodes = new HashSet<string>(); private HashSet<string> _duplicateRfidNodes = new HashSet<string>();
// 동기화 모드 관련
private string _syncMessage = "동기화 중...";
private float _syncProgress = 0.0f;
private string _syncDetail = "";
// 브러쉬 및 펜 // 브러쉬 및 펜
private Brush _normalNodeBrush; private Brush _normalNodeBrush;
private Brush _rotationNodeBrush; private Brush _rotationNodeBrush;
@@ -546,6 +552,40 @@ namespace AGVNavigationCore.Controls
} }
} }
/// <summary>
/// Stores the synchronization status to display, switches the canvas into
/// <see cref="CanvasMode.Sync"/> when it is not already in that mode, and
/// requests a repaint.
/// </summary>
/// <param name="message">Headline message.</param>
/// <param name="progress">Completion ratio; clamped into the range 0.0 to 1.0.</param>
/// <param name="detail">Secondary message; defaults to an empty string.</param>
public void SetSyncStatus(string message, float progress, string detail = "")
{
    // Clamp in two steps: cap at 1.0 first, then floor at 0.0.
    float capped = Math.Min(1.0f, progress);
    float clamped = Math.Max(0.0f, capped);

    _syncMessage = message;
    _syncProgress = clamped;
    _syncDetail = detail;

    // Only touch the mode (and refresh the mode UI) on the transition into sync mode.
    bool enteringSyncMode = _canvasMode != CanvasMode.Sync;
    if (enteringSyncMode)
    {
        _canvasMode = CanvasMode.Sync;
        UpdateModeUI();
    }

    Invalidate();
}
/// <summary>
/// Leaves sync mode and repaints. Does nothing when the canvas is not
/// currently in <see cref="CanvasMode.Sync"/>.
/// </summary>
public void ExitSyncMode()
{
    if (_canvasMode != CanvasMode.Sync)
    {
        return;
    }

    // Always falls back to Edit; the mode that was active before sync is not restored.
    _canvasMode = CanvasMode.Edit;
    UpdateModeUI();
    Invalidate();
}
#endregion #endregion
#region Cleanup #region Cleanup

View File

@@ -107,6 +107,13 @@ namespace Project
var item = synlist.ElementAt(synidx); var item = synlist.ElementAt(synidx);
UpdateProgressStatus(stepTime.TotalSeconds, 5, $"SYNC :{item.Key}"); UpdateProgressStatus(stepTime.TotalSeconds, 5, $"SYNC :{item.Key}");
PUB.AGV.AGVCommand(item.Key, item.Value); PUB.AGV.AGVCommand(item.Key, item.Value);
// 캔버스에 동기화 상태 표시
if (PUB._mapCanvas != null)
{
float progress = (float)synidx / synlist.Count;
PUB._mapCanvas.SetSyncStatus("장비 설정 동기화 중...", progress, $"항목: {item.Key} ({synidx + 1}/{synlist.Count})");
}
} }
LastCommandTime = DateTime.Now; LastCommandTime = DateTime.Now;
PUB.sm.UpdateRunStepSeq(); PUB.sm.UpdateRunStepSeq();
@@ -143,6 +150,11 @@ namespace Project
{ {
PUB.AddEEDB($"SYNC완료({PUB.Result.TargetPos})"); PUB.AddEEDB($"SYNC완료({PUB.Result.TargetPos})");
UpdateProgressStatus(stepTime.TotalSeconds, 5, "SYNC : 완료"); UpdateProgressStatus(stepTime.TotalSeconds, 5, "SYNC : 완료");
// 동기화 완료 시 캔버스 모드 복귀
if (PUB._mapCanvas != null)
PUB._mapCanvas.SetSyncStatus("동기화 완료!", 1.0f, "잠시 후 메인 화면으로 이동합니다.");
LastCommandTime = DateTime.Now; LastCommandTime = DateTime.Now;
PUB.sm.UpdateRunStepSeq(); PUB.sm.UpdateRunStepSeq();
return false; return false;

View File

@@ -156,6 +156,10 @@ namespace Project
// 장치 관리 태스크 시작 (IDLE 진입 시 한 번만) // 장치 관리 태스크 시작 (IDLE 진입 시 한 번만)
StartDeviceManagementTask(); StartDeviceManagementTask();
// 동기화 모드 종료 (혹시 남아있을 경우)
if (PUB._mapCanvas != null)
PUB._mapCanvas.ExitSyncMode();
} }
//자동소거기능 //자동소거기능

View File

@@ -0,0 +1,135 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace Supertonic
{
class Program
{
/// <summary>
/// Command-line options for the demo program, each pre-populated with its default.
/// </summary>
class Args
{
    // ---- model / runtime ----

    /// <summary>Directory containing the ONNX models and their JSON assets.</summary>
    public string OnnxDir { get; set; } = "assets/onnx";

    /// <summary>Set by <c>--use-gpu</c>.</summary>
    public bool UseGpu { get; set; }

    // ---- synthesis parameters ----

    /// <summary>Step count passed to the synthesizer (<c>--total-step</c>).</summary>
    public int TotalStep { get; set; } = 5;

    /// <summary>Speed factor passed to the synthesizer (<c>--speed</c>).</summary>
    public float Speed { get; set; } = 1.05f;

    /// <summary>How many times the synthesis loop is repeated (<c>--n-test</c>).</summary>
    public int NTest { get; set; } = 4;

    /// <summary>Set by <c>--batch</c>; selects the batch code path.</summary>
    public bool Batch { get; set; }

    // ---- inputs / outputs ----

    /// <summary>Voice style JSON paths (comma-separated on the command line).</summary>
    public List<string> VoiceStyle { get; set; } = new List<string>
    {
        "assets/voice_styles/M1.json"
    };

    /// <summary>Texts to synthesize (separated by '|' on the command line).</summary>
    public List<string> Text { get; set; } = new List<string>
    {
        "This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    };

    /// <summary>Directory the generated WAV files are written to.</summary>
    public string SaveDir { get; set; } = "results";
}
/// <summary>
/// Parses the command line into an <see cref="Args"/> instance.
/// </summary>
/// <remarks>
/// Numeric values are parsed with the invariant culture so that <c>--speed 1.05</c>
/// means the same thing on every machine, including locales that use ',' as the
/// decimal separator. Unknown tokens, and value options that appear as the last
/// token without a value, are ignored.
/// </remarks>
/// <param name="args">Raw command-line tokens.</param>
/// <returns>The options, with defaults for anything not specified.</returns>
/// <exception cref="FormatException">A numeric option value is not a valid number.</exception>
/// <exception cref="OverflowException">A numeric option value is out of range.</exception>
static Args ParseArgs(string[] args)
{
    // Fully qualified so the block does not depend on an extra using directive.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var result = new Args();

    for (int i = 0; i < args.Length; i++)
    {
        switch (args[i])
        {
            case "--use-gpu":
                result.UseGpu = true;
                break;
            case "--batch":
                result.Batch = true;
                break;
            // The "when" guards ensure a value token exists before it is consumed with ++i.
            case "--onnx-dir" when i + 1 < args.Length:
                result.OnnxDir = args[++i];
                break;
            case "--total-step" when i + 1 < args.Length:
                result.TotalStep = int.Parse(args[++i], invariant);
                break;
            case "--speed" when i + 1 < args.Length:
                result.Speed = float.Parse(args[++i], invariant);
                break;
            case "--n-test" when i + 1 < args.Length:
                result.NTest = int.Parse(args[++i], invariant);
                break;
            case "--voice-style" when i + 1 < args.Length:
                result.VoiceStyle = args[++i].Split(',').ToList();
                break;
            case "--text" when i + 1 < args.Length:
                result.Text = args[++i].Split('|').ToList();
                break;
            case "--save-dir" when i + 1 < args.Length:
                result.SaveDir = args[++i];
                break;
        }
    }

    return result;
}
/// <summary>
/// Program entry point: parses options, loads the models and voice styles, then
/// runs the synthesis loop and writes one WAV file per batch item per iteration.
/// </summary>
/// <param name="args">Command-line tokens; see <c>ParseArgs</c>.</param>
/// <exception cref="ArgumentException">
/// The number of voice styles differs from the number of texts.
/// </exception>
static void Main(string[] args)
{
    Console.WriteLine("=== TTS Inference with ONNX Runtime (C#) ===\n");

    // --- 1. Parse arguments --- //
    var options = ParseArgs(args);
    int stepCount = options.TotalStep;
    float speedFactor = options.Speed;
    int repeatCount = options.NTest;
    string outputDir = options.SaveDir;
    var stylePaths = options.VoiceStyle;
    var texts = options.Text;
    bool useBatch = options.Batch;

    int batchSize = stylePaths.Count;
    if (batchSize != texts.Count)
    {
        throw new ArgumentException(
            $"Number of voice styles ({stylePaths.Count}) must match number of texts ({texts.Count})");
    }

    // --- 2. Load Text to Speech --- //
    var tts = Helper.LoadTextToSpeech(options.OnnxDir, options.UseGpu);
    Console.WriteLine();

    // --- 3. Load Voice Style --- //
    var voice = Helper.LoadVoiceStyle(stylePaths, verbose: true);

    // --- 4. Synthesize speech --- //
    for (int run = 0; run < repeatCount; run++)
    {
        Console.WriteLine($"\n[{run + 1}/{repeatCount}] Starting synthesis...");

        // Batch mode synthesizes every text at once; otherwise only the first text is used.
        var (samples, durations) = Helper.Timer(
            "Generating speech from text",
            () => useBatch
                ? tts.Batch(texts, voice, stepCount, speedFactor)
                : tts.Call(texts[0], voice, stepCount, speedFactor));

        if (!Directory.Exists(outputDir))
        {
            Directory.CreateDirectory(outputDir);
        }

        for (int item = 0; item < batchSize; item++)
        {
            string fileName = $"{Helper.SanitizeFilename(texts[item], 20)}_{run + 1}.wav";

            // Output length comes from the reported duration; the copy is limited to
            // this item's share of the flat sample buffer.
            int sampleCount = (int)(tts.SampleRate * durations[item]);
            var clip = new float[sampleCount];
            Array.Copy(
                samples,
                item * samples.Length / batchSize,
                clip,
                0,
                Math.Min(sampleCount, samples.Length / batchSize));

            string targetPath = Path.Combine(outputDir, fileName);
            Helper.WriteWavFile(targetPath, clip, tts.SampleRate);
            Console.WriteLine($"Saved: {targetPath}");
        }
    }

    Console.WriteLine("\n=== Synthesis completed successfully! ===");
}
}
}

View File

@@ -0,0 +1,881 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
namespace Supertonic
{
// ============================================================================
// Configuration classes
// ============================================================================
/// <summary>
/// Model configuration, split into an "AE" section and a "TTL" section.
/// Both sections are null until assigned.
/// </summary>
public class Config
{
    /// <summary>Values of the "AE" section.</summary>
    public class AEConfig
    {
        public int SampleRate { get; set; }

        public int BaseChunkSize { get; set; }
    }

    /// <summary>Values of the "TTL" section.</summary>
    public class TTLConfig
    {
        public int ChunkCompressFactor { get; set; }

        public int LatentDim { get; set; }
    }

    public AEConfig AE { get; set; }

    public TTLConfig TTL { get; set; }
}
// ============================================================================
// Style class
// ============================================================================
/// <summary>
/// Holds two flattened style tensors ("ttl" and "dp"), each with its shape.
/// The arrays are stored by reference; no copy is made.
/// </summary>
public class Style
{
    /// <summary>Stores the given buffers and shapes as-is.</summary>
    /// <param name="ttl">Flattened "ttl" style values.</param>
    /// <param name="ttlShape">Shape of <paramref name="ttl"/>.</param>
    /// <param name="dp">Flattened "dp" style values.</param>
    /// <param name="dpShape">Shape of <paramref name="dp"/>.</param>
    public Style(float[] ttl, long[] ttlShape, float[] dp, long[] dpShape)
    {
        this.Ttl = ttl;
        this.TtlShape = ttlShape;
        this.Dp = dp;
        this.DpShape = dpShape;
    }

    public float[] Ttl { get; set; }

    public long[] TtlShape { get; set; }

    public float[] Dp { get; set; }

    public long[] DpShape { get; set; }
}
// ============================================================================
// Unicode text processor
// ============================================================================
public class UnicodeProcessor
{
private readonly Dictionary<int, long> _indexer;
/// <summary>
/// Loads the character indexer from a JSON file containing an array of integers.
/// Array position <c>i</c> becomes the lookup key and the element at that position
/// becomes the mapped id.
/// </summary>
/// <param name="unicodeIndexerPath">Path of the indexer JSON file.</param>
/// <exception cref="InvalidDataException">
/// The file deserializes to null (for example, its content is the JSON literal <c>null</c>).
/// </exception>
public UnicodeProcessor(string unicodeIndexerPath)
{
    var json = File.ReadAllText(unicodeIndexerPath);

    // A specific exception type (still derived from Exception) that names the offending file.
    var indexerArray = JsonSerializer.Deserialize<long[]>(json)
        ?? throw new InvalidDataException($"Failed to load indexer from '{unicodeIndexerPath}'");

    // Size is known up front, so the dictionary never has to grow.
    _indexer = new Dictionary<int, long>(indexerArray.Length);
    for (int i = 0; i < indexerArray.Length; i++)
    {
        _indexer[i] = indexerArray[i];
    }
}
/// <summary>
/// Returns <paramref name="text"/> with every code point that falls in one of the
/// listed emoji ranges removed. All other characters, including unpaired
/// surrogates, are kept unchanged.
/// </summary>
private static string RemoveEmojis(string text)
{
    // True when the code point lies in one of the filtered ranges.
    bool IsEmoji(int cp) =>
        (cp >= 0x1F600 && cp <= 0x1F64F) ||
        (cp >= 0x1F300 && cp <= 0x1F5FF) ||
        (cp >= 0x1F680 && cp <= 0x1F6FF) ||
        (cp >= 0x1F700 && cp <= 0x1F77F) ||
        (cp >= 0x1F780 && cp <= 0x1F7FF) ||
        (cp >= 0x1F800 && cp <= 0x1F8FF) ||
        (cp >= 0x1F900 && cp <= 0x1F9FF) ||
        (cp >= 0x1FA00 && cp <= 0x1FA6F) ||
        (cp >= 0x1FA70 && cp <= 0x1FAFF) ||
        (cp >= 0x2600 && cp <= 0x26FF) ||
        (cp >= 0x2700 && cp <= 0x27BF) ||
        (cp >= 0x1F1E6 && cp <= 0x1F1FF);

    var kept = new StringBuilder();
    int pos = 0;
    while (pos < text.Length)
    {
        // A well-formed surrogate pair is treated as one code point spanning two chars.
        bool isPair = char.IsHighSurrogate(text[pos])
            && pos + 1 < text.Length
            && char.IsLowSurrogate(text[pos + 1]);

        int width = isPair ? 2 : 1;
        int codePoint = isPair
            ? char.ConvertToUtf32(text[pos], text[pos + 1])
            : text[pos];

        if (!IsEmoji(codePoint))
        {
            // Copy the original UTF-16 unit(s) verbatim.
            kept.Append(text, pos, width);
        }

        pos += width;
    }

    return kept.ToString();
}
/// <summary>
/// Normalizes raw input text before it is mapped to ids: NFKD normalization, emoji
/// removal, symbol/quote/dash substitution, removal of selected combining marks and
/// decorative symbols, expansion of a few expressions, whitespace and punctuation
/// clean-up, and a trailing period when the text does not already end in punctuation.
/// </summary>
/// <param name="text">Raw text.</param>
/// <returns>The cleaned text.</returns>
private string PreprocessText(string text)
{
    // TODO: Need advanced normalizer for better performance
    text = text.Normalize(NormalizationForm.FormKD);

    // FIXME: this should be fixed for non-English languages
    // Remove emojis (wide Unicode range)
    // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
    text = RemoveEmojis(text);

    // Replace various dashes and symbols. An ordered array is used instead of a
    // dictionary so the application order is explicit. The en dash and non-breaking
    // hyphen are written as escapes: as empty strings they produced a duplicate ""
    // key and an invalid empty search string for Replace, both of which throw.
    // NOTE(review): NFKD above appears to rewrite U+2011 to U+2010 already, so the
    // U+2011 entry may never match — confirm whether U+2010 should be handled too.
    var replacements = new (string From, string To)[]
    {
        ("\u2013", "-"),   // en dash
        ("\u2011", "-"),   // non-breaking hyphen
        ("—", "-"),        // em dash
        ("¯", " "),        // macron
        ("_", " "),        // underscore
        ("\u201C", "\""),  // left double quote
        ("\u201D", "\""),  // right double quote
        ("\u2018", "'"),   // left single quote
        ("\u2019", "'"),   // right single quote
        ("´", "'"),        // acute accent
        ("`", "'"),        // grave accent
        ("[", " "),        // left bracket
        ("]", " "),        // right bracket
        ("|", " "),        // vertical bar
        ("/", " "),        // slash
        ("#", " "),        // hash
        ("→", " "),        // right arrow
        ("←", " "),        // left arrow
    };
    foreach (var (from, to) in replacements)
    {
        text = text.Replace(from, to);
    }

    // Remove combining diacritics // FIXME: this should be fixed for non-English languages
    text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");

    // Remove special symbols
    text = Regex.Replace(text, @"[♥☆♡©\\]", "");

    // Replace known expressions
    var exprReplacements = new (string From, string To)[]
    {
        ("@", " at "),
        ("e.g.,", "for example, "),
        ("i.e.,", "that is, "),
    };
    foreach (var (from, to) in exprReplacements)
    {
        text = text.Replace(from, to);
    }

    // Fix spacing around punctuation: drop a single space directly before each mark,
    // one mark at a time in this order.
    foreach (var mark in new[] { ",", ".", "!", "?", ";", ":", "'" })
    {
        text = text.Replace(" " + mark, mark);
    }

    // Collapse runs of identical quotes. Backticks were all converted to apostrophes
    // by the substitution table above, so only the two quote characters can repeat.
    while (text.Contains("\"\""))
    {
        text = text.Replace("\"\"", "\"");
    }
    while (text.Contains("''"))
    {
        text = text.Replace("''", "'");
    }

    // Remove extra spaces
    text = Regex.Replace(text, @"\s+", " ").Trim();

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
    {
        text += ".";
    }

    return text;
}
/// <summary>
/// Converts each UTF-16 code unit of <paramref name="text"/> to its integer value.
/// Characters outside the BMP therefore yield two values (one per surrogate).
/// </summary>
private int[] TextToUnicodeValues(string text)
{
    char[] units = text.ToArray();
    return Array.ConvertAll(units, unit => (int)unit);
}
/// <summary>
/// Builds the text mask for the given per-text lengths by delegating to
/// <c>Helper.LengthToMask</c> with its default maximum length.
/// </summary>
private float[][][] GetTextMask(long[] textIdsLengths) => Helper.LengthToMask(textIdsLengths);
/// <summary>
/// Preprocesses each text and converts it to a row of ids, padded with zeros to the
/// length of the longest preprocessed text, together with the matching mask.
/// </summary>
/// <param name="textList">Texts to convert.</param>
/// <returns>
/// <c>textIds</c>: one row per text; characters missing from the indexer keep the
/// value 0. <c>textMask</c>: mask built from the preprocessed text lengths.
/// </returns>
public (long[][] textIds, float[][][] textMask) Call(List<string> textList)
{
    List<string> cleaned = textList.Select(PreprocessText).ToList();

    long[] lengths = cleaned.Select(line => (long)line.Length).ToArray();
    long paddedLength = lengths.Max();

    var ids = new long[textList.Count][];
    int rowIndex = 0;
    foreach (string line in cleaned)
    {
        // Rows start zero-filled, so padding and unknown characters stay 0.
        var row = new long[paddedLength];
        for (int col = 0; col < line.Length; col++)
        {
            if (_indexer.TryGetValue(line[col], out long id))
            {
                row[col] = id;
            }
        }

        ids[rowIndex++] = row;
    }

    return (ids, GetTextMask(lengths));
}
}
// ============================================================================
// TextToSpeech class
// ============================================================================
public class TextToSpeech
{
private readonly Config _cfgs;
private readonly UnicodeProcessor _textProcessor;
private readonly InferenceSession _dpOrt;
private readonly InferenceSession _textEncOrt;
private readonly InferenceSession _vectorEstOrt;
private readonly InferenceSession _vocoderOrt;
public readonly int SampleRate;
private readonly int _baseChunkSize;
private readonly int _chunkCompressFactor;
private readonly int _ldim;
/// <summary>
/// Stores the configuration, text processor and the four inference sessions, and
/// caches the numeric settings read from <paramref name="cfgs"/>.
/// </summary>
/// <param name="cfgs">Model configuration; its AE and TTL sections are read here.</param>
/// <param name="textProcessor">Converts text to ids and masks.</param>
/// <param name="dpOrt">Duration predictor session.</param>
/// <param name="textEncOrt">Text encoder session.</param>
/// <param name="vectorEstOrt">Vector estimator session.</param>
/// <param name="vocoderOrt">Vocoder session.</param>
public TextToSpeech(
    Config cfgs,
    UnicodeProcessor textProcessor,
    InferenceSession dpOrt,
    InferenceSession textEncOrt,
    InferenceSession vectorEstOrt,
    InferenceSession vocoderOrt)
{
    // Collaborators
    this._cfgs = cfgs;
    this._textProcessor = textProcessor;

    // Inference sessions
    this._dpOrt = dpOrt;
    this._textEncOrt = textEncOrt;
    this._vectorEstOrt = vectorEstOrt;
    this._vocoderOrt = vocoderOrt;

    // Cached configuration values
    var ae = cfgs.AE;
    this.SampleRate = ae.SampleRate;
    this._baseChunkSize = ae.BaseChunkSize;

    var ttl = cfgs.TTL;
    this._chunkCompressFactor = ttl.ChunkCompressFactor;
    this._ldim = ttl.LatentDim;
}
/// <summary>
/// Creates the initial latent for each batch item: normally distributed noise
/// (Box-Muller transform) multiplied element-wise by the latent mask, so positions
/// the mask marks as 0 are zeroed.
/// </summary>
/// <param name="duration">Per-item durations; multiplied by the sample rate to get sample counts.</param>
/// <returns>The masked noise and the mask that was applied.</returns>
private (float[][][] noisyLatent, float[][][] latentMask) SampleNoisyLatent(float[] duration)
{
    int batch = duration.Length;
    float longestWav = duration.Max() * SampleRate;
    long[] wavLengths = Array.ConvertAll(duration, d => (long)(d * SampleRate));

    int samplesPerLatent = _baseChunkSize * _chunkCompressFactor;
    int frames = (int)((longestWav + samplesPerLatent - 1) / samplesPerLatent);
    int channels = _ldim * _chunkCompressFactor;

    var rng = new Random();

    // One standard-normal sample; draws two uniforms in (0, 1] per call.
    float NextGaussian()
    {
        double u1 = 1.0 - rng.NextDouble();
        double u2 = 1.0 - rng.NextDouble();
        return (float)(Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2));
    }

    // Generate random noise
    var latent = new float[batch][][];
    for (int b = 0; b < batch; b++)
    {
        var planes = new float[channels][];
        for (int c = 0; c < channels; c++)
        {
            var line = new float[frames];
            for (int f = 0; f < frames; f++)
            {
                line[f] = NextGaussian();
            }

            planes[c] = line;
        }

        latent[b] = planes;
    }

    var mask = Helper.GetLatentMask(wavLengths, _baseChunkSize, _chunkCompressFactor);

    // Apply mask: the single mask row of each item is shared by all of its channels.
    for (int b = 0; b < batch; b++)
    {
        float[] maskRow = mask[b][0];
        foreach (float[] line in latent[b])
        {
            for (int f = 0; f < frames; f++)
            {
                line[f] *= maskRow[f];
            }
        }
    }

    return (latent, mask);
}
/// <summary>
/// Runs the full synthesis pipeline for a batch of texts: duration predictor,
/// text encoder, iterative vector-estimator loop over a sampled latent, and vocoder.
/// </summary>
/// <param name="textList">Texts to synthesize; the count must equal <c>style.TtlShape[0]</c>.</param>
/// <param name="style">Style tensors fed to the models as "style_dp" and "style_ttl".</param>
/// <param name="totalStep">Number of vector-estimator iterations.</param>
/// <param name="speed">Divisor applied to the predicted durations.</param>
/// <returns>
/// <c>wav</c>: the flattened "wav_tts" vocoder output for the whole batch.
/// <c>duration</c>: the predicted durations after division by <paramref name="speed"/>
/// (presumably seconds, since they are multiplied by the sample rate elsewhere — confirm).
/// </returns>
/// <exception cref="ArgumentException">The text count differs from the style batch size.</exception>
private (float[] wav, float[] duration) _Infer(List<string> textList, Style style, int totalStep, float speed = 1.05f)
{
int bsz = textList.Count;
if (bsz != style.TtlShape[0])
{
throw new ArgumentException("Number of texts must match number of style vectors");
}
// Convert the texts to id rows and a mask, then wrap them and the styles as tensors
var (textIds, textMask) = _textProcessor.Call(textList);
var textIdsShape = new long[] { bsz, textIds[0].Length };
var textMaskShape = new long[] { bsz, 1, textMask[0][0].Length };
var textIdsTensor = Helper.IntArrayToTensor(textIds, textIdsShape);
var textMaskTensor = Helper.ArrayToTensor(textMask, textMaskShape);
var styleTtlTensor = new DenseTensor<float>(style.Ttl, style.TtlShape.Select(x => (int)x).ToArray());
var styleDpTensor = new DenseTensor<float>(style.Dp, style.DpShape.Select(x => (int)x).ToArray());
// Run duration predictor
var dpInputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("text_ids", textIdsTensor),
NamedOnnxValue.CreateFromTensor("style_dp", styleDpTensor),
NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor)
};
using (var dpOutputs = _dpOrt.Run(dpInputs))
{
// Copy the "duration" output into a managed array so it can be modified and returned
var durOnnx = dpOutputs.First(o => o.Name == "duration").AsTensor<float>().ToArray();
// Apply speed factor to duration
for (int i = 0; i < durOnnx.Length; i++)
{
durOnnx[i] /= speed;
}
// Run text encoder
var textEncInputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("text_ids", textIdsTensor),
NamedOnnxValue.CreateFromTensor("style_ttl", styleTtlTensor),
NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor)
};
using (var textEncOutputs = _textEncOrt.Run(textEncInputs))
{
// textEmbTensor comes from textEncOutputs and is used inside this using block only
var textEmbTensor = textEncOutputs.First(o => o.Name == "text_emb").AsTensor<float>();
// Sample noisy latent
var (xt, latentMask) = SampleNoisyLatent(durOnnx);
var latentShape = new long[] { bsz, xt[0].Length, xt[0][0].Length };
var latentMaskShape = new long[] { bsz, 1, latentMask[0][0].Length };
// The same total step count is passed for every batch item
var totalStepArray = Enumerable.Repeat((float)totalStep, bsz).ToArray();
// Iterative denoising: each pass feeds the current xt and replaces it with the model output
for (int step = 0; step < totalStep; step++)
{
var currentStepArray = Enumerable.Repeat((float)step, bsz).ToArray();
var vectorEstInputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("noisy_latent", Helper.ArrayToTensor(xt, latentShape)),
NamedOnnxValue.CreateFromTensor("text_emb", textEmbTensor),
NamedOnnxValue.CreateFromTensor("style_ttl", styleTtlTensor),
NamedOnnxValue.CreateFromTensor("text_mask", textMaskTensor),
NamedOnnxValue.CreateFromTensor("latent_mask", Helper.ArrayToTensor(latentMask, latentMaskShape)),
NamedOnnxValue.CreateFromTensor("total_step", new DenseTensor<float>(totalStepArray, new int[] { bsz })),
NamedOnnxValue.CreateFromTensor("current_step", new DenseTensor<float>(currentStepArray, new int[] { bsz }))
};
using (var vectorEstOutputs = _vectorEstOrt.Run(vectorEstInputs))
{
var denoisedLatent = vectorEstOutputs.First(o => o.Name == "denoised_latent").AsTensor<float>();
// Update xt: read the output by running flat index in the same nesting order
// (batch, then dimension, then time) that ArrayToTensor uses to flatten xt
int idx = 0;
for (int b = 0; b < bsz; b++)
{
for (int d = 0; d < xt[b].Length; d++)
{
for (int t = 0; t < xt[b][d].Length; t++)
{
xt[b][d][t] = denoisedLatent.GetValue(idx++);
}
}
}
}
}
// Run vocoder
var vocoderInputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("latent", Helper.ArrayToTensor(xt, latentShape))
};
using (var vocoderOutputs = _vocoderOrt.Run(vocoderInputs))
{
var wavTensor = vocoderOutputs.First(o => o.Name == "wav_tts").AsTensor<float>();
// ToArray copies the samples out before the using block disposes the outputs
return (wavTensor.ToArray(), durOnnx);
}
}
}
}
/// <summary>
/// Synthesizes a single text with a single style. The text is split into chunks,
/// each chunk is synthesized separately, and the results are concatenated with a
/// stretch of silence between consecutive chunks.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="style">Style whose first shape dimension must be 1.</param>
/// <param name="totalStep">Number of vector-estimator iterations per chunk.</param>
/// <param name="speed">Speed factor forwarded to the inference call.</param>
/// <param name="silenceDuration">Length of the inserted silence, in seconds.</param>
/// <returns>
/// The concatenated samples and a one-element array with the summed duration
/// (chunk durations plus the inserted silences).
/// </returns>
/// <exception cref="ArgumentException">The style holds more than one entry.</exception>
public (float[] wav, float[] duration) Call(string text, Style style, int totalStep, float speed = 1.05f, float silenceDuration = 0.3f)
{
    if (style.TtlShape[0] != 1)
    {
        throw new ArgumentException("Single speaker text to speech only supports single style");
    }

    var samples = new List<float>();
    float totalDuration = 0.0f;

    foreach (var piece in Helper.ChunkText(text))
    {
        var (pieceWav, pieceDuration) = _Infer(new List<string> { piece }, style, totalStep, speed);

        if (samples.Count > 0)
        {
            // Later pieces are preceded by a gap of zero-valued samples.
            int gapLength = (int)(silenceDuration * SampleRate);
            samples.AddRange(new float[gapLength]);
            samples.AddRange(pieceWav);
            totalDuration += pieceDuration[0] + silenceDuration;
        }
        else
        {
            // Nothing accumulated yet: start the output with this piece.
            samples.AddRange(pieceWav);
            totalDuration = pieceDuration[0];
        }
    }

    return (samples.ToArray(), new float[] { totalDuration });
}
/// <summary>
/// Synthesizes all texts in one inference call, without chunking. The number of
/// texts must match the number of style vectors.
/// </summary>
public (float[] wav, float[] duration) Batch(List<string> textList, Style style, int totalStep, float speed = 1.05f)
    => _Infer(textList, style, totalStep, speed);
}
// ============================================================================
// Helper class with utility functions
// ============================================================================
public static class Helper
{
// ============================================================================
// Utility functions
// ============================================================================
/// <summary>
/// Builds one mask row per length: 1.0 for positions below the length, 0.0 after it.
/// </summary>
/// <param name="lengths">Number of leading 1.0 entries for each row.</param>
/// <param name="maxLen">Row width; -1 (the default) uses the largest value in <paramref name="lengths"/>.</param>
/// <returns>A jagged array indexed [item][0][position].</returns>
public static float[][][] LengthToMask(long[] lengths, long maxLen = -1)
{
    long width = maxLen == -1 ? lengths.Max() : maxLen;

    var rows = new float[lengths.Length][][];
    for (int item = 0; item < rows.Length; item++)
    {
        // New arrays are zero-filled, so only the leading ones need to be written.
        var line = new float[width];
        long ones = Math.Min(lengths[item], width);
        for (long pos = 0; pos < ones; pos++)
        {
            line[pos] = 1.0f;
        }

        rows[item] = new[] { line };
    }

    return rows;
}
/// <summary>
/// Converts sample counts to latent frame counts (ceiling division by
/// <c>baseChunkSize * chunkCompressFactor</c>) and returns the mask for them.
/// </summary>
public static float[][][] GetLatentMask(long[] wavLengths, int baseChunkSize, int chunkCompressFactor)
{
    int samplesPerFrame = baseChunkSize * chunkCompressFactor;

    // Ceiling division for each sample count.
    long FramesFor(long sampleCount) => (sampleCount + samplesPerFrame - 1) / samplesPerFrame;

    long[] frameCounts = wavLengths.Select(FramesFor).ToArray();
    return LengthToMask(frameCounts);
}
// ============================================================================
// ONNX model loading
// ============================================================================
/// <summary>Creates an inference session for the model file at <paramref name="onnxPath"/>.</summary>
public static InferenceSession LoadOnnx(string onnxPath, SessionOptions opts)
    => new InferenceSession(onnxPath, opts);
/// <summary>
/// Loads the four model files from <paramref name="onnxDir"/> with the same session
/// options: duration predictor, text encoder, vector estimator and vocoder, in that order.
/// </summary>
public static (InferenceSession dp, InferenceSession textEnc, InferenceSession vectorEst, InferenceSession vocoder)
    LoadOnnxAll(string onnxDir, SessionOptions opts)
{
    // Resolve every path first, then create the sessions.
    string[] modelPaths =
    {
        Path.Combine(onnxDir, "duration_predictor.onnx"),
        Path.Combine(onnxDir, "text_encoder.onnx"),
        Path.Combine(onnxDir, "vector_estimator.onnx"),
        Path.Combine(onnxDir, "vocoder.onnx"),
    };

    var durationPredictor = LoadOnnx(modelPaths[0], opts);
    var textEncoder = LoadOnnx(modelPaths[1], opts);
    var vectorEstimator = LoadOnnx(modelPaths[2], opts);
    var vocoder = LoadOnnx(modelPaths[3], opts);

    return (durationPredictor, textEncoder, vectorEstimator, vocoder);
}
// ============================================================================
// Configuration loading
// ============================================================================
/// <summary>
/// Reads <c>tts.json</c> from <paramref name="onnxDir"/> and extracts the four
/// integer settings the pipeline uses from its "ae" and "ttl" objects.
/// </summary>
public static Config LoadCfgs(string onnxDir)
{
    string text = File.ReadAllText(Path.Combine(onnxDir, "tts.json"));

    using (var document = JsonDocument.Parse(text))
    {
        JsonElement top = document.RootElement;

        // Read the "ae" section completely before touching "ttl".
        JsonElement ae = top.GetProperty("ae");
        var aeConfig = new Config.AEConfig();
        aeConfig.SampleRate = ae.GetProperty("sample_rate").GetInt32();
        aeConfig.BaseChunkSize = ae.GetProperty("base_chunk_size").GetInt32();

        JsonElement ttl = top.GetProperty("ttl");
        var ttlConfig = new Config.TTLConfig();
        ttlConfig.ChunkCompressFactor = ttl.GetProperty("chunk_compress_factor").GetInt32();
        ttlConfig.LatentDim = ttl.GetProperty("latent_dim").GetInt32();

        return new Config { AE = aeConfig, TTL = ttlConfig };
    }
}
/// <summary>
/// Creates the text processor from <c>unicode_indexer.json</c> in <paramref name="onnxDir"/>.
/// </summary>
public static UnicodeProcessor LoadTextProcessor(string onnxDir)
    => new UnicodeProcessor(Path.Combine(onnxDir, "unicode_indexer.json"));
// ============================================================================
// Voice style loading
// ============================================================================
/// <summary>
/// Loads one or more voice style JSON files and stacks their "style_ttl" and
/// "style_dp" data into two flat buffers, one slot per file.
/// </summary>
/// <remarks>
/// The per-style dimensions are taken from the "dims" arrays of the first file.
/// Every file must contain exactly that many values; otherwise an exception naming
/// the file is thrown instead of leaving part of a slot zero-filled.
/// </remarks>
/// <param name="voiceStylePaths">Paths of the style files; must contain at least one entry.</param>
/// <param name="verbose">When true, prints how many styles were loaded.</param>
/// <returns>A <see cref="Style"/> with shapes { count, dim1, dim2 } for both tensors.</returns>
/// <exception cref="ArgumentNullException"><paramref name="voiceStylePaths"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="voiceStylePaths"/> is empty.</exception>
/// <exception cref="InvalidDataException">A file's data size does not match the first file's dims.</exception>
public static Style LoadVoiceStyle(List<string> voiceStylePaths, bool verbose = false)
{
    if (voiceStylePaths == null)
    {
        throw new ArgumentNullException(nameof(voiceStylePaths));
    }
    if (voiceStylePaths.Count == 0)
    {
        throw new ArgumentException("At least one voice style file is required", nameof(voiceStylePaths));
    }

    // Flattens tensor["data"] into destination at the slot reserved for this file.
    void CopyStyleData(JsonElement tensor, string tensorName, float[] destination, long perStyle, int slot, string sourcePath)
    {
        long offset = slot * perStyle;
        long written = 0;
        foreach (var plane in ParseFloat3DArray(tensor.GetProperty("data")))
        {
            foreach (var row in plane)
            {
                if (written + row.Length > perStyle)
                {
                    throw new InvalidDataException(
                        $"'{sourcePath}': {tensorName} holds more than the expected {perStyle} values");
                }
                Array.Copy(row, 0, destination, offset + written, row.Length);
                written += row.Length;
            }
        }
        if (written != perStyle)
        {
            throw new InvalidDataException(
                $"'{sourcePath}': {tensorName} holds {written} values, expected {perStyle}");
        }
    }

    int bsz = voiceStylePaths.Count;
    long ttlDim1 = 0, ttlDim2 = 0, dpDim1 = 0, dpDim2 = 0;
    float[] ttlFlat = null;
    float[] dpFlat = null;

    for (int i = 0; i < bsz; i++)
    {
        string path = voiceStylePaths[i];
        using (var doc = JsonDocument.Parse(File.ReadAllText(path)))
        {
            var styleTtl = doc.RootElement.GetProperty("style_ttl");
            var styleDp = doc.RootElement.GetProperty("style_dp");

            if (i == 0)
            {
                // The first file defines the per-style dimensions and sizes the
                // buffers; it is parsed only once.
                var ttlDims = ParseInt64Array(styleTtl.GetProperty("dims"));
                var dpDims = ParseInt64Array(styleDp.GetProperty("dims"));
                ttlDim1 = ttlDims[1];
                ttlDim2 = ttlDims[2];
                dpDim1 = dpDims[1];
                dpDim2 = dpDims[2];

                // checked: an oversized product raises OverflowException instead of wrapping.
                ttlFlat = new float[checked((int)(bsz * ttlDim1 * ttlDim2))];
                dpFlat = new float[checked((int)(bsz * dpDim1 * dpDim2))];
            }

            CopyStyleData(styleTtl, "style_ttl", ttlFlat, ttlDim1 * ttlDim2, i, path);
            CopyStyleData(styleDp, "style_dp", dpFlat, dpDim1 * dpDim2, i, path);
        }
    }

    var ttlShape = new long[] { bsz, ttlDim1, ttlDim2 };
    var dpShape = new long[] { bsz, dpDim1, dpDim2 };

    if (verbose)
    {
        Console.WriteLine($"Loaded {bsz} voice styles");
    }

    return new Style(ttlFlat, ttlShape, dpFlat, dpShape);
}
/// <summary>
/// Reads a JSON array of arrays of arrays of numbers into a jagged float array,
/// preserving the nesting and element order.
/// </summary>
private static float[][][] ParseFloat3DArray(JsonElement element)
{
    return element.EnumerateArray()
        .Select(plane => plane.EnumerateArray()
            .Select(row => row.EnumerateArray()
                .Select(value => value.GetSingle())
                .ToArray())
            .ToArray())
        .ToArray();
}
/// <summary>Reads a JSON array of integers into a <c>long[]</c>, preserving order.</summary>
private static long[] ParseInt64Array(JsonElement element)
{
    return element.EnumerateArray()
        .Select(item => item.GetInt64())
        .ToArray();
}
// ============================================================================
// TextToSpeech loading
// ============================================================================
/// <summary>
/// Loads the configuration, the text processor and the four ONNX sessions from
/// <paramref name="onnxDir"/> and assembles a ready-to-use <see cref="TextToSpeech"/>.
/// </summary>
/// <param name="onnxDir">Directory containing the model files and their JSON assets.</param>
/// <param name="useGpu">Must be false; GPU execution is not implemented.</param>
/// <exception cref="NotImplementedException"><paramref name="useGpu"/> is true.</exception>
public static TextToSpeech LoadTextToSpeech(string onnxDir, bool useGpu = false)
{
    // Reject the unsupported mode before anything disposable is created.
    if (useGpu)
    {
        throw new NotImplementedException("GPU mode is not supported yet");
    }

    Console.WriteLine("Using CPU for inference");

    // Load the JSON assets first: if one is missing, no inference session has
    // been created yet and nothing needs to be cleaned up.
    var cfgs = LoadCfgs(onnxDir);
    var textProcessor = LoadTextProcessor(onnxDir);

    var opts = new SessionOptions();
    try
    {
        var (dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) = LoadOnnxAll(onnxDir, opts);
        return new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt);
    }
    catch
    {
        // Release the options only on failure; on success they are left alive,
        // as before, alongside the sessions created from them.
        opts.Dispose();
        throw;
    }
}
// ============================================================================
// WAV file writing
// ============================================================================
/// <summary>
/// Writes <paramref name="audioData"/> as a mono, 16-bit PCM WAV file, creating or
/// overwriting <paramref name="filename"/>.
/// </summary>
/// <param name="filename">Destination path.</param>
/// <param name="audioData">Samples; values are clamped to [-1, 1] before conversion.</param>
/// <param name="sampleRate">Sample rate written to the header.</param>
public static void WriteWavFile(string filename, float[] audioData, int sampleRate)
{
    const int channelCount = 1;
    const int bitDepth = 16;

    using (var output = new BinaryWriter(File.Open(filename, FileMode.Create)))
    {
        // Four-character chunk identifiers are written as raw ASCII bytes.
        void WriteTag(string tag) => output.Write(Encoding.ASCII.GetBytes(tag));

        int bytesPerSecond = sampleRate * channelCount * bitDepth / 8;
        short frameBytes = (short)(channelCount * bitDepth / 8);
        int payloadBytes = audioData.Length * bitDepth / 8;

        // RIFF header
        WriteTag("RIFF");
        output.Write(36 + payloadBytes);
        WriteTag("WAVE");

        // fmt chunk
        WriteTag("fmt ");
        output.Write(16);                    // fmt chunk size
        output.Write((short)1);              // audio format (PCM)
        output.Write((short)channelCount);
        output.Write(sampleRate);
        output.Write(bytesPerSecond);
        output.Write(frameBytes);
        output.Write((short)bitDepth);

        // data chunk
        WriteTag("data");
        output.Write(payloadBytes);

        // Samples: clamp, scale to the 16-bit range, truncate toward zero.
        for (int i = 0; i < audioData.Length; i++)
        {
            float limited = Math.Max(-1.0f, Math.Min(1.0f, audioData[i]));
            output.Write((short)(limited * 32767));
        }
    }
}
// ============================================================================
// Tensor conversion utilities
// ============================================================================
/// <summary>
/// Flattens a jagged 3-level float array in nesting order and wraps the values in a
/// dense tensor with the given dimensions.
/// </summary>
/// <param name="array">Values indexed [outer][middle][inner].</param>
/// <param name="dims">Tensor dimensions; each is narrowed to <c>int</c>.</param>
public static DenseTensor<float> ArrayToTensor(float[][][] array, long[] dims)
{
    var values = new List<float>();
    for (int outer = 0; outer < array.Length; outer++)
    {
        float[][] plane = array[outer];
        for (int middle = 0; middle < plane.Length; middle++)
        {
            values.AddRange(plane[middle]);
        }
    }

    int[] shape = Array.ConvertAll(dims, d => (int)d);
    return new DenseTensor<float>(values.ToArray(), shape);
}
/// <summary>
/// Flattens a jagged 2-level <c>long</c> array row by row and wraps the values in a
/// dense tensor with the given dimensions.
/// </summary>
/// <param name="array">Values indexed [row][column].</param>
/// <param name="dims">Tensor dimensions; each is narrowed to <c>int</c>.</param>
public static DenseTensor<long> IntArrayToTensor(long[][] array, long[] dims)
{
    var values = new List<long>();
    for (int row = 0; row < array.Length; row++)
    {
        values.AddRange(array[row]);
    }

    int[] shape = Array.ConvertAll(dims, d => (int)d);
    return new DenseTensor<long>(values.ToArray(), shape);
}
// ============================================================================
// Timer utility
// ============================================================================
/// <summary>
/// Runs <paramref name="func"/>, printing a start line and a completion line with
/// the elapsed time in seconds, and returns the function's result.
/// </summary>
/// <remarks>
/// Elapsed time is measured with a stopwatch rather than by subtracting wall-clock
/// timestamps, so clock adjustments during the call cannot distort the measurement.
/// If <paramref name="func"/> throws, the exception propagates and no completion
/// line is printed.
/// </remarks>
/// <typeparam name="T">Result type of <paramref name="func"/>.</typeparam>
/// <param name="name">Label used in both console lines.</param>
/// <param name="func">Work to run and time.</param>
/// <returns>Whatever <paramref name="func"/> returns.</returns>
public static T Timer<T>(string name, Func<T> func)
{
    // Fully qualified so the block does not depend on an extra using directive.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    Console.WriteLine($"{name}...");

    var result = func();

    stopwatch.Stop();
    var elapsed = stopwatch.Elapsed.TotalSeconds;
    Console.WriteLine($" -> {name} completed in {elapsed:F2} sec");
    return result;
}
/// <summary>
/// Builds a file-name-safe string from the first <paramref name="maxLen"/> characters
/// of <paramref name="text"/>: letters and digits are kept, everything else becomes '_'.
/// </summary>
/// <param name="text">Source text.</param>
/// <param name="maxLen">Maximum number of characters to take; zero or less yields an empty string.</param>
public static string SanitizeFilename(string text, int maxLen)
{
    int take = Math.Min(text.Length, maxLen);

    var safe = new StringBuilder();
    for (int i = 0; i < take; i++)
    {
        char current = text[i];
        safe.Append(char.IsLetterOrDigit(current) ? current : '_');
    }

    return safe.ToString();
}
// ============================================================================
// Chunk text
// ============================================================================
/// <summary>
/// Splits text into chunks for synthesis. Paragraphs (separated by blank lines) are
/// never merged; within a paragraph, consecutive sentences are packed together while
/// the chunk stays within <paramref name="maxLen"/> characters.
/// </summary>
/// <remarks>
/// A single sentence longer than <paramref name="maxLen"/> is emitted as its own
/// chunk without being split. When nothing is produced, the trimmed input is
/// returned as the only chunk.
/// </remarks>
/// <param name="text">Text to split.</param>
/// <param name="maxLen">Target maximum chunk length in characters.</param>
/// <returns>The chunks, in reading order.</returns>
public static List<string> ChunkText(string text, int maxLen = 300)
{
    // Paragraph break: a newline, optional whitespace, then one or more newlines.
    var paragraphSplitter = new Regex(@"\n\s*\n+");

    // Sentence break: whitespace after '.', '!' or '?', unless the preceding token
    // is one of the listed abbreviations or a single capital-letter initial.
    var sentenceSplitter = new Regex(@"(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+");

    var result = new List<string>();

    foreach (string rawParagraph in paragraphSplitter.Split(text.Trim()))
    {
        string paragraph = rawParagraph.Trim();
        if (paragraph.Length == 0)
        {
            continue;
        }

        string pending = "";
        foreach (string sentence in sentenceSplitter.Split(paragraph))
        {
            if (sentence.Length == 0)
            {
                continue;
            }

            // The +1 accounts for the joining space.
            bool fits = pending.Length + sentence.Length + 1 <= maxLen;
            if (!fits)
            {
                // Flush what has been collected and start over with this sentence.
                if (pending.Length > 0)
                {
                    result.Add(pending.Trim());
                }
                pending = sentence;
                continue;
            }

            pending = pending.Length == 0 ? sentence : pending + " " + sentence;
        }

        if (pending.Length > 0)
        {
            result.Add(pending.Trim());
        }
    }

    // If no chunks were created, return the original text
    if (result.Count == 0)
    {
        result.Add(text.Trim());
    }

    return result;
}
}
}

View File

@@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with the assembly.
// NOTE(review): the title and product below read "ClassLibrary1", which looks like
// a project-template default — confirm whether they should name this assembly.
[assembly: AssemblyTitle("ClassLibrary1")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("ClassLibrary1")]
[assembly: AssemblyCopyright("Copyright © 2025")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. To access a type in this assembly from COM, set the
// ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is the ID of the typelib if this project is exposed to COM.
[assembly: Guid("19675e19-eb91-493e-88c3-32b3c094b749")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values, or default the Build and Revision numbers
// by using '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]

View File

@@ -0,0 +1,126 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.props" Condition="Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.props')" />
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{19675E19-EB91-493E-88C3-32B3C094B749}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Supertonic</RootNamespace>
<AssemblyName>Supertonic.Net48</AssemblyName>
<TargetFrameworkVersion>v4.8</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup>
<StartupObject>Supertonic.Program</StartupObject>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|Win32'">
<PlatformTarget>x64</PlatformTarget>
<OutputPath>bin\Debug\</OutputPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>bin\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>bin\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="Microsoft.Bcl.AsyncInterfaces, Version=10.0.0.1, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\Microsoft.Bcl.AsyncInterfaces.10.0.1\lib\net462\Microsoft.Bcl.AsyncInterfaces.dll</HintPath>
</Reference>
<Reference Include="Microsoft.ML.OnnxRuntime, Version=1.23.2.0, Culture=neutral, PublicKeyToken=f27f157f0a5b7bb6, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\Microsoft.ML.OnnxRuntime.Managed.1.23.2\lib\netstandard2.0\Microsoft.ML.OnnxRuntime.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Buffers, Version=4.0.5.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Buffers.4.6.1\lib\net462\System.Buffers.dll</HintPath>
</Reference>
<Reference Include="System.Core" />
<Reference Include="System.IO.Pipelines, Version=10.0.0.1, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.IO.Pipelines.10.0.1\lib\net462\System.IO.Pipelines.dll</HintPath>
</Reference>
<Reference Include="System.Memory, Version=4.0.5.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Memory.4.6.3\lib\net462\System.Memory.dll</HintPath>
</Reference>
<Reference Include="System.Numerics" />
<Reference Include="System.Numerics.Vectors, Version=4.1.6.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Numerics.Vectors.4.6.1\lib\net462\System.Numerics.Vectors.dll</HintPath>
</Reference>
<Reference Include="System.Runtime.CompilerServices.Unsafe, Version=6.0.3.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Runtime.CompilerServices.Unsafe.6.1.2\lib\net462\System.Runtime.CompilerServices.Unsafe.dll</HintPath>
</Reference>
<Reference Include="System.Text.Encodings.Web, Version=10.0.0.1, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Text.Encodings.Web.10.0.1\lib\net462\System.Text.Encodings.Web.dll</HintPath>
</Reference>
<Reference Include="System.Text.Json, Version=10.0.0.1, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Text.Json.10.0.1\lib\net462\System.Text.Json.dll</HintPath>
</Reference>
<Reference Include="System.Threading.Tasks.Extensions, Version=4.2.4.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51, processorArchitecture=MSIL">
<HintPath>..\csharp\packages\System.Threading.Tasks.Extensions.4.6.3\lib\net462\System.Threading.Tasks.Extensions.dll</HintPath>
</Reference>
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="ExampleONNX.cs" />
<Compile Include="Helper.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="app.config" />
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="..\csharp\packages\Microsoft.ML.OnnxRuntime.Managed.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.Managed.targets" Condition="Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.Managed.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.Managed.targets')" />
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>이 프로젝트는 이 컴퓨터에 없는 NuGet 패키지를 참조합니다. 해당 패키지를 다운로드하려면 NuGet 패키지 복원을 사용하십시오. 자세한 내용은 http://go.microsoft.com/fwlink/?LinkID=322105를 참조하십시오. 누락된 파일은 {0}입니다.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.Managed.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.Managed.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\csharp\packages\Microsoft.ML.OnnxRuntime.Managed.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.Managed.targets'))" />
<Error Condition="!Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.props')" Text="$([System.String]::Format('$(ErrorText)', '..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.props'))" />
<Error Condition="!Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.targets'))" />
<Error Condition="!Exists('..\csharp\packages\System.ValueTuple.4.6.1\build\net471\System.ValueTuple.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\csharp\packages\System.ValueTuple.4.6.1\build\net471\System.ValueTuple.targets'))" />
</Target>
<Import Project="..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.targets" Condition="Exists('..\csharp\packages\Microsoft.ML.OnnxRuntime.1.23.2\build\netstandard2.0\Microsoft.ML.OnnxRuntime.targets')" />
<Import Project="..\csharp\packages\System.ValueTuple.4.6.1\build\net471\System.ValueTuple.targets" Condition="Exists('..\csharp\packages\System.ValueTuple.4.6.1\build\net471\System.ValueTuple.targets')" />
</Project>

View File

@@ -0,0 +1,49 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Express 15 for Windows Desktop
VisualStudioVersion = 15.0.36324.19
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Supertonic.Netfx48", "Supertonic.Netfx48.csproj", "{19675E19-EB91-493E-88C3-32B3C094B749}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|ARM = Debug|ARM
Debug|ARM64 = Debug|ARM64
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|ARM = Release|ARM
Release|ARM64 = Release|ARM64
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|Any CPU.Build.0 = Debug|Any CPU
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|ARM.ActiveCfg = Debug|ARM
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|ARM.Build.0 = Debug|ARM
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|ARM64.ActiveCfg = Debug|ARM64
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|ARM64.Build.0 = Debug|ARM64
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|x64.ActiveCfg = Debug|x64
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|x64.Build.0 = Debug|x64
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|x86.ActiveCfg = Debug|Win32
{19675E19-EB91-493E-88C3-32B3C094B749}.Debug|x86.Build.0 = Debug|Win32
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|Any CPU.ActiveCfg = Release|Any CPU
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|Any CPU.Build.0 = Release|Any CPU
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|ARM.ActiveCfg = Release|ARM
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|ARM.Build.0 = Release|ARM
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|ARM64.ActiveCfg = Release|ARM64
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|ARM64.Build.0 = Release|ARM64
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|x64.ActiveCfg = Release|x64
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|x64.Build.0 = Release|x64
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|x86.ActiveCfg = Release|Win32
{19675E19-EB91-493E-88C3-32B3C094B749}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {5F2E20C5-E704-4B99-8FE9-54394113916E}
EndGlobalSection
EndGlobal

View File

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<runtime>
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<dependentAssembly>
<assemblyIdentity name="System.Memory" publicKeyToken="cc7b13ffcd2ddd51" culture="neutral" />
<bindingRedirect oldVersion="0.0.0.0-4.0.5.0" newVersion="4.0.5.0" />
</dependentAssembly>
</assemblyBinding>
</runtime>
</configuration>

View File

@@ -0,0 +1,316 @@
{
"tts_version": "v1.5.0",
"split": "opensource-en",
"ttl_ckpt_path": "unknown.pt",
"dp_ckpt_path": "unknown.pt",
"ae_ckpt_path": "unknown.pt",
"ttl_train": "unknown",
"dp_train": "unknown",
"ae_train": "unknown",
"ttl": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"batch_expander": {
"n_batch_expand": 6
},
"normalizer": {
"scale": 0.25
},
"text_encoder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 256,
"filter_channels": 1024,
"n_heads": 4,
"n_layers": 4,
"p_dropout": 0.0
},
"proj_out": {
"idim": 256,
"odim": 256
}
},
"flow_matching": {
"sig_min": 0
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 256
},
"convnext": {
"idim": 256,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256,
"prototype_dim": 256,
"n_units": 256,
"n_heads": 2
}
},
"speech_prompted_text_encoder": {
"text_dim": 256,
"style_dim": 256,
"n_units": 256,
"n_heads": 2
},
"uncond_masker": {
"prob_both_uncond": 0.04,
"prob_text_uncond": 0.01,
"std": 0.1,
"text_dim": 256,
"n_style": 50,
"style_key_dim": 256,
"style_value_dim": 256
},
"vector_field": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 512
},
"time_encoder": {
"time_dim": 64,
"hdim": 256
},
"main_blocks": {
"n_blocks": 4,
"time_cond_layer": {
"idim": 512,
"time_dim": 64
},
"style_cond_layer": {
"idim": 512,
"style_dim": 256
},
"text_cond_layer": {
"idim": 512,
"text_dim": 256,
"n_heads": 4,
"use_residual": true,
"rotary_base": 10000,
"rotary_scale": 10
},
"convnext_0": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
2,
4,
8
]
},
"convnext_1": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
},
"convnext_2": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 1,
"dilation_lst": [
1
]
}
},
"last_convnext": {
"idim": 512,
"ksz": 5,
"intermediate_dim": 1024,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"proj_out": {
"idim": 512,
"chunk_compress_factor": 6,
"ldim": 24
}
}
},
"ae": {
"sample_rate": 44100,
"n_delay": 0,
"base_chunk_size": 512,
"chunk_compress_factor": 1,
"ldim": 24,
"encoder": {
"spec_processor": {
"n_fft": 2048,
"win_length": 2048,
"hop_length": 512,
"n_mels": 228,
"sample_rate": 44100,
"eps": 1e-05,
"norm_mean": 0.0,
"norm_std": 1.0
},
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 1253,
"hdim": 512,
"odim": 24
},
"decoder": {
"ksz_init": 7,
"ksz": 7,
"num_layers": 10,
"dilation_lst": [
1,
2,
4,
1,
2,
4,
1,
1,
1,
1
],
"intermediate_dim": 2048,
"idim": 24,
"hdim": 512,
"head": {
"idim": 512,
"hdim": 2048,
"odim": 512,
"ksz": 3
}
}
},
"dp": {
"latent_dim": 24,
"chunk_compress_factor": 6,
"normalizer": {
"scale": 1.0
},
"sentence_encoder": {
"char_emb_dim": 64,
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"text_embedder": {
"char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
"char_emb_dim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 6,
"dilation_lst": [
1,
1,
1,
1,
1,
1
]
},
"attn_encoder": {
"hidden_channels": 64,
"filter_channels": 256,
"n_heads": 2,
"n_layers": 2,
"p_dropout": 0.0
},
"proj_out": {
"idim": 64,
"odim": 64
}
},
"style_encoder": {
"proj_in": {
"ldim": 24,
"chunk_compress_factor": 6,
"odim": 64
},
"convnext": {
"idim": 64,
"ksz": 5,
"intermediate_dim": 256,
"num_layers": 4,
"dilation_lst": [
1,
1,
1,
1
]
},
"style_token_layer": {
"input_dim": 64,
"n_style": 8,
"style_key_dim": 0,
"style_value_dim": 16,
"prototype_dim": 64,
"n_units": 64,
"n_heads": 2
}
},
"predictor": {
"sentence_dim": 64,
"n_style": 8,
"style_dim": 16,
"hdim": 128,
"n_layer": 2
}
}
}

View File

@@ -0,0 +1,223 @@
tts_version: "v1.5.0"
split: "opensource-en"
ttl_ckpt_path: "unknown.pt"
dp_ckpt_path: "unknown.pt"
ae_ckpt_path: "unknown.pt"
ttl_train: "unknown"
dp_train: "unknown"
ae_train: "unknown"
ttl:
latent_dim: 24
chunk_compress_factor: 6
batch_expander:
n_batch_expand: 6
normalizer:
scale: 0.25
text_encoder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 256
filter_channels: 1024
n_heads: 4
n_layers: 4
p_dropout: 0.0
proj_out:
idim: 256
odim: 256
flow_matching:
sig_min: 0
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 256
convnext:
idim: 256
ksz: 5
intermediate_dim: 1024
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
style_token_layer:
input_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
prototype_dim: 256
n_units: 256
n_heads: 2
speech_prompted_text_encoder:
text_dim: 256
style_dim: 256
n_units: 256
n_heads: 2
uncond_masker:
prob_both_uncond: 0.04
prob_text_uncond: 0.01
std: 0.1
text_dim: 256
n_style: 50
style_key_dim: 256
style_value_dim: 256
vector_field:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 512
time_encoder:
time_dim: 64
hdim: 256
main_blocks:
n_blocks: 4
time_cond_layer:
idim: 512
time_dim: 64
style_cond_layer:
idim: 512
style_dim: 256
text_cond_layer:
idim: 512
text_dim: 256
n_heads: 4
use_residual: True
rotary_base: 10000
rotary_scale: 10
convnext_0:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 2, 4, 8]
convnext_1:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
convnext_2:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 1
dilation_lst: [1]
last_convnext:
idim: 512
ksz: 5
intermediate_dim: 1024
num_layers: 4
dilation_lst: [1, 1, 1, 1]
proj_out:
idim: 512
chunk_compress_factor: 6
ldim: 24
ae:
sample_rate: 44100
n_delay: 0
base_chunk_size: 512
chunk_compress_factor: 1
ldim: 24
encoder:
spec_processor:
n_fft: 2048
win_length: 2048
hop_length: 512
n_mels: 228
sample_rate: 44100
eps: 1e-05
norm_mean: 0.0
norm_std: 1.0
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 1253
hdim: 512
odim: 24
decoder:
ksz_init: 7
ksz: 7
num_layers: 10
dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
intermediate_dim: 2048
idim: 24
hdim: 512
head:
idim: 512
hdim: 2048
odim: 512
ksz: 3
dp:
latent_dim: 24
chunk_compress_factor: 6
normalizer:
scale: 1.0
sentence_encoder:
char_emb_dim: 64
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
text_embedder:
char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
char_emb_dim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 6
dilation_lst: [1, 1, 1, 1, 1, 1]
attn_encoder:
hidden_channels: 64
filter_channels: 256
n_heads: 2
n_layers: 2
p_dropout: 0.0
proj_out:
idim: 64
odim: 64
style_encoder:
proj_in:
ldim: 24
chunk_compress_factor: 6
odim: 64
convnext:
idim: 64
ksz: 5
intermediate_dim: 256
num_layers: 4
dilation_lst: [1, 1, 1, 1]
style_token_layer:
input_dim: 64
n_style: 8
style_key_dim: 0
style_value_dim: 16
prototype_dim: 64
n_units: 64
n_heads: 2
predictor:
sentence_dim: 64
n_style: 8
style_dim: 16
hdim: 128
n_layer: 2
unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Bcl.AsyncInterfaces" version="10.0.1" targetFramework="net48" />
<package id="Microsoft.ML.OnnxRuntime" version="1.23.2" targetFramework="net48" />
<package id="Microsoft.ML.OnnxRuntime.Managed" version="1.23.2" targetFramework="net48" />
<package id="System.Buffers" version="4.6.1" targetFramework="net48" />
<package id="System.IO.Pipelines" version="10.0.1" targetFramework="net48" />
<package id="System.Memory" version="4.6.3" targetFramework="net48" />
<package id="System.Numerics.Vectors" version="4.6.1" targetFramework="net48" />
<package id="System.Runtime.CompilerServices.Unsafe" version="6.1.2" targetFramework="net48" />
<package id="System.Text.Encodings.Web" version="10.0.1" targetFramework="net48" />
<package id="System.Text.Json" version="10.0.1" targetFramework="net48" />
<package id="System.Threading.Tasks.Extensions" version="4.6.3" targetFramework="net48" />
<package id="System.ValueTuple" version="4.6.1" targetFramework="net48" />
</packages>