import fs from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; import * as ort from 'onnxruntime-node'; const __filename = fileURLToPath(import.meta.url); const AVAILABLE_LANGS = ["en", "ko", "es", "pt", "fr"]; /** * Unicode text processor */ class UnicodeProcessor { constructor(unicodeIndexerJsonPath) { this.indexer = JSON.parse(fs.readFileSync(unicodeIndexerJsonPath, 'utf8')); } _preprocessText(text, lang) { // TODO: Need advanced normalizer for better performance text = text.normalize('NFKD'); // Remove emojis (wide Unicode range) const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu; text = text.replace(emojiPattern, ''); // Replace various dashes and symbols const replacements = { '–': '-', '‑': '-', '—': '-', '_': ' ', '\u201C': '"', // left double quote " '\u201D': '"', // right double quote " '\u2018': "'", // left single quote ' '\u2019': "'", // right single quote ' '´': "'", '`': "'", '[': ' ', ']': ' ', '|': ' ', '/': ' ', '#': ' ', '→': ' ', '←': ' ', }; for (const [k, v] of Object.entries(replacements)) { text = text.replaceAll(k, v); } // Remove special symbols text = text.replace(/[♥☆♡©\\]/g, ''); // Replace known expressions const exprReplacements = { '@': ' at ', 'e.g.,': 'for example, ', 'i.e.,': 'that is, ', }; for (const [k, v] of Object.entries(exprReplacements)) { text = text.replaceAll(k, v); } // Fix spacing around punctuation text = text.replace(/ ,/g, ','); text = text.replace(/ \./g, '.'); text = text.replace(/ !/g, '!'); text = text.replace(/ \?/g, '?'); text = text.replace(/ ;/g, ';'); text = text.replace(/ :/g, ':'); text = text.replace(/ '/g, "'"); // Remove duplicate quotes while (text.includes('""')) { text = text.replace('""', '"'); } while (text.includes("''")) { text = text.replace("''", "'"); } while (text.includes('``')) { text = text.replace('``', '`'); } // Remove extra spaces text = text.replace(/\s+/g, ' ').trim(); // If text doesn't end with punctuation, quotes, or closing brackets, add a period if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) { text += '.'; } // Validate language if (!AVAILABLE_LANGS.includes(lang)) { throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(', ')}`); } // Wrap text with language tags text = `<${lang}>` + text + ``; return text; } _textToUnicodeValues(text) { return Array.from(text).map(char => char.charCodeAt(0)); } _getTextMask(textIdsLengths) { return lengthToMask(textIdsLengths); } call(textList, langList) { const processedTexts = textList.map((t, i) => this._preprocessText(t, langList[i])); const textIdsLengths = processedTexts.map(t => t.length); const maxLen = Math.max(...textIdsLengths); const textIds = []; for (let i = 0; i < processedTexts.length; i++) { const row = new Array(maxLen).fill(0); const unicodeVals = this._textToUnicodeValues(processedTexts[i]); for (let j = 0; j < unicodeVals.length; j++) { row[j] = this.indexer[unicodeVals[j]]; } textIds.push(row); } const textMask = this._getTextMask(textIdsLengths); return { textIds, textMask }; } } /** * Style class */ class Style { constructor(styleTtlOnnx, styleDpOnnx) { this.ttl = styleTtlOnnx; this.dp = styleDpOnnx; } } /** * TextToSpeech class */ class TextToSpeech { constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) { this.cfgs = cfgs; this.textProcessor = textProcessor; this.dpOrt = dpOrt; this.textEncOrt = textEncOrt; this.vectorEstOrt = vectorEstOrt; this.vocoderOrt = vocoderOrt; this.sampleRate = cfgs.ae.sample_rate; this.baseChunkSize = cfgs.ae.base_chunk_size; this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor; this.ldim = cfgs.ttl.latent_dim; } sampleNoisyLatent(duration) { const wavLenMax = Math.max(...duration) * this.sampleRate; const wavLengths = duration.map(d => Math.floor(d * this.sampleRate)); const chunkSize = this.baseChunkSize * this.chunkCompressFactor; const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize); const latentDim = this.ldim * this.chunkCompressFactor; // Generate random noise const noisyLatent = []; for (let b = 0; b < duration.length; b++) { const batch = []; for (let d = 0; d < latentDim; d++) { const row = []; for (let t = 0; t < latentLen; t++) { // Box-Muller transform for normal distribution // Add epsilon to avoid log(0) const eps = 1e-10; const u1 = Math.max(eps, Math.random()); const u2 = Math.random(); const randNormal = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2); row.push(randNormal); } batch.push(row); } noisyLatent.push(batch); } const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor); // Apply mask for (let b = 0; b < noisyLatent.length; b++) { for (let d = 0; d < noisyLatent[b].length; d++) { for (let t = 0; t < noisyLatent[b][d].length; t++) { noisyLatent[b][d][t] *= latentMask[b][0][t]; } } } return { noisyLatent, latentMask }; } async _infer(textList, langList, style, totalStep, speed = 1.05) { if (textList.length !== style.ttl.dims[0]) { throw new Error('Number of texts must match number of style vectors'); } const bsz = textList.length; const { textIds, textMask } = this.textProcessor.call(textList, langList); const textIdsShape = [bsz, textIds[0].length]; const textMaskShape = [bsz, 1, textMask[0][0].length]; const textMaskTensor = arrayToTensor(textMask, textMaskShape); const dpResult = await this.dpOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_dp: style.dp, text_mask: textMaskTensor }); const durOnnx = Array.from(dpResult.duration.data); // Apply speed factor to duration for (let i = 0; i < durOnnx.length; i++) { durOnnx[i] /= speed; } const textEncResult = await this.textEncOrt.run({ text_ids: intArrayToTensor(textIds, textIdsShape), style_ttl: style.ttl, text_mask: textMaskTensor }); const textEmbTensor = textEncResult.text_emb; let { noisyLatent, latentMask } = this.sampleNoisyLatent(durOnnx); const latentShape = [bsz, noisyLatent[0].length, noisyLatent[0][0].length]; const latentMaskShape = [bsz, 1, latentMask[0][0].length]; const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape); const totalStepArray = new Array(bsz).fill(totalStep); const scalarShape = [bsz]; const totalStepTensor = arrayToTensor(totalStepArray, scalarShape); for (let step = 0; step < totalStep; step++) { const currentStepArray = new Array(bsz).fill(step); const vectorEstResult = await this.vectorEstOrt.run({ noisy_latent: arrayToTensor(noisyLatent, latentShape), text_emb: textEmbTensor, style_ttl: style.ttl, text_mask: textMaskTensor, latent_mask: latentMaskTensor, total_step: totalStepTensor, current_step: arrayToTensor(currentStepArray, scalarShape) }); const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data); // Update latent with the denoised output let idx = 0; for (let b = 0; b < noisyLatent.length; b++) { for (let d = 0; d < noisyLatent[b].length; d++) { for (let t = 0; t < noisyLatent[b][d].length; t++) { noisyLatent[b][d][t] = denoisedLatent[idx++]; } } } } const vocoderResult = await this.vocoderOrt.run({ latent: arrayToTensor(noisyLatent, latentShape) }); const wav = Array.from(vocoderResult.wav_tts.data); return { wav, duration: durOnnx }; } async call(text, lang, style, totalStep, speed = 1.05, silenceDuration = 0.3) { if (style.ttl.dims[0] !== 1) { throw new Error('Single speaker text to speech only supports single style'); } const maxLen = lang === 'ko' ? 120 : 300; const textList = chunkText(text, maxLen); let wavCat = null; let durCat = 0; for (const chunk of textList) { const { wav, duration } = await this._infer([chunk], [lang], style, totalStep, speed); if (wavCat === null) { wavCat = wav; durCat = duration[0]; } else { const silenceLen = Math.floor(silenceDuration * this.sampleRate); const silence = new Array(silenceLen).fill(0); wavCat = [...wavCat, ...silence, ...wav]; durCat += duration[0] + silenceDuration; } } return { wav: wavCat, duration: [durCat] }; } async batch(textList, langList, style, totalStep, speed = 1.05) { return await this._infer(textList, langList, style, totalStep, speed); } } /** * Convert lengths to binary mask */ function lengthToMask(lengths, maxLen = null) { maxLen = maxLen || Math.max(...lengths); const mask = []; for (let i = 0; i < lengths.length; i++) { const row = []; for (let j = 0; j < maxLen; j++) { row.push(j < lengths[i] ? 1.0 : 0.0); } mask.push([row]); // [B, 1, maxLen] } return mask; } /** * Get latent mask from wav lengths */ function getLatentMask(wavLengths, baseChunkSize, chunkCompressFactor) { const latentSize = baseChunkSize * chunkCompressFactor; const latentLengths = wavLengths.map(len => Math.floor((len + latentSize - 1) / latentSize) ); return lengthToMask(latentLengths); } /** * Load ONNX model */ async function loadOnnx(onnxPath, opts) { return await ort.InferenceSession.create(onnxPath, opts); } /** * Load all ONNX models for TTS */ async function loadOnnxAll(onnxDir, opts) { const dpPath = path.join(onnxDir, 'duration_predictor.onnx'); const textEncPath = path.join(onnxDir, 'text_encoder.onnx'); const vectorEstPath = path.join(onnxDir, 'vector_estimator.onnx'); const vocoderPath = path.join(onnxDir, 'vocoder.onnx'); const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = await Promise.all([ loadOnnx(dpPath, opts), loadOnnx(textEncPath, opts), loadOnnx(vectorEstPath, opts), loadOnnx(vocoderPath, opts) ]); return { dpOrt, textEncOrt, vectorEstOrt, vocoderOrt }; } /** * Load configuration */ function loadCfgs(onnxDir) { const cfgPath = path.join(onnxDir, 'tts.json'); const cfgs = JSON.parse(fs.readFileSync(cfgPath, 'utf8')); return cfgs; } /** * Load text processor */ function loadTextProcessor(onnxDir) { const unicodeIndexerPath = path.join(onnxDir, 'unicode_indexer.json'); const textProcessor = new UnicodeProcessor(unicodeIndexerPath); return textProcessor; } /** * Load voice style from JSON file */ export function loadVoiceStyle(voiceStylePaths, verbose = false) { const bsz = voiceStylePaths.length; // Read first file to get dimensions const firstStyle = JSON.parse(fs.readFileSync(voiceStylePaths[0], 'utf8')); const ttlDims = firstStyle.style_ttl.dims; const dpDims = firstStyle.style_dp.dims; const ttlDim1 = ttlDims[1]; const ttlDim2 = ttlDims[2]; const dpDim1 = dpDims[1]; const dpDim2 = dpDims[2]; // Pre-allocate arrays with full batch size const ttlSize = bsz * ttlDim1 * ttlDim2; const dpSize = bsz * dpDim1 * dpDim2; const ttlFlat = new Float32Array(ttlSize); const dpFlat = new Float32Array(dpSize); // Fill in the data for (let i = 0; i < bsz; i++) { const voiceStyle = JSON.parse(fs.readFileSync(voiceStylePaths[i], 'utf8')); const ttlData = voiceStyle.style_ttl.data.flat(Infinity); const ttlOffset = i * ttlDim1 * ttlDim2; ttlFlat.set(ttlData, ttlOffset); const dpData = voiceStyle.style_dp.data.flat(Infinity); const dpOffset = i * dpDim1 * dpDim2; dpFlat.set(dpData, dpOffset); } const ttlStyle = new ort.Tensor('float32', ttlFlat, [bsz, ttlDim1, ttlDim2]); const dpStyle = new ort.Tensor('float32', dpFlat, [bsz, dpDim1, dpDim2]); if (verbose) { console.log(`Loaded ${bsz} voice styles`); } return new Style(ttlStyle, dpStyle); } /** * Load text to speech components */ export async function loadTextToSpeech(onnxDir, useGpu = false) { const opts = {}; if (useGpu) { throw new Error('GPU mode is not supported yet'); } else { console.log('Using CPU for inference'); } const cfgs = loadCfgs(onnxDir); const { dpOrt, textEncOrt, vectorEstOrt, vocoderOrt } = await loadOnnxAll(onnxDir, opts); const textProcessor = loadTextProcessor(onnxDir); const textToSpeech = new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt); return textToSpeech; } /** * Convert 3D array to ONNX tensor */ function arrayToTensor(array, dims) { // Flatten the array const flat = array.flat(Infinity); return new ort.Tensor('float32', Float32Array.from(flat), dims); } /** * Convert 2D int array to ONNX tensor */ function intArrayToTensor(array, dims) { const flat = array.flat(Infinity); return new ort.Tensor('int64', BigInt64Array.from(flat.map(x => BigInt(x))), dims); } /** * Write WAV file */ export function writeWavFile(filename, audioData, sampleRate) { const numChannels = 1; const bitsPerSample = 16; const byteRate = sampleRate * numChannels * bitsPerSample / 8; const blockAlign = numChannels * bitsPerSample / 8; const dataSize = audioData.length * bitsPerSample / 8; const buffer = Buffer.alloc(44 + dataSize); // RIFF header buffer.write('RIFF', 0); buffer.writeUInt32LE(36 + dataSize, 4); buffer.write('WAVE', 8); // fmt chunk buffer.write('fmt ', 12); buffer.writeUInt32LE(16, 16); // fmt chunk size buffer.writeUInt16LE(1, 20); // audio format (PCM) buffer.writeUInt16LE(numChannels, 22); buffer.writeUInt32LE(sampleRate, 24); buffer.writeUInt32LE(byteRate, 28); buffer.writeUInt16LE(blockAlign, 32); buffer.writeUInt16LE(bitsPerSample, 34); // data chunk buffer.write('data', 36); buffer.writeUInt32LE(dataSize, 40); // Write audio data for (let i = 0; i < audioData.length; i++) { const sample = Math.max(-1, Math.min(1, audioData[i])); const intSample = Math.floor(sample * 32767); buffer.writeInt16LE(intSample, 44 + i * 2); } fs.writeFileSync(filename, buffer); } /** * Timer utility for measuring execution time */ export async function timer(name, fn) { const start = Date.now(); console.log(`${name}...`); const result = await fn(); const elapsed = ((Date.now() - start) / 1000).toFixed(2); console.log(` -> ${name} completed in ${elapsed} sec`); return result; } /** * Sanitize filename by replacing non-alphanumeric characters with underscores (supports Unicode) */ export function sanitizeFilename(text, maxLen) { const prefix = text.substring(0, maxLen); // \p{L} matches any Unicode letter, \p{N} matches any Unicode number return prefix.replace(/[^\p{L}\p{N}_]/gu, '_'); } /** * Chunk text into manageable segments */ function chunkText(text, maxLen = 300) { if (typeof text !== 'string') { throw new Error(`chunkText expects a string, got ${typeof text}`); } // Split by paragraph (two or more newlines) const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim()); const chunks = []; for (let paragraph of paragraphs) { paragraph = paragraph.trim(); if (!paragraph) continue; // Split by sentence boundaries (period, question mark, exclamation mark followed by space) // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. const sentences = paragraph.split(/(?