import * as ort from 'onnxruntime-web'; // Available languages for multilingual TTS export const AVAILABLE_LANGS = ['en', 'ko', 'es', 'pt', 'fr']; export function isValidLang(lang) { return AVAILABLE_LANGS.includes(lang); } /** * Unicode Text Processor */ export class UnicodeProcessor { constructor(indexer) { this.indexer = indexer; } call(textList, langList) { const processedTexts = textList.map((text, i) => this.preprocessText(text, langList[i])); const textIdsLengths = processedTexts.map(text => text.length); const maxLen = Math.max(...textIdsLengths); const textIds = processedTexts.map(text => { const row = new Array(maxLen).fill(0); for (let j = 0; j < text.length; j++) { const codePoint = text.codePointAt(j); row[j] = (codePoint < this.indexer.length) ? this.indexer[codePoint] : -1; } return row; }); const textMask = this.getTextMask(textIdsLengths); return { textIds, textMask }; } preprocessText(text, lang) { // TODO: Need advanced normalizer for better performance text = text.normalize('NFKD'); // Remove emojis (wide Unicode range) const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu; text = text.replace(emojiPattern, ''); // Replace various dashes and symbols const replacements = { '–': '-', '‑': '-', '—': '-', '_': ' ', '\u201C': '"', // left double quote " '\u201D': '"', // right double quote " '\u2018': "'", // left single quote ' '\u2019': "'", // right single quote ' '´': "'", '`': "'", '[': ' ', ']': ' ', '|': ' ', '/': ' ', '#': ' ', '→': ' ', '←': ' ', }; for (const [k, v] of Object.entries(replacements)) { text = text.replaceAll(k, v); } // Remove special symbols text = text.replace(/[♥☆♡©\\]/g, ''); // Replace known expressions const exprReplacements = { '@': ' at ', 'e.g.,': 'for example, ', 'i.e.,': 'that is, ', }; for (const [k, v] of Object.entries(exprReplacements)) { text = text.replaceAll(k, v); } // Fix spacing around punctuation text = text.replace(/ ,/g, ','); text = text.replace(/ \./g, '.'); text = text.replace(/ !/g, '!'); text = text.replace(/ \?/g, '?'); text = text.replace(/ ;/g, ';'); text = text.replace(/ :/g, ':'); text = text.replace(/ '/g, "'"); // Remove duplicate quotes while (text.includes('""')) { text = text.replace('""', '"'); } while (text.includes("''")) { text = text.replace("''", "'"); } while (text.includes('``')) { text = text.replace('``', '`'); } // Remove extra spaces text = text.replace(/\s+/g, ' ').trim(); // If text doesn't end with punctuation, quotes, or closing brackets, add a period if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) { text += '.'; } // Validate language if (!isValidLang(lang)) { throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(', ')}`); } // Wrap text with language tags text = `<${lang}>${text}`; return text; } getTextMask(textIdsLengths) { const maxLen = Math.max(...textIdsLengths); return this.lengthToMask(textIdsLengths, maxLen); } lengthToMask(lengths, maxLen = null) { const actualMaxLen = maxLen || Math.max(...lengths); return lengths.map(len => { const row = new Array(actualMaxLen).fill(0.0); for (let j = 0; j < Math.min(len, actualMaxLen); j++) { row[j] = 1.0; } return [row]; }); } } /** * Style class to hold TTL and DP tensors */ export class Style { constructor(ttlTensor, dpTensor) { this.ttl = ttlTensor; this.dp = dpTensor; } } /** * Text-to-Speech class */ export class TextToSpeech { constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) { this.cfgs = cfgs; this.textProcessor = textProcessor; this.dpOrt = dpOrt; this.textEncOrt = textEncOrt; this.vectorEstOrt = vectorEstOrt; this.vocoderOrt = vocoderOrt; this.sampleRate = cfgs.ae.sample_rate; } async _infer(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) { const bsz = textList.length; // Process text const { textIds, textMask } = this.textProcessor.call(textList, langList); const textIdsFlat = new BigInt64Array(textIds.flat().map(x => BigInt(x))); const textIdsShape = [bsz, textIds[0].length]; const textIdsTensor = new ort.Tensor('int64', textIdsFlat, textIdsShape); const textMaskFlat = new Float32Array(textMask.flat(2)); const textMaskShape = [bsz, 1, textMask[0][0].length]; const textMaskTensor = new ort.Tensor('float32', textMaskFlat, textMaskShape); // Predict duration const dpOutputs = await this.dpOrt.run({ text_ids: textIdsTensor, style_dp: style.dp, text_mask: textMaskTensor }); const duration = Array.from(dpOutputs.duration.data); // Apply speed factor to duration for (let i = 0; i < duration.length; i++) { duration[i] /= speed; } // Encode text const textEncOutputs = await this.textEncOrt.run({ text_ids: textIdsTensor, style_ttl: style.ttl, text_mask: textMaskTensor }); const textEmb = textEncOutputs.text_emb; // Sample noisy latent let { xt, latentMask } = this.sampleNoisyLatent( duration, this.sampleRate, this.cfgs.ae.base_chunk_size, this.cfgs.ttl.chunk_compress_factor, this.cfgs.ttl.latent_dim ); const latentMaskFlat = new Float32Array(latentMask.flat(2)); const latentMaskShape = [bsz, 1, latentMask[0][0].length]; const latentMaskTensor = new ort.Tensor('float32', latentMaskFlat, latentMaskShape); // Prepare constant arrays const totalStepArray = new Float32Array(bsz).fill(totalStep); const totalStepTensor = new ort.Tensor('float32', totalStepArray, [bsz]); // Denoising loop for (let step = 0; step < totalStep; step++) { if (progressCallback) { progressCallback(step + 1, totalStep); } const currentStepArray = new Float32Array(bsz).fill(step); const currentStepTensor = new ort.Tensor('float32', currentStepArray, [bsz]); const xtFlat = new Float32Array(xt.flat(2)); const xtShape = [bsz, xt[0].length, xt[0][0].length]; const xtTensor = new ort.Tensor('float32', xtFlat, xtShape); const vectorEstOutputs = await this.vectorEstOrt.run({ noisy_latent: xtTensor, text_emb: textEmb, style_ttl: style.ttl, latent_mask: latentMaskTensor, text_mask: textMaskTensor, current_step: currentStepTensor, total_step: totalStepTensor }); const denoised = Array.from(vectorEstOutputs.denoised_latent.data); // Reshape to 3D const latentDim = xt[0].length; const latentLen = xt[0][0].length; xt = []; let idx = 0; for (let b = 0; b < bsz; b++) { const batch = []; for (let d = 0; d < latentDim; d++) { const row = []; for (let t = 0; t < latentLen; t++) { row.push(denoised[idx++]); } batch.push(row); } xt.push(batch); } } // Generate waveform const finalXtFlat = new Float32Array(xt.flat(2)); const finalXtShape = [bsz, xt[0].length, xt[0][0].length]; const finalXtTensor = new ort.Tensor('float32', finalXtFlat, finalXtShape); const vocoderOutputs = await this.vocoderOrt.run({ latent: finalXtTensor }); const wav = Array.from(vocoderOutputs.wav_tts.data); return { wav, duration }; } async call(text, lang, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) { if (style.ttl.dims[0] !== 1) { throw new Error('Single speaker text to speech only supports single style'); } const maxLen = lang === 'ko' ? 120 : 300; const textList = chunkText(text, maxLen); const langList = new Array(textList.length).fill(lang); let wavCat = []; let durCat = 0; for (let i = 0; i < textList.length; i++) { const { wav, duration } = await this._infer([textList[i]], [langList[i]], style, totalStep, speed, progressCallback); if (wavCat.length === 0) { wavCat = wav; durCat = duration[0]; } else { const silenceLen = Math.floor(silenceDuration * this.sampleRate); const silence = new Array(silenceLen).fill(0); wavCat = [...wavCat, ...silence, ...wav]; durCat += duration[0] + silenceDuration; } } return { wav: wavCat, duration: [durCat] }; } async batch(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) { return await this._infer(textList, langList, style, totalStep, speed, progressCallback); } sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) { const bsz = duration.length; const maxDur = Math.max(...duration); const wavLenMax = Math.floor(maxDur * sampleRate); const wavLengths = duration.map(d => Math.floor(d * sampleRate)); const chunkSize = baseChunkSize * chunkCompress; const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize); const latentDimVal = latentDim * chunkCompress; const xt = []; for (let b = 0; b < bsz; b++) { const batch = []; for (let d = 0; d < latentDimVal; d++) { const row = []; for (let t = 0; t < latentLen; t++) { // Box-Muller transform const u1 = Math.max(0.0001, Math.random()); const u2 = Math.random(); const val = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2); row.push(val); } batch.push(row); } xt.push(batch); } const latentLengths = wavLengths.map(len => Math.floor((len + chunkSize - 1) / chunkSize)); const latentMask = this.lengthToMask(latentLengths, latentLen); // Apply mask for (let b = 0; b < bsz; b++) { for (let d = 0; d < latentDimVal; d++) { for (let t = 0; t < latentLen; t++) { xt[b][d][t] *= latentMask[b][0][t]; } } } return { xt, latentMask }; } lengthToMask(lengths, maxLen = null) { const actualMaxLen = maxLen || Math.max(...lengths); return lengths.map(len => { const row = new Array(actualMaxLen).fill(0.0); for (let j = 0; j < Math.min(len, actualMaxLen); j++) { row[j] = 1.0; } return [row]; }); } } /** * Load voice style from JSON files */ export async function loadVoiceStyle(voiceStylePaths, verbose = false) { const bsz = voiceStylePaths.length; // Read first file to get dimensions const firstResponse = await fetch(voiceStylePaths[0]); const firstStyle = await firstResponse.json(); const ttlDims = firstStyle.style_ttl.dims; const dpDims = firstStyle.style_dp.dims; const ttlDim1 = ttlDims[1]; const ttlDim2 = ttlDims[2]; const dpDim1 = dpDims[1]; const dpDim2 = dpDims[2]; // Pre-allocate arrays with full batch size const ttlSize = bsz * ttlDim1 * ttlDim2; const dpSize = bsz * dpDim1 * dpDim2; const ttlFlat = new Float32Array(ttlSize); const dpFlat = new Float32Array(dpSize); // Fill in the data for (let i = 0; i < bsz; i++) { const response = await fetch(voiceStylePaths[i]); const voiceStyle = await response.json(); // Flatten TTL data const ttlData = voiceStyle.style_ttl.data.flat(Infinity); const ttlOffset = i * ttlDim1 * ttlDim2; ttlFlat.set(ttlData, ttlOffset); // Flatten DP data const dpData = voiceStyle.style_dp.data.flat(Infinity); const dpOffset = i * dpDim1 * dpDim2; dpFlat.set(dpData, dpOffset); } const ttlShape = [bsz, ttlDim1, ttlDim2]; const dpShape = [bsz, dpDim1, dpDim2]; const ttlTensor = new ort.Tensor('float32', ttlFlat, ttlShape); const dpTensor = new ort.Tensor('float32', dpFlat, dpShape); if (verbose) { console.log(`Loaded ${bsz} voice styles`); } return new Style(ttlTensor, dpTensor); } /** * Load configuration from JSON */ export async function loadCfgs(onnxDir) { const response = await fetch(`${onnxDir}/tts.json`); const cfgs = await response.json(); return cfgs; } /** * Load text processor */ export async function loadTextProcessor(onnxDir) { const response = await fetch(`${onnxDir}/unicode_indexer.json`); const indexer = await response.json(); return new UnicodeProcessor(indexer); } /** * Load ONNX model */ export async function loadOnnx(onnxPath, options) { const session = await ort.InferenceSession.create(onnxPath, options); return session; } /** * Load all TTS components */ export async function loadTextToSpeech(onnxDir, sessionOptions = {}, progressCallback = null) { console.log('Using WebAssembly/WebGPU for inference'); const cfgs = await loadCfgs(onnxDir); const dpPath = `${onnxDir}/duration_predictor.onnx`; const textEncPath = `${onnxDir}/text_encoder.onnx`; const vectorEstPath = `${onnxDir}/vector_estimator.onnx`; const vocoderPath = `${onnxDir}/vocoder.onnx`; const modelPaths = [ { name: 'Duration Predictor', path: dpPath }, { name: 'Text Encoder', path: textEncPath }, { name: 'Vector Estimator', path: vectorEstPath }, { name: 'Vocoder', path: vocoderPath } ]; const sessions = []; for (let i = 0; i < modelPaths.length; i++) { if (progressCallback) { progressCallback(modelPaths[i].name, i + 1, modelPaths.length); } const session = await loadOnnx(modelPaths[i].path, sessionOptions); sessions.push(session); } const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = sessions; const textProcessor = await loadTextProcessor(onnxDir); const textToSpeech = new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt); return { textToSpeech, cfgs }; } /** * Chunk text into manageable segments */ function chunkText(text, maxLen = 300) { if (typeof text !== 'string') { throw new Error(`chunkText expects a string, got ${typeof text}`); } // Split by paragraph (two or more newlines) const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim()); const chunks = []; for (let paragraph of paragraphs) { paragraph = paragraph.trim(); if (!paragraph) continue; // Split by sentence boundaries (period, question mark, exclamation mark followed by space) // But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. const sentences = paragraph.split(/(? { for (let i = 0; i < string.length; i++) { view.setUint8(offset + i, string.charCodeAt(i)); } }; writeString(0, 'RIFF'); view.setUint32(4, 36 + dataSize, true); writeString(8, 'WAVE'); writeString(12, 'fmt '); view.setUint32(16, 16, true); view.setUint16(20, 1, true); // PCM view.setUint16(22, numChannels, true); view.setUint32(24, sampleRate, true); view.setUint32(28, byteRate, true); view.setUint16(32, blockAlign, true); view.setUint16(34, bitsPerSample, true); writeString(36, 'data'); view.setUint32(40, dataSize, true); // Write audio data const int16Data = new Int16Array(audioData.length); for (let i = 0; i < audioData.length; i++) { const clamped = Math.max(-1.0, Math.min(1.0, audioData[i])); int16Data[i] = Math.floor(clamped * 32767); } const dataView = new Uint8Array(buffer, 44); dataView.set(new Uint8Array(int16Data.buffer)); return buffer; }