// JavaScript source file (560 lines, 18 KiB)
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import * as ort from 'onnxruntime-node';

// Absolute path of this module file, derived from import.meta.url.
const __filename = fileURLToPath(import.meta.url);

// Language codes accepted by UnicodeProcessor._preprocessText.
// Frozen so the shared list cannot be mutated at runtime.
const AVAILABLE_LANGS = Object.freeze(["en", "ko", "es", "pt", "fr"]);

/**
 * Unicode text processor.
 *
 * Normalizes raw text, wraps it in language tags and converts it to one id per
 * Unicode code point using a lookup table loaded from a JSON file.
 */
class UnicodeProcessor {
  /**
   * @param {string} unicodeIndexerJsonPath - Path to the JSON lookup table. It is
   *   indexed by code point and yields the id placed into `textIds`.
   */
  constructor(unicodeIndexerJsonPath) {
    this.indexer = JSON.parse(fs.readFileSync(unicodeIndexerJsonPath, 'utf8'));
  }

  /**
   * Normalize one text and wrap it in `<lang>…</lang>` tags.
   *
   * @param {string} text - Raw input text.
   * @param {string} lang - Language code; must be listed in AVAILABLE_LANGS.
   * @returns {string} The cleaned, tagged text.
   * @throws {Error} If `lang` is not an available language.
   */
  _preprocessText(text, lang) {
    // Fail fast on an unsupported language before doing any text work.
    if (!AVAILABLE_LANGS.includes(lang)) {
      throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(', ')}`);
    }

    // NFKD decomposes U+00B4 (acute accent) into a space plus a combining mark,
    // so the table entry below would never match it; map it before normalizing.
    let result = text.replaceAll('\u00B4', "'");

    // TODO: Need advanced normalizer for better performance
    result = result.normalize('NFKD');

    // Remove emojis (wide Unicode range)
    const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
    result = result.replace(emojiPattern, '');

    // Replace various dashes and symbols
    const replacements = {
      '–': '-',
      '‑': '-',
      // NFKD turns the non-breaking hyphen (U+2011) into the plain hyphen U+2010,
      // so U+2010 has to be mapped here for that input to become '-'.
      '\u2010': '-',
      '—': '-',
      '_': ' ',
      '\u201C': '"', // left double quote
      '\u201D': '"', // right double quote
      '\u2018': "'", // left single quote
      '\u2019': "'", // right single quote
      '´': "'",
      '`': "'",
      '[': ' ',
      ']': ' ',
      '|': ' ',
      '/': ' ',
      '#': ' ',
      '→': ' ',
      '←': ' ',
    };
    for (const [from, to] of Object.entries(replacements)) {
      result = result.replaceAll(from, to);
    }

    // Remove special symbols
    result = result.replace(/[♥☆♡©\\]/g, '');

    // Replace known expressions
    const exprReplacements = {
      '@': ' at ',
      'e.g.,': 'for example, ',
      'i.e.,': 'that is, ',
    };
    for (const [from, to] of Object.entries(exprReplacements)) {
      result = result.replaceAll(from, to);
    }

    // Fix spacing around punctuation: drop a single space directly before a mark.
    result = result.replace(/ ([,.!?;:'])/g, '$1');

    // Collapse runs of duplicate quotes. Backticks were all replaced by "'" above,
    // so no separate backtick pass is needed.
    result = result.replace(/"{2,}/g, '"').replace(/'{2,}/g, "'");

    // Remove extra spaces
    result = result.replace(/\s+/g, ' ').trim();

    // If text doesn't end with punctuation, quotes, or closing brackets, add a period
    if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(result)) {
      result += '.';
    }

    // Wrap text with language tags
    return `<${lang}>` + result + `</${lang}>`;
  }

  /**
   * Convert a string into its Unicode code points.
   *
   * Uses codePointAt so characters outside the BMP yield their full code point
   * rather than only the leading surrogate.
   *
   * @param {string} text
   * @returns {number[]} One code point per character.
   */
  _textToUnicodeValues(text) {
    return Array.from(text, (char) => char.codePointAt(0));
  }

  /**
   * Build the mask for the given per-text id counts (delegates to lengthToMask).
   *
   * @param {number[]} textIdsLengths
   * @returns {*} Whatever lengthToMask returns for these lengths.
   */
  _getTextMask(textIdsLengths) {
    return lengthToMask(textIdsLengths);
  }

  /**
   * Preprocess a batch of texts into zero-padded id rows plus a mask.
   *
   * @param {string[]} textList - Raw texts.
   * @param {string[]} langList - Language code for each text, by index.
   * @returns {{textIds: number[][], textMask: *}} Rows padded with 0 to the
   *   longest row, and the mask built from the per-row id counts.
   * @throws {Error} If a language is invalid or a character has no indexer entry.
   */
  call(textList, langList) {
    const idRows = textList.map((text, i) => {
      const processed = this._preprocessText(text, langList[i]);
      return this._textToUnicodeValues(processed).map((codePoint) => {
        const id = this.indexer[codePoint];
        if (id == null) {
          // Without this check the missing id only surfaces later as an opaque
          // BigInt conversion error when the tensor is built.
          const hex = codePoint.toString(16).toUpperCase().padStart(4, '0');
          throw new Error(`Unsupported character U+${hex}: no entry in unicode indexer`);
        }
        return id;
      });
    });

    // Lengths are counted in ids (code points) so the mask matches the rows exactly.
    const textIdsLengths = idRows.map((row) => row.length);
    const maxLen = Math.max(...textIdsLengths);

    const textIds = idRows.map((row) => row.concat(new Array(maxLen - row.length).fill(0)));
    const textMask = this._getTextMask(textIdsLengths);
    return { textIds, textMask };
  }
}

/**
 * Style class.
 *
 * Plain holder for the two style tensors passed to the models: `ttl` is sent as
 * `style_ttl` and `dp` as `style_dp`.
 */
class Style {
  /**
   * @param {*} styleTtlOnnx - Tensor stored as `ttl`.
   * @param {*} styleDpOnnx - Tensor stored as `dp`.
   */
  constructor(styleTtlOnnx, styleDpOnnx) {
    this.ttl = styleTtlOnnx;
    this.dp = styleDpOnnx;
  }
}

/**
 * TextToSpeech class.
 *
 * Runs four inference sessions in sequence (duration predictor, text encoder,
 * vector estimator, vocoder) to turn text into a waveform.
 */
class TextToSpeech {
  /**
   * @param {object} cfgs - Parsed configuration; reads `ae.sample_rate`,
   *   `ae.base_chunk_size`, `ttl.chunk_compress_factor` and `ttl.latent_dim`.
   * @param {object} textProcessor - Object whose `call(textList, langList)`
   *   returns `{ textIds, textMask }`.
   * @param {object} dpOrt - Duration predictor session.
   * @param {object} textEncOrt - Text encoder session.
   * @param {object} vectorEstOrt - Vector estimator session.
   * @param {object} vocoderOrt - Vocoder session.
   */
  constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) {
    this.cfgs = cfgs;
    this.textProcessor = textProcessor;
    this.dpOrt = dpOrt;
    this.textEncOrt = textEncOrt;
    this.vectorEstOrt = vectorEstOrt;
    this.vocoderOrt = vocoderOrt;
    this.sampleRate = cfgs.ae.sample_rate;
    this.baseChunkSize = cfgs.ae.base_chunk_size;
    this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor;
    this.ldim = cfgs.ttl.latent_dim;
  }

  /**
   * Draw one standard-normal sample with the Box-Muller transform.
   *
   * @returns {number}
   */
  _sampleStandardNormal() {
    // Clamp the first uniform away from 0 so Math.log never receives 0.
    const eps = 1e-10;
    const u1 = Math.max(eps, Math.random());
    const u2 = Math.random();
    return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
  }

  /**
   * Create a masked random latent for a batch of durations.
   *
   * @param {number[]} duration - One duration per batch item; multiplied by the
   *   sample rate to obtain waveform lengths.
   * @returns {{noisyLatent: number[][][], latentMask: number[][][]}} Noise indexed
   *   as [batch][latentDim][latentLen], multiplied by the mask, plus the mask
   *   returned by getLatentMask.
   */
  sampleNoisyLatent(duration) {
    const wavLenMax = Math.max(...duration) * this.sampleRate;
    const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate));
    const chunkSize = this.baseChunkSize * this.chunkCompressFactor;
    const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
    const latentDim = this.ldim * this.chunkCompressFactor;

    const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor);

    // Generate the noise and apply the mask in a single pass.
    const noisyLatent = duration.map((_, b) => {
      const maskRow = latentMask[b][0];
      return Array.from({ length: latentDim }, () =>
        Array.from({ length: latentLen }, (_unused, t) => this._sampleStandardNormal() * maskRow[t]),
      );
    });

    return { noisyLatent, latentMask };
  }

  /**
   * Run the full pipeline for a batch of texts.
   *
   * @param {string[]} textList - Texts to synthesize.
   * @param {string[]} langList - Language code per text.
   * @param {Style} style - Style tensors; `style.ttl.dims[0]` must equal the batch size.
   * @param {number} totalStep - Number of vector-estimator iterations.
   * @param {number} [speed=1.05] - Predicted durations are divided by this value.
   * @returns {Promise<{wav: number[], duration: number[]}>} Flattened vocoder
   *   output and the speed-adjusted durations.
   * @throws {Error} If the batch size does not match the style, or `speed` is not
   *   a positive finite number.
   */
  async _infer(textList, langList, style, totalStep, speed = 1.05) {
    if (textList.length !== style.ttl.dims[0]) {
      throw new Error('Number of texts must match number of style vectors');
    }
    // A zero, negative or non-finite speed would produce infinite or NaN
    // durations and an unbounded latent; reject it up front.
    if (!(speed > 0 && speed < Infinity)) {
      throw new Error(`speed must be a positive finite number, got ${speed}`);
    }

    const bsz = textList.length;
    const { textIds, textMask } = this.textProcessor.call(textList, langList);
    const textIdsShape = [bsz, textIds[0].length];
    const textMaskShape = [bsz, 1, textMask[0][0].length];

    // Built once and shared by the duration predictor and the text encoder.
    const textIdsTensor = intArrayToTensor(textIds, textIdsShape);
    const textMaskTensor = arrayToTensor(textMask, textMaskShape);

    const dpResult = await this.dpOrt.run({
      text_ids: textIdsTensor,
      style_dp: style.dp,
      text_mask: textMaskTensor,
    });

    // Apply speed factor to duration
    const durOnnx = Array.from(dpResult.duration.data, (d) => d / speed);

    const textEncResult = await this.textEncOrt.run({
      text_ids: textIdsTensor,
      style_ttl: style.ttl,
      text_mask: textMaskTensor,
    });
    const textEmbTensor = textEncResult.text_emb;

    const { noisyLatent, latentMask } = this.sampleNoisyLatent(durOnnx);
    const latentShape = [bsz, noisyLatent[0].length, noisyLatent[0][0].length];
    const latentMaskShape = [bsz, 1, latentMask[0][0].length];
    const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);

    const scalarShape = [bsz];
    const totalStepTensor = arrayToTensor(new Array(bsz).fill(totalStep), scalarShape);

    // The latent starts as the nested noise array and is replaced after every
    // step by the flat denoised output; arrayToTensor flattens either form.
    let latent = noisyLatent;
    for (let step = 0; step < totalStep; step++) {
      const vectorEstResult = await this.vectorEstOrt.run({
        noisy_latent: arrayToTensor(latent, latentShape),
        text_emb: textEmbTensor,
        style_ttl: style.ttl,
        text_mask: textMaskTensor,
        latent_mask: latentMaskTensor,
        total_step: totalStepTensor,
        current_step: arrayToTensor(new Array(bsz).fill(step), scalarShape),
      });

      // Update latent with the denoised output
      latent = Array.from(vectorEstResult.denoised_latent.data);
    }

    const vocoderResult = await this.vocoderOrt.run({
      latent: arrayToTensor(latent, latentShape),
    });

    const wav = Array.from(vocoderResult.wav_tts.data);
    return { wav, duration: durOnnx };
  }

  /**
   * Synthesize one text with a single style, chunk by chunk.
   *
   * The text is split with chunkText (max 120 characters for 'ko', 300
   * otherwise); chunk waveforms are joined with zero-valued silence in between.
   *
   * @param {string} text - Text to synthesize.
   * @param {string} lang - Language code.
   * @param {Style} style - Style with `style.ttl.dims[0] === 1`.
   * @param {number} totalStep - Number of vector-estimator iterations.
   * @param {number} [speed=1.05] - Speed factor forwarded to `_infer`.
   * @param {number} [silenceDuration=0.3] - Silence inserted between chunks,
   *   multiplied by the sample rate to get a sample count.
   * @returns {Promise<{wav: (number[]|null), duration: number[]}>} `wav` is null
   *   when the text produced no chunks.
   * @throws {Error} If the style holds more than one style vector.
   */
  async call(text, lang, style, totalStep, speed = 1.05, silenceDuration = 0.3) {
    if (style.ttl.dims[0] !== 1) {
      throw new Error('Single speaker text to speech only supports single style');
    }
    const maxLen = lang === 'ko' ? 120 : 300;
    const textList = chunkText(text, maxLen);

    // Collect the pieces and concatenate once at the end, instead of re-copying
    // the whole waveform for every chunk.
    const segments = [];
    let silence = null;
    let durCat = 0;

    for (const chunk of textList) {
      const { wav, duration } = await this._infer([chunk], [lang], style, totalStep, speed);

      if (segments.length === 0) {
        durCat = duration[0];
      } else {
        if (silence === null) {
          const silenceLen = Math.floor(silenceDuration * this.sampleRate);
          silence = new Array(silenceLen).fill(0);
        }
        segments.push(silence);
        durCat += duration[0] + silenceDuration;
      }
      segments.push(wav);
    }

    const wavCat = segments.length > 0 ? segments.flat() : null;
    return { wav: wavCat, duration: [durCat] };
  }

  /**
   * Synthesize a batch of texts in a single pass (no chunking).
   *
   * @param {string[]} textList
   * @param {string[]} langList
   * @param {Style} style
   * @param {number} totalStep
   * @param {number} [speed=1.05]
   * @returns {Promise<{wav: number[], duration: number[]}>}
   */
  async batch(textList, langList, style, totalStep, speed = 1.05) {
    return await this._infer(textList, langList, style, totalStep, speed);
  }
}

/**
 * Convert lengths to binary mask.
 *
 * @param {number[]} lengths - Number of valid positions for each row.
 * @param {?number} [maxLen=null] - Mask width; when null or undefined the largest
 *   entry of `lengths` is used. An explicit 0 is honoured and yields empty rows.
 * @returns {number[][][]} Mask indexed as [B][0][maxLen]: 1.0 where the position
 *   index is below the row's length, 0.0 elsewhere.
 */
function lengthToMask(lengths, maxLen = null) {
  // `??` rather than `||` so an explicit width of 0 is not treated as "unset".
  const width = maxLen ?? Math.max(...lengths);
  return lengths.map((len) => {
    const row = [];
    for (let j = 0; j < width; j++) {
      row.push(j < len ? 1.0 : 0.0);
    }
    return [row]; // [B, 1, maxLen]
  });
}

/**
 * Get latent mask from wav lengths.
 *
 * Each waveform length is converted to a latent length by dividing by
 * `baseChunkSize * chunkCompressFactor` and rounding up (for integer lengths),
 * then the lengths are turned into a mask with lengthToMask.
 *
 * @param {number[]} wavLengths - Waveform lengths, one per batch item.
 * @param {number} baseChunkSize
 * @param {number} chunkCompressFactor
 * @returns {*} The mask returned by lengthToMask for the latent lengths.
 */
function getLatentMask(wavLengths, baseChunkSize, chunkCompressFactor) {
  const latentSize = baseChunkSize * chunkCompressFactor;
  const latentLengths = wavLengths.map((len) => Math.floor((len + latentSize - 1) / latentSize));
  return lengthToMask(latentLengths);
}

/**
 * Load ONNX model.
 *
 * @param {string} onnxPath - Path of the model file.
 * @param {object} opts - Options forwarded to `ort.InferenceSession.create`.
 * @returns {Promise<object>} The created inference session.
 */
async function loadOnnx(onnxPath, opts) {
  return await ort.InferenceSession.create(onnxPath, opts);
}

/**
 * Load all ONNX models for TTS.
 *
 * The four model files are looked up by fixed name inside `onnxDir` and loaded
 * concurrently.
 *
 * @param {string} onnxDir - Directory containing the model files.
 * @param {object} opts - Options forwarded to loadOnnx for every model.
 * @returns {Promise<{dpOrt: object, textEncOrt: object, vectorEstOrt: object, vocoderOrt: object}>}
 */
async function loadOnnxAll(onnxDir, opts) {
  const modelFiles = [
    'duration_predictor.onnx',
    'text_encoder.onnx',
    'vector_estimator.onnx',
    'vocoder.onnx',
  ];

  const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = await Promise.all(
    modelFiles.map((file) => loadOnnx(path.join(onnxDir, file), opts)),
  );

  return { dpOrt, textEncOrt, vectorEstOrt, vocoderOrt };
}

/**
 * Load configuration.
 *
 * @param {string} onnxDir - Directory containing `tts.json`.
 * @returns {object} The parsed contents of `tts.json`.
 * @throws If the file cannot be read or is not valid JSON.
 */
function loadCfgs(onnxDir) {
  const cfgPath = path.join(onnxDir, 'tts.json');
  return JSON.parse(fs.readFileSync(cfgPath, 'utf8'));
}

/**
 * Load text processor.
 *
 * @param {string} onnxDir - Directory containing `unicode_indexer.json`.
 * @returns {UnicodeProcessor} Processor constructed from that indexer file.
 */
function loadTextProcessor(onnxDir) {
  const unicodeIndexerPath = path.join(onnxDir, 'unicode_indexer.json');
  return new UnicodeProcessor(unicodeIndexerPath);
}

/**
 * Load voice style from JSON file.
 *
 * Reads one JSON file per path and stacks their `style_ttl` and `style_dp` data
 * into two batched float32 tensors. The per-item dimensions are taken from
 * `dims[1]` and `dims[2]` of the first file; every file must supply exactly that
 * many values.
 *
 * @param {string[]} voiceStylePaths - Paths of the voice style JSON files.
 * @param {boolean} [verbose=false] - Log the number of loaded styles.
 * @returns {Style} Style holding tensors of shape [bsz, dim1, dim2].
 * @throws {Error} If no path is given or a file's data size does not match the
 *   dimensions of the first file.
 */
export function loadVoiceStyle(voiceStylePaths, verbose = false) {
  const bsz = voiceStylePaths.length;
  if (bsz === 0) {
    throw new Error('loadVoiceStyle requires at least one voice style path');
  }

  // Parse each file once; the first one also provides the dimensions.
  const styles = voiceStylePaths.map((stylePath) => JSON.parse(fs.readFileSync(stylePath, 'utf8')));

  const [, ttlDim1, ttlDim2] = styles[0].style_ttl.dims;
  const [, dpDim1, dpDim2] = styles[0].style_dp.dims;
  const ttlStride = ttlDim1 * ttlDim2;
  const dpStride = dpDim1 * dpDim2;

  // Pre-allocate arrays with full batch size
  const ttlFlat = new Float32Array(bsz * ttlStride);
  const dpFlat = new Float32Array(bsz * dpStride);

  // Fill in the data
  styles.forEach((voiceStyle, i) => {
    const ttlData = voiceStyle.style_ttl.data.flat(Infinity);
    const dpData = voiceStyle.style_dp.data.flat(Infinity);

    // A size mismatch would otherwise overflow the buffer or silently leave
    // part of a slot unfilled or overwritten.
    if (ttlData.length !== ttlStride) {
      throw new Error(
        `Voice style ${voiceStylePaths[i]}: style_ttl has ${ttlData.length} values, expected ${ttlStride}`,
      );
    }
    if (dpData.length !== dpStride) {
      throw new Error(
        `Voice style ${voiceStylePaths[i]}: style_dp has ${dpData.length} values, expected ${dpStride}`,
      );
    }

    ttlFlat.set(ttlData, i * ttlStride);
    dpFlat.set(dpData, i * dpStride);
  });

  const ttlStyle = new ort.Tensor('float32', ttlFlat, [bsz, ttlDim1, ttlDim2]);
  const dpStyle = new ort.Tensor('float32', dpFlat, [bsz, dpDim1, dpDim2]);

  if (verbose) {
    console.log(`Loaded ${bsz} voice styles`);
  }

  return new Style(ttlStyle, dpStyle);
}

/**
 * Load text to speech components.
 *
 * Loads the configuration, the four ONNX sessions and the text processor from
 * `onnxDir` and wires them into a TextToSpeech instance.
 *
 * @param {string} onnxDir - Directory holding the model and config files.
 * @param {boolean} [useGpu=false] - Must be false; GPU mode is rejected.
 * @returns {Promise<TextToSpeech>}
 * @throws {Error} If `useGpu` is truthy.
 */
export async function loadTextToSpeech(onnxDir, useGpu = false) {
  if (useGpu) {
    throw new Error('GPU mode is not supported yet');
  }
  console.log('Using CPU for inference');

  // Session options; currently empty.
  const opts = {};

  const cfgs = loadCfgs(onnxDir);
  const { dpOrt, textEncOrt, vectorEstOrt, vocoderOrt } = await loadOnnxAll(onnxDir, opts);
  const textProcessor = loadTextProcessor(onnxDir);

  return new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt);
}

/**
 * Convert a (possibly nested) numeric array to a float32 ONNX tensor.
 *
 * @param {Array} array - Numbers nested to any depth; flattened in order.
 * @param {number[]} dims - Tensor dimensions passed to `ort.Tensor`.
 * @returns {object} A float32 `ort.Tensor` with the given dims.
 */
function arrayToTensor(array, dims) {
  const values = Float32Array.from(array.flat(Infinity));
  return new ort.Tensor('float32', values, dims);
}

/**
 * Convert a (possibly nested) integer array to an int64 ONNX tensor.
 *
 * @param {Array} array - Integers nested to any depth; flattened in order.
 * @param {number[]} dims - Tensor dimensions passed to `ort.Tensor`.
 * @returns {object} An int64 `ort.Tensor` backed by a BigInt64Array.
 * @throws If an element cannot be converted with BigInt (for example
 *   `undefined` or a non-integer number).
 */
function intArrayToTensor(array, dims) {
  // The mapping callback converts while filling, avoiding an intermediate array.
  const values = BigInt64Array.from(array.flat(Infinity), (x) => BigInt(x));
  return new ort.Tensor('int64', values, dims);
}

/**
 * Write WAV file.
 *
 * Writes mono 16-bit PCM with a 44-byte RIFF/WAVE header. Samples are clamped to
 * [-1, 1], scaled by 32767 and rounded down.
 *
 * @param {string} filename - Output file path.
 * @param {ArrayLike<number>} audioData - Samples.
 * @param {number} sampleRate - Sample rate written to the header.
 */
export function writeWavFile(filename, audioData, sampleRate) {
  const HEADER_SIZE = 44;
  const numChannels = 1;
  const bitsPerSample = 16;
  const bytesPerSample = bitsPerSample / 8;
  const byteRate = sampleRate * numChannels * bytesPerSample;
  const blockAlign = numChannels * bytesPerSample;
  const dataSize = audioData.length * bytesPerSample;

  const buffer = Buffer.alloc(HEADER_SIZE + dataSize);

  // RIFF header
  buffer.write('RIFF', 0);
  buffer.writeUInt32LE(36 + dataSize, 4); // size of everything after this field
  buffer.write('WAVE', 8);

  // fmt chunk
  buffer.write('fmt ', 12);
  buffer.writeUInt32LE(16, 16); // fmt chunk size
  buffer.writeUInt16LE(1, 20); // audio format (PCM)
  buffer.writeUInt16LE(numChannels, 22);
  buffer.writeUInt32LE(sampleRate, 24);
  buffer.writeUInt32LE(byteRate, 28);
  buffer.writeUInt16LE(blockAlign, 32);
  buffer.writeUInt16LE(bitsPerSample, 34);

  // data chunk
  buffer.write('data', 36);
  buffer.writeUInt32LE(dataSize, 40);

  // Write audio data
  for (let i = 0; i < audioData.length; i++) {
    const clamped = Math.max(-1, Math.min(1, audioData[i]));
    buffer.writeInt16LE(Math.floor(clamped * 32767), HEADER_SIZE + i * bytesPerSample);
  }

  fs.writeFileSync(filename, buffer);
}

/**
 * Timer utility for measuring execution time.
 *
 * Logs `name` before running `fn` and the elapsed wall-clock seconds after it
 * resolves. If `fn` throws or rejects, the error propagates and no completion
 * line is logged.
 *
 * @param {string} name - Label used in the log lines.
 * @param {Function} fn - Function to run; its result is awaited.
 * @returns {Promise<*>} The awaited result of `fn`.
 */
export async function timer(name, fn) {
  const startedAt = Date.now();
  console.log(`${name}...`);
  const result = await fn();
  const elapsed = ((Date.now() - startedAt) / 1000).toFixed(2);
  console.log(`  -> ${name} completed in ${elapsed} sec`);
  return result;
}

/**
 * Sanitize filename by replacing non-alphanumeric characters with underscores (supports Unicode).
 *
 * @param {string} text - Source text.
 * @param {number} maxLen - Number of leading UTF-16 code units of `text` to keep
 *   before sanitizing.
 * @returns {string} The prefix with every character that is not a Unicode
 *   letter, a Unicode number or `_` replaced by `_`.
 */
export function sanitizeFilename(text, maxLen) {
  // \p{L} matches any Unicode letter, \p{N} matches any Unicode number
  const disallowed = /[^\p{L}\p{N}_]/gu;
  return text.substring(0, maxLen).replace(disallowed, '_');
}

/**
|
||
* Chunk text into manageable segments
|
||
*/
|
||
function chunkText(text, maxLen = 300) {
|
||
if (typeof text !== 'string') {
|
||
throw new Error(`chunkText expects a string, got ${typeof text}`);
|
||
}
|
||
|
||
// Split by paragraph (two or more newlines)
|
||
const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim());
|
||
|
||
const chunks = [];
|
||
|
||
for (let paragraph of paragraphs) {
|
||
paragraph = paragraph.trim();
|
||
if (!paragraph) continue;
|
||
|
||
// Split by sentence boundaries (period, question mark, exclamation mark followed by space)
|
||
// But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
|
||
const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/);
|
||
|
||
let currentChunk = "";
|
||
|
||
for (let sentence of sentences) {
|
||
if (currentChunk.length + sentence.length + 1 <= maxLen) {
|
||
currentChunk += (currentChunk ? " " : "") + sentence;
|
||
} else {
|
||
if (currentChunk) {
|
||
chunks.push(currentChunk.trim());
|
||
}
|
||
currentChunk = sentence;
|
||
}
|
||
}
|
||
|
||
if (currentChunk) {
|
||
chunks.push(currentChunk.trim());
|
||
}
|
||
}
|
||
|
||
return chunks;
|
||
}
|