Files
Supertonic/web/helper.js
2026-01-25 18:58:40 +09:00

562 lines
19 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import * as ort from 'onnxruntime-web';
// Available languages for multilingual TTS
export const AVAILABLE_LANGS = ['en', 'ko', 'es', 'pt', 'fr'];
export function isValidLang(lang) {
return AVAILABLE_LANGS.includes(lang);
}
/**
* Unicode Text Processor
*/
export class UnicodeProcessor {
constructor(indexer) {
this.indexer = indexer;
}
call(textList, langList) {
const processedTexts = textList.map((text, i) => this.preprocessText(text, langList[i]));
const textIdsLengths = processedTexts.map(text => text.length);
const maxLen = Math.max(...textIdsLengths);
const textIds = processedTexts.map(text => {
const row = new Array(maxLen).fill(0);
for (let j = 0; j < text.length; j++) {
const codePoint = text.codePointAt(j);
row[j] = (codePoint < this.indexer.length) ? this.indexer[codePoint] : -1;
}
return row;
});
const textMask = this.getTextMask(textIdsLengths);
return { textIds, textMask };
}
preprocessText(text, lang) {
// TODO: Need advanced normalizer for better performance
text = text.normalize('NFKD');
// Remove emojis (wide Unicode range)
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
text = text.replace(emojiPattern, '');
// Replace various dashes and symbols
const replacements = {
'': '-',
'': '-',
'—': '-',
'_': ' ',
'\u201C': '"', // left double quote "
'\u201D': '"', // right double quote "
'\u2018': "'", // left single quote '
'\u2019': "'", // right single quote '
'´': "'",
'`': "'",
'[': ' ',
']': ' ',
'|': ' ',
'/': ' ',
'#': ' ',
'→': ' ',
'←': ' ',
};
for (const [k, v] of Object.entries(replacements)) {
text = text.replaceAll(k, v);
}
// Remove special symbols
text = text.replace(/[♥☆♡©\\]/g, '');
// Replace known expressions
const exprReplacements = {
'@': ' at ',
'e.g.,': 'for example, ',
'i.e.,': 'that is, ',
};
for (const [k, v] of Object.entries(exprReplacements)) {
text = text.replaceAll(k, v);
}
// Fix spacing around punctuation
text = text.replace(/ ,/g, ',');
text = text.replace(/ \./g, '.');
text = text.replace(/ !/g, '!');
text = text.replace(/ \?/g, '?');
text = text.replace(/ ;/g, ';');
text = text.replace(/ :/g, ':');
text = text.replace(/ '/g, "'");
// Remove duplicate quotes
while (text.includes('""')) {
text = text.replace('""', '"');
}
while (text.includes("''")) {
text = text.replace("''", "'");
}
while (text.includes('``')) {
text = text.replace('``', '`');
}
// Remove extra spaces
text = text.replace(/\s+/g, ' ').trim();
// If text doesn't end with punctuation, quotes, or closing brackets, add a period
if (!/[.!?;:,'\"')\]}…。」』】〉》›»]$/.test(text)) {
text += '.';
}
// Validate language
if (!isValidLang(lang)) {
throw new Error(`Invalid language: ${lang}. Available: ${AVAILABLE_LANGS.join(', ')}`);
}
// Wrap text with language tags
text = `<${lang}>${text}</${lang}>`;
return text;
}
getTextMask(textIdsLengths) {
const maxLen = Math.max(...textIdsLengths);
return this.lengthToMask(textIdsLengths, maxLen);
}
lengthToMask(lengths, maxLen = null) {
const actualMaxLen = maxLen || Math.max(...lengths);
return lengths.map(len => {
const row = new Array(actualMaxLen).fill(0.0);
for (let j = 0; j < Math.min(len, actualMaxLen); j++) {
row[j] = 1.0;
}
return [row];
});
}
}
/**
* Style class to hold TTL and DP tensors
*/
export class Style {
constructor(ttlTensor, dpTensor) {
this.ttl = ttlTensor;
this.dp = dpTensor;
}
}
/**
* Text-to-Speech class
*/
export class TextToSpeech {
constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) {
this.cfgs = cfgs;
this.textProcessor = textProcessor;
this.dpOrt = dpOrt;
this.textEncOrt = textEncOrt;
this.vectorEstOrt = vectorEstOrt;
this.vocoderOrt = vocoderOrt;
this.sampleRate = cfgs.ae.sample_rate;
}
async _infer(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) {
const bsz = textList.length;
// Process text
const { textIds, textMask } = this.textProcessor.call(textList, langList);
const textIdsFlat = new BigInt64Array(textIds.flat().map(x => BigInt(x)));
const textIdsShape = [bsz, textIds[0].length];
const textIdsTensor = new ort.Tensor('int64', textIdsFlat, textIdsShape);
const textMaskFlat = new Float32Array(textMask.flat(2));
const textMaskShape = [bsz, 1, textMask[0][0].length];
const textMaskTensor = new ort.Tensor('float32', textMaskFlat, textMaskShape);
// Predict duration
const dpOutputs = await this.dpOrt.run({
text_ids: textIdsTensor,
style_dp: style.dp,
text_mask: textMaskTensor
});
const duration = Array.from(dpOutputs.duration.data);
// Apply speed factor to duration
for (let i = 0; i < duration.length; i++) {
duration[i] /= speed;
}
// Encode text
const textEncOutputs = await this.textEncOrt.run({
text_ids: textIdsTensor,
style_ttl: style.ttl,
text_mask: textMaskTensor
});
const textEmb = textEncOutputs.text_emb;
// Sample noisy latent
let { xt, latentMask } = this.sampleNoisyLatent(
duration,
this.sampleRate,
this.cfgs.ae.base_chunk_size,
this.cfgs.ttl.chunk_compress_factor,
this.cfgs.ttl.latent_dim
);
const latentMaskFlat = new Float32Array(latentMask.flat(2));
const latentMaskShape = [bsz, 1, latentMask[0][0].length];
const latentMaskTensor = new ort.Tensor('float32', latentMaskFlat, latentMaskShape);
// Prepare constant arrays
const totalStepArray = new Float32Array(bsz).fill(totalStep);
const totalStepTensor = new ort.Tensor('float32', totalStepArray, [bsz]);
// Denoising loop
for (let step = 0; step < totalStep; step++) {
if (progressCallback) {
progressCallback(step + 1, totalStep);
}
const currentStepArray = new Float32Array(bsz).fill(step);
const currentStepTensor = new ort.Tensor('float32', currentStepArray, [bsz]);
const xtFlat = new Float32Array(xt.flat(2));
const xtShape = [bsz, xt[0].length, xt[0][0].length];
const xtTensor = new ort.Tensor('float32', xtFlat, xtShape);
const vectorEstOutputs = await this.vectorEstOrt.run({
noisy_latent: xtTensor,
text_emb: textEmb,
style_ttl: style.ttl,
latent_mask: latentMaskTensor,
text_mask: textMaskTensor,
current_step: currentStepTensor,
total_step: totalStepTensor
});
const denoised = Array.from(vectorEstOutputs.denoised_latent.data);
// Reshape to 3D
const latentDim = xt[0].length;
const latentLen = xt[0][0].length;
xt = [];
let idx = 0;
for (let b = 0; b < bsz; b++) {
const batch = [];
for (let d = 0; d < latentDim; d++) {
const row = [];
for (let t = 0; t < latentLen; t++) {
row.push(denoised[idx++]);
}
batch.push(row);
}
xt.push(batch);
}
}
// Generate waveform
const finalXtFlat = new Float32Array(xt.flat(2));
const finalXtShape = [bsz, xt[0].length, xt[0][0].length];
const finalXtTensor = new ort.Tensor('float32', finalXtFlat, finalXtShape);
const vocoderOutputs = await this.vocoderOrt.run({
latent: finalXtTensor
});
const wav = Array.from(vocoderOutputs.wav_tts.data);
return { wav, duration };
}
async call(text, lang, style, totalStep, speed = 1.05, silenceDuration = 0.3, progressCallback = null) {
if (style.ttl.dims[0] !== 1) {
throw new Error('Single speaker text to speech only supports single style');
}
const maxLen = lang === 'ko' ? 120 : 300;
const textList = chunkText(text, maxLen);
const langList = new Array(textList.length).fill(lang);
let wavCat = [];
let durCat = 0;
for (let i = 0; i < textList.length; i++) {
const { wav, duration } = await this._infer([textList[i]], [langList[i]], style, totalStep, speed, progressCallback);
if (wavCat.length === 0) {
wavCat = wav;
durCat = duration[0];
} else {
const silenceLen = Math.floor(silenceDuration * this.sampleRate);
const silence = new Array(silenceLen).fill(0);
wavCat = [...wavCat, ...silence, ...wav];
durCat += duration[0] + silenceDuration;
}
}
return { wav: wavCat, duration: [durCat] };
}
async batch(textList, langList, style, totalStep, speed = 1.05, progressCallback = null) {
return await this._infer(textList, langList, style, totalStep, speed, progressCallback);
}
sampleNoisyLatent(duration, sampleRate, baseChunkSize, chunkCompress, latentDim) {
const bsz = duration.length;
const maxDur = Math.max(...duration);
const wavLenMax = Math.floor(maxDur * sampleRate);
const wavLengths = duration.map(d => Math.floor(d * sampleRate));
const chunkSize = baseChunkSize * chunkCompress;
const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
const latentDimVal = latentDim * chunkCompress;
const xt = [];
for (let b = 0; b < bsz; b++) {
const batch = [];
for (let d = 0; d < latentDimVal; d++) {
const row = [];
for (let t = 0; t < latentLen; t++) {
// Box-Muller transform
const u1 = Math.max(0.0001, Math.random());
const u2 = Math.random();
const val = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
row.push(val);
}
batch.push(row);
}
xt.push(batch);
}
const latentLengths = wavLengths.map(len => Math.floor((len + chunkSize - 1) / chunkSize));
const latentMask = this.lengthToMask(latentLengths, latentLen);
// Apply mask
for (let b = 0; b < bsz; b++) {
for (let d = 0; d < latentDimVal; d++) {
for (let t = 0; t < latentLen; t++) {
xt[b][d][t] *= latentMask[b][0][t];
}
}
}
return { xt, latentMask };
}
lengthToMask(lengths, maxLen = null) {
const actualMaxLen = maxLen || Math.max(...lengths);
return lengths.map(len => {
const row = new Array(actualMaxLen).fill(0.0);
for (let j = 0; j < Math.min(len, actualMaxLen); j++) {
row[j] = 1.0;
}
return [row];
});
}
}
/**
* Load voice style from JSON files
*/
export async function loadVoiceStyle(voiceStylePaths, verbose = false) {
const bsz = voiceStylePaths.length;
// Read first file to get dimensions
const firstResponse = await fetch(voiceStylePaths[0]);
const firstStyle = await firstResponse.json();
const ttlDims = firstStyle.style_ttl.dims;
const dpDims = firstStyle.style_dp.dims;
const ttlDim1 = ttlDims[1];
const ttlDim2 = ttlDims[2];
const dpDim1 = dpDims[1];
const dpDim2 = dpDims[2];
// Pre-allocate arrays with full batch size
const ttlSize = bsz * ttlDim1 * ttlDim2;
const dpSize = bsz * dpDim1 * dpDim2;
const ttlFlat = new Float32Array(ttlSize);
const dpFlat = new Float32Array(dpSize);
// Fill in the data
for (let i = 0; i < bsz; i++) {
const response = await fetch(voiceStylePaths[i]);
const voiceStyle = await response.json();
// Flatten TTL data
const ttlData = voiceStyle.style_ttl.data.flat(Infinity);
const ttlOffset = i * ttlDim1 * ttlDim2;
ttlFlat.set(ttlData, ttlOffset);
// Flatten DP data
const dpData = voiceStyle.style_dp.data.flat(Infinity);
const dpOffset = i * dpDim1 * dpDim2;
dpFlat.set(dpData, dpOffset);
}
const ttlShape = [bsz, ttlDim1, ttlDim2];
const dpShape = [bsz, dpDim1, dpDim2];
const ttlTensor = new ort.Tensor('float32', ttlFlat, ttlShape);
const dpTensor = new ort.Tensor('float32', dpFlat, dpShape);
if (verbose) {
console.log(`Loaded ${bsz} voice styles`);
}
return new Style(ttlTensor, dpTensor);
}
/**
* Load configuration from JSON
*/
export async function loadCfgs(onnxDir) {
const response = await fetch(`${onnxDir}/tts.json`);
const cfgs = await response.json();
return cfgs;
}
/**
* Load text processor
*/
export async function loadTextProcessor(onnxDir) {
const response = await fetch(`${onnxDir}/unicode_indexer.json`);
const indexer = await response.json();
return new UnicodeProcessor(indexer);
}
/**
* Load ONNX model
*/
export async function loadOnnx(onnxPath, options) {
const session = await ort.InferenceSession.create(onnxPath, options);
return session;
}
/**
* Load all TTS components
*/
export async function loadTextToSpeech(onnxDir, sessionOptions = {}, progressCallback = null) {
console.log('Using WebAssembly/WebGPU for inference');
const cfgs = await loadCfgs(onnxDir);
const dpPath = `${onnxDir}/duration_predictor.onnx`;
const textEncPath = `${onnxDir}/text_encoder.onnx`;
const vectorEstPath = `${onnxDir}/vector_estimator.onnx`;
const vocoderPath = `${onnxDir}/vocoder.onnx`;
const modelPaths = [
{ name: 'Duration Predictor', path: dpPath },
{ name: 'Text Encoder', path: textEncPath },
{ name: 'Vector Estimator', path: vectorEstPath },
{ name: 'Vocoder', path: vocoderPath }
];
const sessions = [];
for (let i = 0; i < modelPaths.length; i++) {
if (progressCallback) {
progressCallback(modelPaths[i].name, i + 1, modelPaths.length);
}
const session = await loadOnnx(modelPaths[i].path, sessionOptions);
sessions.push(session);
}
const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = sessions;
const textProcessor = await loadTextProcessor(onnxDir);
const textToSpeech = new TextToSpeech(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt);
return { textToSpeech, cfgs };
}
/**
* Chunk text into manageable segments
*/
function chunkText(text, maxLen = 300) {
if (typeof text !== 'string') {
throw new Error(`chunkText expects a string, got ${typeof text}`);
}
// Split by paragraph (two or more newlines)
const paragraphs = text.trim().split(/\n\s*\n+/).filter(p => p.trim());
const chunks = [];
for (let paragraph of paragraphs) {
paragraph = paragraph.trim();
if (!paragraph) continue;
// Split by sentence boundaries (period, question mark, exclamation mark followed by space)
// But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/);
let currentChunk = "";
for (let sentence of sentences) {
if (currentChunk.length + sentence.length + 1 <= maxLen) {
currentChunk += (currentChunk ? " " : "") + sentence;
} else {
if (currentChunk) {
chunks.push(currentChunk.trim());
}
currentChunk = sentence;
}
}
if (currentChunk) {
chunks.push(currentChunk.trim());
}
}
return chunks;
}
/**
* Write WAV file to ArrayBuffer
*/
export function writeWavFile(audioData, sampleRate) {
const numChannels = 1;
const bitsPerSample = 16;
const byteRate = sampleRate * numChannels * bitsPerSample / 8;
const blockAlign = numChannels * bitsPerSample / 8;
const dataSize = audioData.length * 2;
// Create ArrayBuffer
const buffer = new ArrayBuffer(44 + dataSize);
const view = new DataView(buffer);
// Write WAV header
const writeString = (offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, 'RIFF');
view.setUint32(4, 36 + dataSize, true);
writeString(8, 'WAVE');
writeString(12, 'fmt ');
view.setUint32(16, 16, true);
view.setUint16(20, 1, true); // PCM
view.setUint16(22, numChannels, true);
view.setUint32(24, sampleRate, true);
view.setUint32(28, byteRate, true);
view.setUint16(32, blockAlign, true);
view.setUint16(34, bitsPerSample, true);
writeString(36, 'data');
view.setUint32(40, dataSize, true);
// Write audio data
const int16Data = new Int16Array(audioData.length);
for (let i = 0; i < audioData.length; i++) {
const clamped = Math.max(-1.0, Math.min(1.0, audioData[i]));
int16Data[i] = Math.floor(clamped * 32767);
}
const dataView = new Uint8Array(buffer, 44);
dataView.set(new Uint8Array(int16Data.buffer));
return buffer;
}