From 3a1de0bfc16e282e60ec264cf3de4811355a86af Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 17 Jan 2025 11:17:18 +0800 Subject: [PATCH] Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726) --- .github/scripts/test-nodejs-npm.sh | 10 ++- nodejs-examples/README.md | 16 ++++ nodejs-examples/test-offline-tts-kokoro-en.js | 37 ++++++++ wasm/tts/sherpa-onnx-tts.js | 85 ++++++++++++++++++- wasm/tts/sherpa-onnx-wasm-main-tts.cc | 12 ++- 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100644 nodejs-examples/test-offline-tts-kokoro-en.js diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 96794484..8d17eae4 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -10,7 +10,15 @@ ls -lh ls -lh node_modules # offline tts -# + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +node ./test-offline-tts-kokoro-en.js + +ls -lh + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 tar xvf matcha-icefall-zh-baker.tar.bz2 rm matcha-icefall-zh-baker.tar.bz2 diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 3db3a295..c544303a 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js In the following, we demonstrate how to run text-to-speech. +## ./test-offline-tts-kokoro-en.js + +[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use +[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2) +for text-to-speech. + +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +node ./test-offline-tts-kokoro-en.js +``` + ## ./test-offline-tts-matcha-zh.js [./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use diff --git a/nodejs-examples/test-offline-tts-kokoro-en.js b/nodejs-examples/test-offline-tts-kokoro-en.js new file mode 100644 index 00000000..1c011d58 --- /dev/null +++ b/nodejs-examples/test-offline-tts-kokoro-en.js @@ -0,0 +1,37 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineTts() { + let offlineTtsKokoroModelConfig = { + model: './kokoro-en-v0_19/model.onnx', + voices: './kokoro-en-v0_19/voices.bin', + tokens: './kokoro-en-v0_19/tokens.txt', + dataDir: './kokoro-en-v0_19/espeak-ng-data', + lengthScale: 1.0, + }; + let offlineTtsModelConfig = { + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, + numThreads: 1, + debug: 1, + provider: 'cpu', + }; + + let offlineTtsConfig = { + offlineTtsModelConfig: offlineTtsModelConfig, + maxNumSentences: 1, + }; + + return sherpa_onnx.createOfflineTts(offlineTtsConfig); +} + +const tts = createOfflineTts(); +const speakerId = 0; +const speed = 1.0; +const text = + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' + +const audio = tts.generate({text: text, sid: speakerId, speed: speed}); +tts.save('./test-kokoro-en.wav', audio); +console.log('Saved to test-kokoro-en.wav successfully.'); +tts.free(); diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index 59158ae7..833ee936 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -8,8 +8,12 @@ function freeConfig(config, Module) { freeConfig(config.config, Module) } - if ('config2' in config) { - freeConfig(config.config2, Module) + if ('matcha' in config) { + freeConfig(config.matcha, Module) + } + + if ('kokoro' in config) { + freeConfig(config.kokoro, Module) } Module._free(config.ptr); @@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { } } +function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { + const modelLen = Module.lengthBytesUTF8(config.model) + 1; + const voicesLen = Module.lengthBytesUTF8(config.voices) + 1; + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; + + const n = modelLen + voicesLen + tokensLen + dataDirLen; + + const buffer = Module._malloc(n); + + const len = 5 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.model || '', buffer + offset, modelLen); + offset += modelLen; + + Module.stringToUTF8(config.voices || '', buffer + offset, voicesLen); + offset += voicesLen; + + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); + offset += tokensLen; + + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); + offset += dataDirLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += modelLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += voicesLen; + + Module.setValue(ptr + 8, buffer + offset, 'i8*'); + offset += tokensLen; + + Module.setValue(ptr + 12, buffer + offset, 'i8*'); + offset += dataDirLen; + + Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineTtsModelConfig(config, Module) { if (!('offlineTtsVitsModelConfig' in config)) { config.offlineTtsVitsModelConfig = { @@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { }; } + if (!('offlineTtsKokoroModelConfig' in config)) { + config.offlineTtsKokoroModelConfig = { + model: '', + voices: '', + tokens: '', + lengthScale: 1.0, + dataDir: '', + }; + } + const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( config.offlineTtsVitsModelConfig, Module); @@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( config.offlineTtsMatchaModelConfig, Module); - const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4; + const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( + config.offlineTtsKokoroModelConfig, Module); + + const len = vitsModelConfig.len + matchaModelConfig.len + + kokoroModelConfig.len + 3 * 4; + const ptr = Module._malloc(len); let offset = 0; @@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); offset += matchaModelConfig.len; + Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); + offset += kokoroModelConfig.len; + return { buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, - config2: matchaModelConfig + matcha: matchaModelConfig, kokoro: kokoroModelConfig, } } @@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) { lengthScale: 1.0, }; + const offlineTtsKokoroModelConfig = { + model: '', + voices: '', + tokens: '', + dataDir: '', + lengthScale: 1.0, + }; + const offlineTtsModelConfig = { offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, + offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, numThreads: 1, debug: 1, provider: 'cpu', diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 3508b860..f2cd42c5 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -15,9 +15,11 @@ extern "C" { static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + - sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4, + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + + sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, @@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { auto tts_model_config = &tts_config->model; auto vits_model_config = &tts_model_config->vits; auto matcha_model_config = &tts_model_config->matcha; + auto kokoro = &tts_model_config->kokoro; fprintf(stdout, "----------vits model config----------\n"); fprintf(stdout, "model: %s\n", vits_model_config->model); fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); @@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); + fprintf(stdout, "----------kokoro model config----------\n"); + fprintf(stdout, "model: %s\n", kokoro->model); + fprintf(stdout, "voices: %s\n", kokoro->voices); + fprintf(stdout, "tokens: %s\n", kokoro->tokens); + fprintf(stdout, "data_dir: %s\n", kokoro->data_dir); + fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale); + fprintf(stdout, "----------tts model config----------\n"); fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); fprintf(stdout, "debug: %d\n", tts_model_config->debug);