Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726)
This commit is contained in:
10
.github/scripts/test-nodejs-npm.sh
vendored
10
.github/scripts/test-nodejs-npm.sh
vendored
@@ -10,7 +10,15 @@ ls -lh
|
|||||||
ls -lh node_modules
|
ls -lh node_modules
|
||||||
|
|
||||||
# offline tts
|
# offline tts
|
||||||
#
|
|
||||||
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||||
|
tar xf kokoro-en-v0_19.tar.bz2
|
||||||
|
rm kokoro-en-v0_19.tar.bz2
|
||||||
|
|
||||||
|
node ./test-offline-tts-kokoro-en.js
|
||||||
|
|
||||||
|
ls -lh
|
||||||
|
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||||
rm matcha-icefall-zh-baker.tar.bz2
|
rm matcha-icefall-zh-baker.tar.bz2
|
||||||
|
|||||||
@@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js
|
|||||||
|
|
||||||
In the following, we demonstrate how to run text-to-speech.
|
In the following, we demonstrate how to run text-to-speech.
|
||||||
|
|
||||||
|
## ./test-offline-tts-kokoro-en.js
|
||||||
|
|
||||||
|
[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
|
||||||
|
[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2)
|
||||||
|
for text-to-speech.
|
||||||
|
|
||||||
|
You can use the following command to run it:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||||
|
tar xf kokoro-en-v0_19.tar.bz2
|
||||||
|
rm kokoro-en-v0_19.tar.bz2
|
||||||
|
|
||||||
|
node ./test-offline-tts-kokoro-en.js
|
||||||
|
```
|
||||||
|
|
||||||
## ./test-offline-tts-matcha-zh.js
|
## ./test-offline-tts-matcha-zh.js
|
||||||
|
|
||||||
[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
|
[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
|
||||||
|
|||||||
37
nodejs-examples/test-offline-tts-kokoro-en.js
Normal file
37
nodejs-examples/test-offline-tts-kokoro-en.js
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
const sherpa_onnx = require('sherpa-onnx');
|
||||||
|
|
||||||
|
/**
 * Create an offline text-to-speech object that uses the Kokoro English
 * model files unpacked under ./kokoro-en-v0_19.
 *
 * The configuration is written as a single nested literal: the Kokoro model
 * settings sit inside the model config, which sits inside the top-level TTS
 * config.
 *
 * @returns whatever sherpa_onnx.createOfflineTts() returns for this config.
 */
function createOfflineTts() {
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsKokoroModelConfig: {
        model: './kokoro-en-v0_19/model.onnx',
        voices: './kokoro-en-v0_19/voices.bin',
        tokens: './kokoro-en-v0_19/tokens.txt',
        dataDir: './kokoro-en-v0_19/espeak-ng-data',
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}
|
||||||
|
|
||||||
|
// Input for the demo: the sentence to synthesize, which speaker to use, and
// the speaking-rate multiplier passed to generate().
const text =
    'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
const speakerId = 0;
const speed = 1.0;

// Build the engine, synthesize the sentence and write it to a wave file.
const tts = createOfflineTts();
const audio = tts.generate({text, sid: speakerId, speed});
tts.save('./test-kokoro-en.wav', audio);
console.log('Saved to test-kokoro-en.wav successfully.');

// Release the engine explicitly once the audio has been saved.
tts.free();
|
||||||
@@ -8,8 +8,12 @@ function freeConfig(config, Module) {
|
|||||||
freeConfig(config.config, Module)
|
freeConfig(config.config, Module)
|
||||||
}
|
}
|
||||||
|
|
||||||
if ('config2' in config) {
|
if ('matcha' in config) {
|
||||||
freeConfig(config.config2, Module)
|
freeConfig(config.matcha, Module)
|
||||||
|
}
|
||||||
|
|
||||||
|
if ('kokoro' in config) {
|
||||||
|
freeConfig(config.kokoro, Module)
|
||||||
}
|
}
|
||||||
|
|
||||||
Module._free(config.ptr);
|
Module._free(config.ptr);
|
||||||
@@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Serialize a Kokoro TTS model config into memory obtained from
 * Module._malloc().
 *
 * Two allocations are made:
 *   - `buffer`: the four strings (model, voices, tokens, dataDir) stored
 *     back to back, each followed by its terminating NUL byte.
 *   - `ptr`: a 5 * 4 byte record holding, in order, the addresses of the
 *     four strings (written as 'i8*' at offsets 0, 4, 8, 12) and the length
 *     scale (written as 'float' at offset 16).
 *
 * Any string field that is missing or falsy is stored as the empty string.
 * A missing or falsy lengthScale is stored as 1.0.
 *
 * @param config Object with optional fields model, voices, tokens, dataDir
 *     (strings) and lengthScale (number).
 * @param Module Object providing lengthBytesUTF8, stringToUTF8, setValue and
 *     _malloc (presumably the Emscripten module instance).
 * @returns {{buffer: number, ptr: number, len: number}} The two allocated
 *     addresses and the size in bytes of the record at `ptr`. Nothing is
 *     freed here; releasing both allocations is left to the caller.
 */
function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  // Normalize every string exactly once so that the size we measure is
  // always the size of the string we later write. Measuring the raw field
  // would fail for an omitted model/voices entry.
  const strings = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  // +1 for the terminating NUL of each string.
  const sizes = strings.map((s) => Module.lengthBytesUTF8(s) + 1);
  const n = sizes.reduce((total, size) => total + size, 0);

  const buffer = Module._malloc(n);

  // Four string addresses plus one float, 4 bytes each.
  const len = 5 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  strings.forEach((s, i) => {
    Module.stringToUTF8(s, buffer + offset, sizes[i]);
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  });

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
|
||||||
|
|
||||||
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
||||||
if (!('offlineTtsVitsModelConfig' in config)) {
|
if (!('offlineTtsVitsModelConfig' in config)) {
|
||||||
config.offlineTtsVitsModelConfig = {
|
config.offlineTtsVitsModelConfig = {
|
||||||
@@ -159,6 +209,16 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!('offlineTtsKokoroModelConfig' in config)) {
|
||||||
|
config.offlineTtsKokoroModelConfig = {
|
||||||
|
model: '',
|
||||||
|
voices: '',
|
||||||
|
tokens: '',
|
||||||
|
lengthScale: 1.0,
|
||||||
|
dataDir: '',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
||||||
config.offlineTtsVitsModelConfig, Module);
|
config.offlineTtsVitsModelConfig, Module);
|
||||||
@@ -166,7 +226,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|||||||
const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
|
const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
|
||||||
config.offlineTtsMatchaModelConfig, Module);
|
config.offlineTtsMatchaModelConfig, Module);
|
||||||
|
|
||||||
const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
|
const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
|
||||||
|
config.offlineTtsKokoroModelConfig, Module);
|
||||||
|
|
||||||
|
const len = vitsModelConfig.len + matchaModelConfig.len +
|
||||||
|
kokoroModelConfig.len + 3 * 4;
|
||||||
|
|
||||||
const ptr = Module._malloc(len);
|
const ptr = Module._malloc(len);
|
||||||
|
|
||||||
let offset = 0;
|
let offset = 0;
|
||||||
@@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|||||||
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
|
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
|
||||||
offset += matchaModelConfig.len;
|
offset += matchaModelConfig.len;
|
||||||
|
|
||||||
|
Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
|
||||||
|
offset += kokoroModelConfig.len;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
|
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
|
||||||
config2: matchaModelConfig
|
matcha: matchaModelConfig, kokoro: kokoroModelConfig,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) {
|
|||||||
lengthScale: 1.0,
|
lengthScale: 1.0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const offlineTtsKokoroModelConfig = {
|
||||||
|
model: '',
|
||||||
|
voices: '',
|
||||||
|
tokens: '',
|
||||||
|
dataDir: '',
|
||||||
|
lengthScale: 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
const offlineTtsModelConfig = {
|
const offlineTtsModelConfig = {
|
||||||
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
|
||||||
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
|
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
|
||||||
|
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
|
||||||
numThreads: 1,
|
numThreads: 1,
|
||||||
debug: 1,
|
debug: 1,
|
||||||
provider: 'cpu',
|
provider: 'cpu',
|
||||||
|
|||||||
@@ -15,9 +15,11 @@ extern "C" {
|
|||||||
|
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
|
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
|
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
|
||||||
|
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, "");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
|
||||||
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
|
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
|
||||||
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
|
||||||
|
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
|
||||||
"");
|
"");
|
||||||
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
|
||||||
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
|
||||||
@@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
|||||||
auto tts_model_config = &tts_config->model;
|
auto tts_model_config = &tts_config->model;
|
||||||
auto vits_model_config = &tts_model_config->vits;
|
auto vits_model_config = &tts_model_config->vits;
|
||||||
auto matcha_model_config = &tts_model_config->matcha;
|
auto matcha_model_config = &tts_model_config->matcha;
|
||||||
|
auto kokoro = &tts_model_config->kokoro;
|
||||||
fprintf(stdout, "----------vits model config----------\n");
|
fprintf(stdout, "----------vits model config----------\n");
|
||||||
fprintf(stdout, "model: %s\n", vits_model_config->model);
|
fprintf(stdout, "model: %s\n", vits_model_config->model);
|
||||||
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
|
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
|
||||||
@@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
|
|||||||
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
|
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
|
||||||
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
|
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
|
||||||
|
|
||||||
|
fprintf(stdout, "----------kokoro model config----------\n");
|
||||||
|
fprintf(stdout, "model: %s\n", kokoro->model);
|
||||||
|
fprintf(stdout, "voices: %s\n", kokoro->voices);
|
||||||
|
fprintf(stdout, "tokens: %s\n", kokoro->tokens);
|
||||||
|
fprintf(stdout, "data_dir: %s\n", kokoro->data_dir);
|
||||||
|
fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
|
||||||
|
|
||||||
fprintf(stdout, "----------tts model config----------\n");
|
fprintf(stdout, "----------tts model config----------\n");
|
||||||
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
|
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
|
||||||
fprintf(stdout, "debug: %d\n", tts_model_config->debug);
|
fprintf(stdout, "debug: %d\n", tts_model_config->debug);
|
||||||
|
|||||||
Reference in New Issue
Block a user