Add C# and JavaScript (wasm) API for MatchaTTS models (#1682)

This commit is contained in:
Fangjun Kuang
2025-01-05 15:08:19 +08:00
committed by GitHub
parent 1ef9e5ee3a
commit 3eced3e7ee
26 changed files with 677 additions and 88 deletions

View File

@@ -8,6 +8,10 @@ function freeConfig(config, Module) {
freeConfig(config.config, Module)
}
if ('config2' in config) {
freeConfig(config.config2, Module)
}
Module._free(config.ptr);
}
@@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
}
}
/**
 * Serialize a MatchaTTS model config into wasm heap memory.
 *
 * Two allocations are made with `Module._malloc`:
 *   - `buffer`: the six strings, UTF-8 encoded, each NUL-terminated, packed
 *     back to back in the order acousticModel, vocoder, lexicon, tokens,
 *     dataDir, dictDir.
 *   - `ptr`: a 32-byte (8 * 4) record with the following layout, where every
 *     pointer occupies a 4-byte slot:
 *         +0   pointer to acousticModel
 *         +4   pointer to vocoder
 *         +8   pointer to lexicon
 *         +12  pointer to tokens
 *         +16  pointer to dataDir
 *         +20  noiseScale  (float)
 *         +24  lengthScale (float)
 *         +28  pointer to dictDir
 *
 * Neither allocation is freed here; both are handed back to the caller.
 *
 * @param {Object} config Matcha model options. Any missing or empty string
 *     field is written as the empty string. A falsy `noiseScale` falls back
 *     to 0.667 and a falsy `lengthScale` falls back to 1.0.
 * @param {Object} Module Emscripten module object providing
 *     `lengthBytesUTF8`, `stringToUTF8`, `setValue` and `_malloc`.
 * @returns {{buffer: number, ptr: number, len: number}} Address of the string
 *     storage, address of the 32-byte record, and the record size in bytes.
 */
function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
  // Normalize every string field exactly once so that the size used for the
  // allocation always matches the string that is written afterwards.
  // Previously acousticModel and vocoder were measured without the `|| ''`
  // fallback, so a config that omitted them threw inside lengthBytesUTF8
  // even though the write path already tolerated a missing value.
  const acousticModel = config.acousticModel || '';
  const vocoder = config.vocoder || '';
  const lexicon = config.lexicon || '';
  const tokens = config.tokens || '';
  const dataDir = config.dataDir || '';
  const dictDir = config.dictDir || '';

  // +1 reserves room for the terminating NUL of each string.
  const acousticModelLen = Module.lengthBytesUTF8(acousticModel) + 1;
  const vocoderLen = Module.lengthBytesUTF8(vocoder) + 1;
  const lexiconLen = Module.lengthBytesUTF8(lexicon) + 1;
  const tokensLen = Module.lengthBytesUTF8(tokens) + 1;
  const dataDirLen = Module.lengthBytesUTF8(dataDir) + 1;
  const dictDirLen = Module.lengthBytesUTF8(dictDir) + 1;

  const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen +
      dataDirLen + dictDirLen;

  const buffer = Module._malloc(n);

  // 8 slots of 4 bytes each: 6 string pointers + 2 floats.
  const len = 8 * 4;
  const ptr = Module._malloc(len);

  // Pass 1: copy the strings into `buffer`, one after another.
  let offset = 0;
  Module.stringToUTF8(acousticModel, buffer + offset, acousticModelLen);
  offset += acousticModelLen;

  Module.stringToUTF8(vocoder, buffer + offset, vocoderLen);
  offset += vocoderLen;

  Module.stringToUTF8(lexicon, buffer + offset, lexiconLen);
  offset += lexiconLen;

  Module.stringToUTF8(tokens, buffer + offset, tokensLen);
  offset += tokensLen;

  Module.stringToUTF8(dataDir, buffer + offset, dataDirLen);
  offset += dataDirLen;

  Module.stringToUTF8(dictDir, buffer + offset, dictDirLen);
  offset += dictDirLen;

  // Pass 2: walk the same offsets again and store each string's address in
  // the record.
  offset = 0;
  Module.setValue(ptr, buffer + offset, 'i8*');
  offset += acousticModelLen;

  Module.setValue(ptr + 4, buffer + offset, 'i8*');
  offset += vocoderLen;

  Module.setValue(ptr + 8, buffer + offset, 'i8*');
  offset += lexiconLen;

  Module.setValue(ptr + 12, buffer + offset, 'i8*');
  offset += tokensLen;

  Module.setValue(ptr + 16, buffer + offset, 'i8*');
  offset += dataDirLen;

  // The two floats sit between the dataDir and dictDir pointers; `offset`
  // is deliberately left untouched so it still points at dictDir's bytes.
  Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float');
  Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float');

  Module.setValue(ptr + 28, buffer + offset, 'i8*');
  offset += dictDirLen;

  return {
    buffer: buffer, ptr: ptr, len: len,
  }
}
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
if (!('offlineTtsVitsModelConfig' in config)) {
config.offlineTtsVitsModelConfig = {
model: '',
lexicon: '',
tokens: '',
noiseScale: 0.667,
noiseScaleW: 0.8,
lengthScale: 1.0,
dataDir: '',
dictDir: '',
};
}
if (!('offlineTtsMatchaModelConfig' in config)) {
config.offlineTtsMatchaModelConfig = {
acousticModel: '',
vocoder: '',
lexicon: '',
tokens: '',
noiseScale: 0.667,
lengthScale: 1.0,
dataDir: '',
dictDir: '',
};
}
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
config.offlineTtsVitsModelConfig, Module);
const len = vitsModelConfig.len + 3 * 4;
const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
config.offlineTtsMatchaModelConfig, Module);
const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
const ptr = Module._malloc(len);
let offset = 0;
@@ -87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
const buffer = Module._malloc(providerLen);
Module.stringToUTF8(config.provider, buffer, providerLen);
Module.setValue(ptr + offset, buffer, 'i8*');
offset += 4;
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
offset += matchaModelConfig.len;
return {
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
config2: matchaModelConfig
}
}
@@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) {
noiseScaleW: 0.8,
lengthScale: 1.0,
};
const offlineTtsMatchaModelConfig = {
acousticModel: '',
vocoder: '',
lexicon: '',
tokens: '',
dataDir: '',
dictDir: '',
noiseScale: 0.667,
lengthScale: 1.0,
};
const offlineTtsModelConfig = {
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
numThreads: 1,
debug: 1,
provider: 'cpu',
};
let offlineTtsConfig = {
offlineTtsModelConfig: offlineTtsModelConfig,
ruleFsts: '',

View File

@@ -14,8 +14,10 @@
extern "C" {
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4,
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
"");
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
@@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
auto tts_model_config = &tts_config->model;
auto vits_model_config = &tts_model_config->vits;
auto matcha_model_config = &tts_model_config->matcha;
fprintf(stdout, "----------vits model config----------\n");
fprintf(stdout, "model: %s\n", vits_model_config->model);
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
@@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale);
fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir);
fprintf(stdout, "----------matcha model config----------\n");
fprintf(stdout, "acoustic_model: %s\n", matcha_model_config->acoustic_model);
fprintf(stdout, "vocoder: %s\n", matcha_model_config->vocoder);
fprintf(stdout, "lexicon: %s\n", matcha_model_config->lexicon);
fprintf(stdout, "tokens: %s\n", matcha_model_config->tokens);
fprintf(stdout, "data_dir: %s\n", matcha_model_config->data_dir);
fprintf(stdout, "noise scale: %.3f\n", matcha_model_config->noise_scale);
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
fprintf(stdout, "----------tts model config----------\n");
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
fprintf(stdout, "debug: %d\n", tts_model_config->debug);