Add C/CXX/JavaScript API for NeMo Canary models (#2357)

This PR introduces support for NeMo Canary models across C, C++, and JavaScript APIs 
by adding new Canary configuration structures, updating bindings, extending examples,
and enhancing CI workflows.

- Add OfflineCanaryModelConfig to all language bindings (C, C++, JS, ETS).
- Implement SetConfig methods and NAPI wrappers for updating recognizer config at runtime.
- Update examples and CI scripts to demonstrate and test NeMo Canary model usage.
This commit is contained in:
Fangjun Kuang
2025-07-07 23:38:04 +08:00
committed by GitHub
parent 0e738c356c
commit df4615ca1d
28 changed files with 750 additions and 80 deletions

View File

@@ -12,7 +12,6 @@ set(exported_functions
SherpaOnnxCreateOnlineRecognizer
SherpaOnnxCreateOnlineStream
SherpaOnnxDecodeOnlineStream
SherpaOnnxDestroyOfflineStreamResultJson
SherpaOnnxDestroyOnlineRecognizer
SherpaOnnxDestroyOnlineRecognizerResult
SherpaOnnxDestroyOnlineStream

View File

@@ -59,6 +59,10 @@ function freeConfig(config, Module) {
freeConfig(config.senseVoice, Module)
}
if ('canary' in config) {
freeConfig(config.canary, Module)
}
if ('lm' in config) {
freeConfig(config.lm, Module)
}
@@ -246,7 +250,7 @@ function initSherpaOnnxOnlineModelConfig(config, Module) {
Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
offset += 4;
Module.setValue(ptr + offset, config.debug || 0, 'i32');
Module.setValue(ptr + offset, config.debug ?? 1, 'i32');
offset += 4;
Module.setValue(
@@ -692,6 +696,51 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
}
}
/**
 * Serialize a NeMo Canary model config into WASM heap memory.
 *
 * Two allocations are made with Module._malloc:
 *   - `buffer`: the encoder, decoder, srcLang and tgtLang strings written
 *     back to back as NUL-terminated UTF-8 (missing/falsy values become '').
 *   - `ptr`: a 20-byte record holding four 4-byte pointers into `buffer`
 *     (same order as above) followed by one int32 taken from
 *     `config.usePnc` (1 when usePnc is null or undefined).
 *
 * @param {Object} config Object with optional encoder, decoder, srcLang,
 *     tgtLang (strings) and usePnc fields.
 * @param {Object} Module Emscripten module; lengthBytesUTF8, stringToUTF8,
 *     setValue and _malloc are used.
 * @returns {{buffer: number, ptr: number, len: number}} The two heap
 *     addresses and the byte size of the record at `ptr`. Nothing is freed
 *     here; presumably the caller releases both via freeConfig - confirm.
 */
function initSherpaOnnxOfflineCanaryModelConfig(config, Module) {
  // String fields, in the order their pointers are stored in the record.
  const fields = [
    config.encoder || '',
    config.decoder || '',
    config.srcLang || '',
    config.tgtLang || '',
  ];

  // Byte size of each string including its trailing NUL.
  const sizes = fields.map((text) => Module.lengthBytesUTF8(text) + 1);
  const totalBytes = sizes.reduce((sum, size) => sum + size, 0);

  const buffer = Module._malloc(totalBytes);

  const len = 5 * 4;  // 4 pointers + 1 int32
  const ptr = Module._malloc(len);

  // Pass 1: copy every string into the shared buffer, remembering where
  // each one starts.
  const starts = [];
  let cursor = 0;
  fields.forEach((text, i) => {
    Module.stringToUTF8(text, buffer + cursor, sizes[i]);
    starts.push(cursor);
    cursor += sizes[i];
  });

  // Pass 2: store one pointer per string, 4 bytes apart.
  starts.forEach((start, i) => {
    Module.setValue(ptr + 4 * i, buffer + start, 'i8*');
  });

  // The int32 flag follows the four pointers.
  Module.setValue(ptr + 16, config.usePnc ?? 1, 'i32');

  return {buffer: buffer, ptr: ptr, len: len};
}
function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) {
const preprocessorLen = Module.lengthBytesUTF8(config.preprocessor || '') + 1;
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
@@ -811,7 +860,7 @@ function initSherpaOnnxOfflineSenseVoiceModelConfig(config, Module) {
Module.setValue(ptr + 4, buffer + offset, 'i8*');
offset += languageLen;
Module.setValue(ptr + 8, config.useInverseTextNormalization || 0, 'i32');
Module.setValue(ptr + 8, config.useInverseTextNormalization ?? 0, 'i32');
return {
buffer: buffer, ptr: ptr, len: len,
@@ -907,6 +956,16 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
};
}
if (!('canary' in config)) {
config.canary = {
encoder: '',
decoder: '',
srcLang: '',
tgtLang: '',
usePnc: 1,
};
}
const transducer =
initSherpaOnnxOfflineTransducerModelConfig(config.transducer, Module);
@@ -936,9 +995,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
const zipformerCtc =
initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module);
const canary = initSherpaOnnxOfflineCanaryModelConfig(config.canary, Module);
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
dolphin.len + zipformerCtc.len;
dolphin.len + zipformerCtc.len + canary.len;
const ptr = Module._malloc(len);
@@ -1000,7 +1061,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
offset += 4;
Module.setValue(ptr + offset, config.debug || 0, 'i32');
Module.setValue(ptr + offset, config.debug ?? 1, 'i32');
offset += 4;
Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
@@ -1043,11 +1104,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset);
offset += zipformerCtc.len;
Module._CopyHeap(canary.ptr, canary.len, ptr + offset);
offset += canary.len;
return {
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
dolphin: dolphin, zipformerCtc: zipformerCtc
dolphin: dolphin, zipformerCtc: zipformerCtc, canary: canary,
}
}
@@ -1189,6 +1253,13 @@ class OfflineRecognizer {
this.Module = Module;
}
setConfig(configObj) {
const config =
initSherpaOnnxOfflineRecognizerConfig(configObj, this.Module);
this.Module._SherpaOnnxOfflineRecognizerSetConfig(this.handle, config.ptr);
freeConfig(config, this.Module);
}
free() {
this.Module._SherpaOnnxDestroyOfflineRecognizer(this.handle);
this.handle = 0

View File

@@ -41,6 +41,7 @@ set(exported_functions
SherpaOnnxDestroyOfflineStreamResultJson
SherpaOnnxGetOfflineStreamResult
SherpaOnnxGetOfflineStreamResultAsJson
SherpaOnnxOfflineRecognizerSetConfig
# online kws
SherpaOnnxCreateKeywordSpotter
SherpaOnnxCreateKeywordStream

View File

@@ -21,6 +21,7 @@ static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, "");
static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineCanaryModelConfig) == 5 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
@@ -33,7 +34,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
sizeof(SherpaOnnxOfflineDolphinModelConfig) +
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig),
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) +
sizeof(SherpaOnnxOfflineCanaryModelConfig),
"");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
@@ -80,6 +82,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
auto fire_red_asr = &model_config->fire_red_asr;
auto dolphin = &model_config->dolphin;
auto zipformer_ctc = &model_config->zipformer_ctc;
auto canary = &model_config->canary;
fprintf(stdout, "----------offline transducer model config----------\n");
fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -123,6 +126,13 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
fprintf(stdout, "----------offline zipformer ctc model config----------\n");
fprintf(stdout, "model: %s\n", zipformer_ctc->model);
fprintf(stdout, "----------offline NeMo Canary model config----------\n");
fprintf(stdout, "encoder: %s\n", canary->encoder);
fprintf(stdout, "decoder: %s\n", canary->decoder);
fprintf(stdout, "src_lang: %s\n", canary->src_lang);
fprintf(stdout, "tgt_lang: %s\n", canary->tgt_lang);
fprintf(stdout, "use_pnc: %d\n", canary->use_pnc);
fprintf(stdout, "tokens: %s\n", model_config->tokens);
fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
fprintf(stdout, "provider: %s\n", model_config->provider);