diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index ea93abd0..a4f097a0 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -10,6 +10,16 @@ arch=$(node -p "require('os').arch()") platform=$(node -p "require('os').platform()") node_version=$(node -p "process.versions.node.split('.')[0]") +echo "----------non-streaming ASR NeMo Canary----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + +node ./test_asr_non_streaming_nemo_canary.js + +rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8 + echo "----------non-streaming ASR Zipformer CTC----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 98402bb5..620fdb5b 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,14 @@ git status ls -lh ls -lh node_modules +# asr with offline nemo canary +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + +node ./test-offline-nemo-canary.js +rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8 + # asr with offline zipformer ctc curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 diff --git a/.github/scripts/test-online-ctc.sh b/.github/scripts/test-online-ctc.sh index fe5716f0..65cf6ca9 100755 --- 
a/.github/scripts/test-online-ctc.sh +++ b/.github/scripts/test-online-ctc.sh @@ -77,16 +77,6 @@ time $EXE \ $repo/test_wavs/DEV_T0000000001.wav \ $repo/test_wavs/DEV_T0000000002.wav -log "test int8" - -time $EXE \ - --debug=1 \ - --zipformer2-ctc-model=$repo/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ - --tokens=$repo/tokens.txt \ - $repo/test_wavs/DEV_T0000000000.wav \ - $repo/test_wavs/DEV_T0000000001.wav \ - $repo/test_wavs/DEV_T0000000002.wav - rm -rf $repo log "------------------------------------------------------------" diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index e53e7b67..eafba2ab 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -127,6 +127,36 @@ jobs: rm -rf dict lexicon.txt test-hr.wav replace.fst rm -v $name + - name: Test NeMo Canary + shell: bash + run: | + name=nemo-canary-c-api + gcc -o $name ./c-api-examples/$name.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh $name + + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then + ldd ./$name + echo "----" + readelf -d ./$name + fi + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./$name + + rm $name + rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8 + - name: Test Dolphin CTC shell: bash run: | diff --git a/.github/workflows/cxx-api.yaml b/.github/workflows/cxx-api.yaml index f8d7b688..6267e535 100644 --- a/.github/workflows/cxx-api.yaml +++ b/.github/workflows/cxx-api.yaml @@ -87,6 +87,40 @@ jobs: otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib fi + - 
name: Test NeMo Canary + shell: bash + run: | + name=nemo-canary-cxx-api + g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-cxx-api \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh $name + + if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then + ldd ./$name + echo "----" + readelf -d ./$name + fi + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + + ls -lh sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8 + echo "---" + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./$name + + rm -rf sherpa-onnx-nemo-canary-* + rm -v ./$name + - name: Test streaming zipformer with Homophone replacer shell: bash run: | diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 9e899a25..4e62d5e5 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -53,6 +53,9 @@ target_link_libraries(whisper-c-api sherpa-onnx-c-api) add_executable(fire-red-asr-c-api fire-red-asr-c-api.c) target_link_libraries(fire-red-asr-c-api sherpa-onnx-c-api) +add_executable(nemo-canary-c-api nemo-canary-c-api.c) +target_link_libraries(nemo-canary-c-api sherpa-onnx-c-api) + add_executable(sense-voice-c-api sense-voice-c-api.c) target_link_libraries(sense-voice-c-api sherpa-onnx-c-api) diff --git a/c-api-examples/nemo-canary-c-api.c b/c-api-examples/nemo-canary-c-api.c new file mode 100644 index 00000000..0aecc7a1 --- /dev/null +++ b/c-api-examples/nemo-canary-c-api.c @@ -0,0 +1,115 @@ +// c-api-examples/nemo-canary-c-api.c +// +// Copyright (c) 2025 Xiaomi Corporation + +// We assume you have pre-downloaded the Nemo Canary 
model +// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +// An example is given below: +// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// +// clang-format on +// +// see https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html +// for details + +#include +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = + "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav"; + const char *encoder_filename = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx"; + const char *decoder_filename = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx"; + const char *tokens_filename = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt"; + const char *provider = "cpu"; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + + // set debug to 1 to view more logs + offline_model_config.debug = 0; + + offline_model_config.num_threads = 1; + offline_model_config.provider = provider; + offline_model_config.tokens = tokens_filename; + offline_model_config.canary.encoder = encoder_filename; + offline_model_config.canary.decoder = decoder_filename; + + // so it output punctuations and cases + offline_model_config.canary.use_pnc = 1; + + offline_model_config.canary.src_lang = "de"; + + // since there is a German audio, you can set tgt_lang to en or de + offline_model_config.canary.tgt_lang = "en"; + + // Recognizer config + 
SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + const SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your config!\n"); + + SherpaOnnxFreeWave(wave); + + return -1; + } + + const SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples, + wave->num_samples); + SherpaOnnxDecodeOfflineStream(recognizer, stream); + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + fprintf(stderr, "Decoded text (English): %s\n", result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + // now output German text + recognizer_config.model_config.canary.tgt_lang = "de"; + SherpaOnnxOfflineRecognizerSetConfig(recognizer, &recognizer_config); + + stream = SherpaOnnxCreateOfflineStream(recognizer); + + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples, + wave->num_samples); + SherpaOnnxDecodeOfflineStream(recognizer, stream); + result = SherpaOnnxGetOfflineStreamResult(stream); + + fprintf(stderr, "Decoded text (German): %s\n", result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxFreeWave(wave); + + return 0; +} diff --git a/c-api-examples/streaming-ctc-buffered-tokens-c-api.c b/c-api-examples/streaming-ctc-buffered-tokens-c-api.c index 98f5b4a6..8c21807c 100644 --- a/c-api-examples/streaming-ctc-buffered-tokens-c-api.c +++ b/c-api-examples/streaming-ctc-buffered-tokens-c-api.c @@ -54,7 +54,7 @@ int32_t main() { "DEV_T0000000000.wav"; const char *model_filename 
= "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/" - "ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx"; + "ctc-epoch-20-avg-1-chunk-16-left-128.onnx"; const char *tokens_filename = "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt"; const char *provider = "cpu"; diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 4b40d228..bb6c980f 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -27,6 +27,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api) add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc) target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api) +add_executable(nemo-canary-cxx-api ./nemo-canary-cxx-api.cc) +target_link_libraries(nemo-canary-cxx-api sherpa-onnx-cxx-api) + if(SHERPA_ONNX_ENABLE_PORTAUDIO) add_executable(sense-voice-simulate-streaming-microphone-cxx-api ./sense-voice-simulate-streaming-microphone-cxx-api.cc diff --git a/cxx-api-examples/nemo-canary-cxx-api.cc b/cxx-api-examples/nemo-canary-cxx-api.cc new file mode 100644 index 00000000..dbb3879c --- /dev/null +++ b/cxx-api-examples/nemo-canary-cxx-api.cc @@ -0,0 +1,101 @@ +// cxx-api-examples/nemo-canary-cxx-api.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use NeMo Canary models with +// sherpa-onnx's C++ API. 
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +// +// clang-format on +// +// see https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html +// for details + +#include // NOLINT +#include +#include + +#include "sherpa-onnx/c-api/cxx-api.h" + +int32_t main() { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineRecognizerConfig config; + + config.model_config.canary.encoder = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx"; + config.model_config.canary.decoder = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx"; + + // our input audio is German, so we set src_lang to "de" + config.model_config.canary.src_lang = "de"; + + // we can set tgt_lang either to de or en in this specific case + config.model_config.canary.tgt_lang = "en"; + config.model_config.tokens = + "sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt"; + + config.model_config.num_threads = 1; + + std::cout << "Loading model\n"; + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); + if (!recognizer.Get()) { + std::cerr << "Please check your config\n"; + return -1; + } + std::cout << "Loading model done\n"; + + std::string wave_filename = + "./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav"; + + Wave wave = ReadWave(wave_filename); + if (wave.samples.empty()) { + std::cerr << "Failed to read: '" << wave_filename << "'\n"; + return -1; + } + + std::cout << "Start recognition\n"; + const auto begin = std::chrono::steady_clock::now(); + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), + wave.samples.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = 
recognizer.GetResult(&stream); + + const auto end = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast(end - begin) + .count() / + 1000.; + float duration = wave.samples.size() / static_cast(wave.sample_rate); + float rtf = elapsed_seconds / duration; + + std::cout << "text (English): " << result.text << "\n"; + printf("Number of threads: %d\n", config.model_config.num_threads); + printf("Duration: %.3fs\n", duration); + printf("Elapsed seconds: %.3fs\n", elapsed_seconds); + printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds, + duration, rtf); + + // now output text in German + config.model_config.canary.tgt_lang = "de"; + recognizer.SetConfig(config); + stream = recognizer.CreateStream(); + stream.AcceptWaveform(wave.sample_rate, wave.samples.data(), + wave.samples.size()); + + recognizer.Decode(&stream); + + result = recognizer.GetResult(&stream); + std::cout << "text (German): " << result.text << "\n"; + + return 0; +} diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets index c165d416..d2ef9259 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets @@ -7,6 +7,7 @@ export { Samples, OfflineStream, FeatureConfig, HomophoneReplacerConfig, + OfflineCanaryModelConfig, OfflineDolphinModelConfig, OfflineTransducerModelConfig, OfflineParaformerModelConfig, diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc index fb86615c..7dec4924 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc @@ -93,6 +93,27 @@ static SherpaOnnxOfflineNemoEncDecCtcModelConfig GetOfflineNeMoCtcModelConfig( return c; } +static SherpaOnnxOfflineCanaryModelConfig GetOfflineCanaryModelConfig( + Napi::Object obj) { 
+ SherpaOnnxOfflineCanaryModelConfig c; + memset(&c, 0, sizeof(c)); + c.use_pnc = 1; // Align default with JS default + + if (!obj.Has("canary") || !obj.Get("canary").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("canary").As(); + + SHERPA_ONNX_ASSIGN_ATTR_STR(encoder, encoder); + SHERPA_ONNX_ASSIGN_ATTR_STR(decoder, decoder); + SHERPA_ONNX_ASSIGN_ATTR_STR(src_lang, srcLang); + SHERPA_ONNX_ASSIGN_ATTR_STR(tgt_lang, tgtLang); + SHERPA_ONNX_ASSIGN_ATTR_INT32(use_pnc, usePnc); + + return c; +} + static SherpaOnnxOfflineWhisperModelConfig GetOfflineWhisperModelConfig( Napi::Object obj) { SherpaOnnxOfflineWhisperModelConfig c; @@ -203,6 +224,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); c.dolphin = GetOfflineDolphinModelConfig(o); c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o); + c.canary = GetOfflineCanaryModelConfig(o); SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); @@ -241,6 +263,78 @@ static SherpaOnnxOfflineLMConfig GetOfflineLMConfig(Napi::Object obj) { return c; } +static SherpaOnnxOfflineRecognizerConfig ParseConfig(Napi::Object o) { + SherpaOnnxOfflineRecognizerConfig c; + memset(&c, 0, sizeof(c)); + c.feat_config = GetFeatureConfig(o); + c.model_config = GetOfflineModelConfig(o); + c.lm_config = GetOfflineLMConfig(o); + c.hr = GetHomophoneReplacerConfig(o); + + SHERPA_ONNX_ASSIGN_ATTR_STR(decoding_method, decodingMethod); + SHERPA_ONNX_ASSIGN_ATTR_INT32(max_active_paths, maxActivePaths); + SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(blank_penalty, blankPenalty); + + return c; +} + +static void FreeConfig(const SherpaOnnxOfflineRecognizerConfig &c) { + 
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.joiner); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.model); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.language); + SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.task); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.tdnn.model); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.model); + SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.language); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.preprocessor); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.uncached_decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.cached_decoder); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); + SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.encoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.decoder); + SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.src_lang); + SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.tgt_lang); + + SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); + SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); + SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type); + SHERPA_ONNX_DELETE_C_STR(c.model_config.modeling_unit); + SHERPA_ONNX_DELETE_C_STR(c.model_config.bpe_vocab); + SHERPA_ONNX_DELETE_C_STR(c.model_config.telespeech_ctc); + + SHERPA_ONNX_DELETE_C_STR(c.lm_config.model); + + SHERPA_ONNX_DELETE_C_STR(c.decoding_method); + SHERPA_ONNX_DELETE_C_STR(c.hotwords_file); + 
SHERPA_ONNX_DELETE_C_STR(c.rule_fsts); + SHERPA_ONNX_DELETE_C_STR(c.rule_fars); + SHERPA_ONNX_DELETE_C_STR(c.hr.dict_dir); + SHERPA_ONNX_DELETE_C_STR(c.hr.lexicon); + SHERPA_ONNX_DELETE_C_STR(c.hr.rule_fsts); +} + static Napi::External CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { Napi::Env env = info.Env(); @@ -274,20 +368,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { Napi::Object o = info[0].As(); - SherpaOnnxOfflineRecognizerConfig c; - memset(&c, 0, sizeof(c)); - c.feat_config = GetFeatureConfig(o); - c.model_config = GetOfflineModelConfig(o); - c.lm_config = GetOfflineLMConfig(o); - c.hr = GetHomophoneReplacerConfig(o); - - SHERPA_ONNX_ASSIGN_ATTR_STR(decoding_method, decodingMethod); - SHERPA_ONNX_ASSIGN_ATTR_INT32(max_active_paths, maxActivePaths); - SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile); - SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore); - SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); - SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); - SHERPA_ONNX_ASSIGN_ATTR_FLOAT(blank_penalty, blankPenalty); + SherpaOnnxOfflineRecognizerConfig c = ParseConfig(o); #if __OHOS__ std::unique_ptr(); + SherpaOnnxOfflineRecognizerConfig c = ParseConfig(o); + + const SherpaOnnxOfflineRecognizer *recognizer = + info[0].As>().Data(); + + SherpaOnnxOfflineRecognizerSetConfig(recognizer, &c); + + FreeConfig(c); +} + static void DecodeOfflineStreamWrapper(const Napi::CallbackInfo &info) { Napi::Env env = info.Env(); if (info.Length() != 2) { @@ -548,6 +622,9 @@ void InitNonStreamingAsr(Napi::Env env, Napi::Object exports) { exports.Set(Napi::String::New(env, "decodeOfflineStream"), Napi::Function::New(env, DecodeOfflineStreamWrapper)); + exports.Set(Napi::String::New(env, "offlineRecognizerSetConfig"), + Napi::Function::New(env, OfflineRecognizerSetConfigWrapper)); + exports.Set(Napi::String::New(env, "getOfflineStreamResultAsJson"), Napi::Function::New(env, GetOfflineStreamResultAsJsonWrapper)); } diff 
--git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts index a51b9ed2..2c54471d 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/types/libsherpa_onnx/Index.d.ts @@ -22,6 +22,7 @@ export const voiceActivityDetectorFlush: (handle: object) => void; export const createOfflineRecognizer: (config: object, mgr?: object) => object; export const createOfflineStream: (handle: object) => object; +export const offlineRecognizerSetConfig: (handle: object, config: object) => void; export const acceptWaveformOffline: (handle: object, audio: object) => void; export const decodeOfflineStream: (handle: object, streamHandle: object) => void; export const getOfflineStreamResultAsJson: (streamHandle: object) => string; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets index b437a163..d145c64b 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets @@ -4,6 +4,7 @@ import { createOfflineStream, decodeOfflineStream, getOfflineStreamResultAsJson, + offlineRecognizerSetConfig, } from 'libsherpa_onnx.so'; export interface Samples { @@ -67,6 +68,14 @@ export class OfflineWhisperModelConfig { public tailPaddings: number = -1; } +export class OfflineCanaryModelConfig { + public encoder: string = ''; + public decoder: string = ''; + public srcLang: string = ''; + public tgtLang: string = ''; + public usePnc: number = 1; +} + export class OfflineTdnnModelConfig { public model: string = ''; } @@ -102,6 +111,7 @@ export class OfflineModelConfig { public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); public 
dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig(); + public canary: OfflineCanaryModelConfig = new OfflineCanaryModelConfig(); } export class OfflineLMConfig { @@ -151,6 +161,10 @@ export class OfflineRecognizer { this.config = config } + setConfig(config: OfflineRecognizerConfig) { + offlineRecognizerSetConfig(this.handle, config); + } + createStream(): OfflineStream { const handle: object = createOfflineStream(this.handle); return new OfflineStream(handle); diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index b4874a39..6ccdb1c3 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -123,6 +123,7 @@ The following tables list the examples in this folder. |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| +|[./test_asr_non_streaming_nemo_canary.js](./test_asr_non_streaming_nemo_canary.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [Canary](https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html#sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8-english-spanish-german-french) model| |[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search| 
|[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphinhttps://github.com/DataoceanAI/Dolphin]) CTC model with greedy search| @@ -389,6 +390,16 @@ npm install naudiodon2 node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js ``` +### Non-streaming speech recognition with NeMo Canary models + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + +node ./test_asr_non_streaming_nemo_canary.js +``` + ### Non-streaming speech recognition with NeMo CTC models ```bash diff --git a/nodejs-addon-examples/test_asr_non_streaming_nemo_canary.js b/nodejs-addon-examples/test_asr_non_streaming_nemo_canary.js new file mode 100644 index 00000000..05668fba --- /dev/null +++ b/nodejs-addon-examples/test_asr_non_streaming_nemo_canary.js @@ -0,0 +1,62 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download test files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'canary': { + 'encoder': + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx', + 'decoder': + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx', + 'srcLang': 'en', + 'tgtLang': 'en', + 
'usePnc': 1, + }, + 'tokens': + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 0, + } +}; + +const waveFilename = + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav'; + +const recognizer = new sherpa_onnx.OfflineRecognizer(config); +console.log('Started') +let start = Date.now(); +let stream = recognizer.createStream(); +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + +recognizer.decode(stream); +let result = recognizer.getResult(stream) +let stop = Date.now(); +console.log('Done') + +const elapsed_seconds = (stop - start) / 1000; +const duration = wave.samples.length / wave.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) +console.log(waveFilename) +console.log('result (English)\n', result) + +stream = recognizer.createStream(); +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); +recognizer.config.modelConfig.canary.tgtLang = 'de'; +recognizer.setConfig(recognizer.config); + +recognizer.decode(stream); +result = recognizer.getResult(stream) +console.log('result (German)\n', result) diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 02df62f0..6ab7bfb7 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -63,7 +63,7 @@ for text-to-speech. 
You can use the following command to run it: ```bash -wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 tar xf kokoro-en-v0_19.tar.bz2 rm kokoro-en-v0_19.tar.bz2 @@ -154,6 +154,22 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 node ./test-offline-dolphin-ctc.js ``` +## ./test-offline-nemo-canary.js + +[./test-offline-nemo-canary.js](./test-offline-nemo-canary.js) demonstrates +how to decode a file with a NeMo Canary model. In the code we use +[sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8](https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html#sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8-english-spanish-german-french). + +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 +rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2 + +node ./test-offline-nemo-canary.js +``` + ## ./test-offline-zipformer-ctc.js [./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates diff --git a/nodejs-examples/test-offline-nemo-canary.js b/nodejs-examples/test-offline-nemo-canary.js new file mode 100644 index 00000000..0af2167d --- /dev/null +++ b/nodejs-examples/test-offline-nemo-canary.js @@ -0,0 +1,56 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let config = { + modelConfig: { + canary: { + encoder: + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx', + decoder: + 
'./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx', + srcLang: 'en', + tgtLang: 'en', + usePnc: 1, + }, + debug: 0, + tokens: + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt', + } + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + +const recognizer = createOfflineRecognizer(); +let stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); + +recognizer.decode(stream); +let text = recognizer.getResult(stream).text; +console.log(`text in English: ${text}`); + +stream.free(); + +// now output German text +recognizer.config.modelConfig.canary.tgtLang = 'de'; +recognizer.setConfig(recognizer.config); + +stream = recognizer.createStream(); +stream.acceptWaveform(wave.sampleRate, wave.samples); +recognizer.decode(stream); +text = recognizer.getResult(stream).text; + +console.log(`text in German: ${text}`); + +stream.free(); +recognizer.free(); diff --git a/scripts/node-addon-api/lib/non-streaming-asr.js b/scripts/node-addon-api/lib/non-streaming-asr.js index c7721906..1b9c40f1 100644 --- a/scripts/node-addon-api/lib/non-streaming-asr.js +++ b/scripts/node-addon-api/lib/non-streaming-asr.js @@ -24,6 +24,10 @@ class OfflineRecognizer { return new OfflineStream(handle); } + setConfig(config) { + addon.offlineRecognizerSetConfig(this.handle, config); + } + decode(stream) { addon.decodeOfflineStream(this.handle, stream.handle); } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 06c3b061..31a6d24e 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -487,6 +487,21 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( recognizer_config.model_config.zipformer_ctc.model = SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, ""); + 
recognizer_config.model_config.canary.encoder = + SHERPA_ONNX_OR(config->model_config.canary.encoder, ""); + + recognizer_config.model_config.canary.decoder = + SHERPA_ONNX_OR(config->model_config.canary.decoder, ""); + + recognizer_config.model_config.canary.src_lang = + SHERPA_ONNX_OR(config->model_config.canary.src_lang, ""); + + recognizer_config.model_config.canary.tgt_lang = + SHERPA_ONNX_OR(config->model_config.canary.tgt_lang, ""); + + recognizer_config.model_config.canary.use_pnc = + config->model_config.canary.use_pnc; + recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); recognizer_config.lm_config.scale = diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 5c96fe70..2303ee46 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -420,6 +420,14 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig { int32_t tail_paddings; } SherpaOnnxOfflineWhisperModelConfig; +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineCanaryModelConfig { + const char *encoder; + const char *decoder; + const char *src_lang; + const char *tgt_lang; + int32_t use_pnc; +} SherpaOnnxOfflineCanaryModelConfig; + SHERPA_ONNX_API typedef struct SherpaOnnxOfflineFireRedAsrModelConfig { const char *encoder; const char *decoder; @@ -479,6 +487,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr; SherpaOnnxOfflineDolphinModelConfig dolphin; SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc; + SherpaOnnxOfflineCanaryModelConfig canary; } SherpaOnnxOfflineModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc index ec4e9e5d..5f6baf21 100644 --- a/sherpa-onnx/c-api/cxx-api.cc +++ b/sherpa-onnx/c-api/cxx-api.cc @@ -193,7 +193,7 @@ void OfflineStream::AcceptWaveform(int32_t sample_rate, const float *samples, SherpaOnnxAcceptWaveformOffline(p_, 
sample_rate, samples, n); } -OfflineRecognizer OfflineRecognizer::Create( +static SherpaOnnxOfflineRecognizerConfig Convert( const OfflineRecognizerConfig &config) { struct SherpaOnnxOfflineRecognizerConfig c; memset(&c, 0, sizeof(c)); @@ -256,6 +256,12 @@ OfflineRecognizer OfflineRecognizer::Create( c.model_config.zipformer_ctc.model = config.model_config.zipformer_ctc.model.c_str(); + c.model_config.canary.encoder = config.model_config.canary.encoder.c_str(); + c.model_config.canary.decoder = config.model_config.canary.decoder.c_str(); + c.model_config.canary.src_lang = config.model_config.canary.src_lang.c_str(); + c.model_config.canary.tgt_lang = config.model_config.canary.tgt_lang.c_str(); + c.model_config.canary.use_pnc = config.model_config.canary.use_pnc; + c.lm_config.model = config.lm_config.model.c_str(); c.lm_config.scale = config.lm_config.scale; @@ -273,10 +279,22 @@ OfflineRecognizer OfflineRecognizer::Create( c.hr.lexicon = config.hr.lexicon.c_str(); c.hr.rule_fsts = config.hr.rule_fsts.c_str(); + return c; +} + +OfflineRecognizer OfflineRecognizer::Create( + const OfflineRecognizerConfig &config) { + auto c = Convert(config); + auto p = SherpaOnnxCreateOfflineRecognizer(&c); return OfflineRecognizer(p); } +void OfflineRecognizer::SetConfig(const OfflineRecognizerConfig &config) const { + auto c = Convert(config); + SherpaOnnxOfflineRecognizerSetConfig(p_, &c); +} + OfflineRecognizer::OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p) : MoveOnly(p) {} diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h index ea1c5961..c8b0f1ca 100644 --- a/sherpa-onnx/c-api/cxx-api.h +++ b/sherpa-onnx/c-api/cxx-api.h @@ -223,6 +223,14 @@ struct SHERPA_ONNX_API OfflineWhisperModelConfig { int32_t tail_paddings = -1; }; +struct SHERPA_ONNX_API OfflineCanaryModelConfig { + std::string encoder; + std::string decoder; + std::string src_lang; + std::string tgt_lang; + bool use_pnc = true; +}; + struct SHERPA_ONNX_API OfflineFireRedAsrModelConfig { 
std::string encoder; std::string decoder; @@ -273,6 +281,7 @@ struct SHERPA_ONNX_API OfflineModelConfig { OfflineFireRedAsrModelConfig fire_red_asr; OfflineDolphinModelConfig dolphin; OfflineZipformerCtcModelConfig zipformer_ctc; + OfflineCanaryModelConfig canary; }; struct SHERPA_ONNX_API OfflineLMConfig { @@ -335,6 +344,8 @@ class SHERPA_ONNX_API OfflineRecognizer OfflineRecognizerResult GetResult(const OfflineStream *s) const; + void SetConfig(const OfflineRecognizerConfig &config) const; + private: explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p); }; diff --git a/sherpa-onnx/csrc/sherpa-onnx.cc b/sherpa-onnx/csrc/sherpa-onnx.cc index 3af4f412..6e572cf7 100644 --- a/sherpa-onnx/csrc/sherpa-onnx.cc +++ b/sherpa-onnx/csrc/sherpa-onnx.cc @@ -45,7 +45,7 @@ Usage: ./bin/sherpa-onnx \ --debug=1 \ - --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \ + --zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \ --tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \ ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav \ ./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000001.wav \ diff --git a/wasm/asr/CMakeLists.txt b/wasm/asr/CMakeLists.txt index abb1d684..d1a68193 100644 --- a/wasm/asr/CMakeLists.txt +++ b/wasm/asr/CMakeLists.txt @@ -12,7 +12,6 @@ set(exported_functions SherpaOnnxCreateOnlineRecognizer SherpaOnnxCreateOnlineStream SherpaOnnxDecodeOnlineStream - SherpaOnnxDestroyOfflineStreamResultJson SherpaOnnxDestroyOnlineRecognizer SherpaOnnxDestroyOnlineRecognizerResult SherpaOnnxDestroyOnlineStream diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 78206ec0..54197d86 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -59,6 +59,10 @@ function 
freeConfig(config, Module) { freeConfig(config.senseVoice, Module) } + if ('canary' in config) { + freeConfig(config.canary, Module) + } + if ('lm' in config) { freeConfig(config.lm, Module) } @@ -246,7 +250,7 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider offset += 4; - Module.setValue(ptr + offset, config.debug || 0, 'i32'); + Module.setValue(ptr + offset, config.debug ?? 0, 'i32'); offset += 4; Module.setValue( @@ -692,6 +696,51 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { } } +function initSherpaOnnxOfflineCanaryModelConfig(config, Module) { + const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; + const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; + const srcLangLen = Module.lengthBytesUTF8(config.srcLang || '') + 1; + const tgtLangLen = Module.lengthBytesUTF8(config.tgtLang || '') + 1; + + const n = encoderLen + decoderLen + srcLangLen + tgtLangLen; + const buffer = Module._malloc(n); + + const len = 5 * 4; // 4 pointers + 1 int32 + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); + offset += encoderLen; + + Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); + offset += decoderLen; + + Module.stringToUTF8(config.srcLang || '', buffer + offset, srcLangLen); + offset += srcLangLen; + + Module.stringToUTF8(config.tgtLang || '', buffer + offset, tgtLangLen); + offset += tgtLangLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += encoderLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += decoderLen; + + Module.setValue(ptr + 8, buffer + offset, 'i8*'); + offset += srcLangLen; + + Module.setValue(ptr + 12, buffer + offset, 'i8*'); + offset += tgtLangLen; + + Module.setValue(ptr + 16, config.usePnc ?? 
1, 'i32'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) { const preprocessorLen = Module.lengthBytesUTF8(config.preprocessor || '') + 1; const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; @@ -811,7 +860,7 @@ function initSherpaOnnxOfflineSenseVoiceModelConfig(config, Module) { Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += languageLen; - Module.setValue(ptr + 8, config.useInverseTextNormalization || 0, 'i32'); + Module.setValue(ptr + 8, config.useInverseTextNormalization ?? 0, 'i32'); return { buffer: buffer, ptr: ptr, len: len, @@ -907,6 +956,16 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { }; } + if (!('canary' in config)) { + config.canary = { + encoder: '', + decoder: '', + srcLang: '', + tgtLang: '', + usePnc: 1, + }; + } + const transducer = initSherpaOnnxOfflineTransducerModelConfig(config.transducer, Module); @@ -936,9 +995,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const zipformerCtc = initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module); + const canary = initSherpaOnnxOfflineCanaryModelConfig(config.canary, Module); + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + - dolphin.len + zipformerCtc.len; + dolphin.len + zipformerCtc.len + canary.len; const ptr = Module._malloc(len); @@ -1000,7 +1061,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug || 0, 'i32'); + Module.setValue(ptr + offset, config.debug ?? 
0, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider @@ -1043,11 +1104,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset); offset += zipformerCtc.len; + Module._CopyHeap(canary.ptr, canary.len, ptr + offset); + offset += canary.len; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, - dolphin: dolphin, zipformerCtc: zipformerCtc + dolphin: dolphin, zipformerCtc: zipformerCtc, canary: canary, } } @@ -1189,6 +1253,13 @@ class OfflineRecognizer { this.Module = Module; } + setConfig(configObj) { + const config = + initSherpaOnnxOfflineRecognizerConfig(configObj, this.Module); + this.Module._SherpaOnnxOfflineRecognizerSetConfig(this.handle, config.ptr); + freeConfig(config, this.Module); + } + free() { this.Module._SherpaOnnxDestroyOfflineRecognizer(this.handle); this.handle = 0 diff --git a/wasm/nodejs/CMakeLists.txt b/wasm/nodejs/CMakeLists.txt index f81e79b8..f070d397 100644 --- a/wasm/nodejs/CMakeLists.txt +++ b/wasm/nodejs/CMakeLists.txt @@ -41,6 +41,7 @@ set(exported_functions SherpaOnnxDestroyOfflineStreamResultJson SherpaOnnxGetOfflineStreamResult SherpaOnnxGetOfflineStreamResultAsJson + SherpaOnnxOfflineRecognizerSetConfig # online kws SherpaOnnxCreateKeywordSpotter SherpaOnnxCreateKeywordStream diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index d93c9812..df969c45 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -21,6 +21,7 @@ static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, ""); 
static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineCanaryModelConfig) == 5 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineModelConfig) == @@ -33,7 +34,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineMoonshineModelConfig) + sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + sizeof(SherpaOnnxOfflineDolphinModelConfig) + - sizeof(SherpaOnnxOfflineZipformerCtcModelConfig), + sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) + + sizeof(SherpaOnnxOfflineCanaryModelConfig), ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); @@ -80,6 +82,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { auto fire_red_asr = &model_config->fire_red_asr; auto dolphin = &model_config->dolphin; auto zipformer_ctc = &model_config->zipformer_ctc; + auto canary = &model_config->canary; fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "encoder: %s\n", transducer->encoder); @@ -123,6 +126,13 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "----------offline zipformer ctc model config----------\n"); fprintf(stdout, "model: %s\n", zipformer_ctc->model); + fprintf(stdout, "----------offline NeMo Canary model config----------\n"); + fprintf(stdout, "encoder: %s\n", canary->encoder); + fprintf(stdout, "decoder: %s\n", canary->decoder); + fprintf(stdout, "src_lang: %s\n", canary->src_lang); + fprintf(stdout, "tgt_lang: %s\n", canary->tgt_lang); + fprintf(stdout, "use_pnc: %d\n", canary->use_pnc); + fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "provider: %s\n", model_config->provider);