Add C/CXX/JavaScript API for NeMo Canary models (#2357)
This PR introduces support for NeMo Canary models across C, C++, and JavaScript APIs by adding new Canary configuration structures, updating bindings, extending examples, and enhancing CI workflows. - Add OfflineCanaryModelConfig to all language bindings (C, C++, JS, ETS). - Implement SetConfig methods and NAPI wrappers for updating recognizer config at runtime. - Update examples and CI scripts to demonstrate and test NeMo Canary model usage.
This commit is contained in:
10
.github/scripts/test-nodejs-addon-npm.sh
vendored
10
.github/scripts/test-nodejs-addon-npm.sh
vendored
@@ -10,6 +10,16 @@ arch=$(node -p "require('os').arch()")
|
||||
platform=$(node -p "require('os').platform()")
|
||||
node_version=$(node -p "process.versions.node.split('.')[0]")
|
||||
|
||||
echo "----------non-streaming ASR NeMo Canary----------"
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
node ./test_asr_non_streaming_nemo_canary.js
|
||||
|
||||
rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8
|
||||
|
||||
echo "----------non-streaming ASR Zipformer CTC----------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
|
||||
8
.github/scripts/test-nodejs-npm.sh
vendored
8
.github/scripts/test-nodejs-npm.sh
vendored
@@ -9,6 +9,14 @@ git status
|
||||
ls -lh
|
||||
ls -lh node_modules
|
||||
|
||||
# asr with offline nemo canary
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
node ./test-offline-nemo-canary.js
|
||||
rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8
|
||||
|
||||
# asr with offline zipformer ctc
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
|
||||
10
.github/scripts/test-online-ctc.sh
vendored
10
.github/scripts/test-online-ctc.sh
vendored
@@ -77,16 +77,6 @@ time $EXE \
|
||||
$repo/test_wavs/DEV_T0000000001.wav \
|
||||
$repo/test_wavs/DEV_T0000000002.wav
|
||||
|
||||
log "test int8"
|
||||
|
||||
time $EXE \
|
||||
--debug=1 \
|
||||
--zipformer2-ctc-model=$repo/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
|
||||
--tokens=$repo/tokens.txt \
|
||||
$repo/test_wavs/DEV_T0000000000.wav \
|
||||
$repo/test_wavs/DEV_T0000000001.wav \
|
||||
$repo/test_wavs/DEV_T0000000002.wav
|
||||
|
||||
rm -rf $repo
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
|
||||
30
.github/workflows/c-api.yaml
vendored
30
.github/workflows/c-api.yaml
vendored
@@ -127,6 +127,36 @@ jobs:
|
||||
rm -rf dict lexicon.txt test-hr.wav replace.fst
|
||||
rm -v $name
|
||||
|
||||
- name: Test NeMo Canary
|
||||
shell: bash
|
||||
run: |
|
||||
name=nemo-canary-c-api
|
||||
gcc -o $name ./c-api-examples/$name.c \
|
||||
-I ./build/install/include \
|
||||
-L ./build/install/lib/ \
|
||||
-l sherpa-onnx-c-api \
|
||||
-l onnxruntime
|
||||
|
||||
ls -lh $name
|
||||
|
||||
if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
|
||||
ldd ./$name
|
||||
echo "----"
|
||||
readelf -d ./$name
|
||||
fi
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
|
||||
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
|
||||
|
||||
./$name
|
||||
|
||||
rm $name
|
||||
rm -rf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8
|
||||
|
||||
- name: Test Dolphin CTC
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
34
.github/workflows/cxx-api.yaml
vendored
34
.github/workflows/cxx-api.yaml
vendored
@@ -87,6 +87,40 @@ jobs:
|
||||
otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
|
||||
fi
|
||||
|
||||
- name: Test NeMo Canary
|
||||
shell: bash
|
||||
run: |
|
||||
name=nemo-canary-cxx-api
|
||||
g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
|
||||
-I ./build/install/include \
|
||||
-L ./build/install/lib/ \
|
||||
-l sherpa-onnx-cxx-api \
|
||||
-l sherpa-onnx-c-api \
|
||||
-l onnxruntime
|
||||
|
||||
ls -lh $name
|
||||
|
||||
if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
|
||||
ldd ./$name
|
||||
echo "----"
|
||||
readelf -d ./$name
|
||||
fi
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
ls -lh sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8
|
||||
echo "---"
|
||||
|
||||
export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
|
||||
export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
|
||||
|
||||
./$name
|
||||
|
||||
rm -rf sherpa-onnx-nemo-canary-*
|
||||
rm -v ./$name
|
||||
|
||||
- name: Test streaming zipformer with Homophone replacer
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
@@ -53,6 +53,9 @@ target_link_libraries(whisper-c-api sherpa-onnx-c-api)
|
||||
add_executable(fire-red-asr-c-api fire-red-asr-c-api.c)
|
||||
target_link_libraries(fire-red-asr-c-api sherpa-onnx-c-api)
|
||||
|
||||
add_executable(nemo-canary-c-api nemo-canary-c-api.c)
|
||||
target_link_libraries(nemo-canary-c-api sherpa-onnx-c-api)
|
||||
|
||||
add_executable(sense-voice-c-api sense-voice-c-api.c)
|
||||
target_link_libraries(sense-voice-c-api sherpa-onnx-c-api)
|
||||
|
||||
|
||||
115
c-api-examples/nemo-canary-c-api.c
Normal file
115
c-api-examples/nemo-canary-c-api.c
Normal file
@@ -0,0 +1,115 @@
|
||||
// c-api-examples/nemo-canary-c-api.c
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
// We assume you have pre-downloaded the NeMo Canary model
|
||||
// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
// An example is given below:
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
// tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
// rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
//
|
||||
// clang-format on
|
||||
//
|
||||
// see https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html
|
||||
// for details
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
int32_t main() {
|
||||
const char *wav_filename =
|
||||
"./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav";
|
||||
const char *encoder_filename =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx";
|
||||
const char *decoder_filename =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx";
|
||||
const char *tokens_filename =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt";
|
||||
const char *provider = "cpu";
|
||||
|
||||
const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
|
||||
if (wave == NULL) {
|
||||
fprintf(stderr, "Failed to read %s\n", wav_filename);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Offline model config
|
||||
SherpaOnnxOfflineModelConfig offline_model_config;
|
||||
memset(&offline_model_config, 0, sizeof(offline_model_config));
|
||||
|
||||
// set debug to 1 to view more logs
|
||||
offline_model_config.debug = 0;
|
||||
|
||||
offline_model_config.num_threads = 1;
|
||||
offline_model_config.provider = provider;
|
||||
offline_model_config.tokens = tokens_filename;
|
||||
offline_model_config.canary.encoder = encoder_filename;
|
||||
offline_model_config.canary.decoder = decoder_filename;
|
||||
|
||||
// so it output punctuations and cases
|
||||
offline_model_config.canary.use_pnc = 1;
|
||||
|
||||
offline_model_config.canary.src_lang = "de";
|
||||
|
||||
// since there is a German audio, you can set tgt_lang to en or de
|
||||
offline_model_config.canary.tgt_lang = "en";
|
||||
|
||||
// Recognizer config
|
||||
SherpaOnnxOfflineRecognizerConfig recognizer_config;
|
||||
memset(&recognizer_config, 0, sizeof(recognizer_config));
|
||||
recognizer_config.decoding_method = "greedy_search";
|
||||
recognizer_config.model_config = offline_model_config;
|
||||
|
||||
const SherpaOnnxOfflineRecognizer *recognizer =
|
||||
SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
|
||||
|
||||
if (recognizer == NULL) {
|
||||
fprintf(stderr, "Please check your config!\n");
|
||||
|
||||
SherpaOnnxFreeWave(wave);
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
const SherpaOnnxOfflineStream *stream =
|
||||
SherpaOnnxCreateOfflineStream(recognizer);
|
||||
|
||||
SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
|
||||
wave->num_samples);
|
||||
SherpaOnnxDecodeOfflineStream(recognizer, stream);
|
||||
const SherpaOnnxOfflineRecognizerResult *result =
|
||||
SherpaOnnxGetOfflineStreamResult(stream);
|
||||
|
||||
fprintf(stderr, "Decoded text (English): %s\n", result->text);
|
||||
|
||||
SherpaOnnxDestroyOfflineRecognizerResult(result);
|
||||
SherpaOnnxDestroyOfflineStream(stream);
|
||||
|
||||
// now output German text
|
||||
recognizer_config.model_config.canary.tgt_lang = "de";
|
||||
SherpaOnnxOfflineRecognizerSetConfig(recognizer, &recognizer_config);
|
||||
|
||||
stream = SherpaOnnxCreateOfflineStream(recognizer);
|
||||
|
||||
SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
|
||||
wave->num_samples);
|
||||
SherpaOnnxDecodeOfflineStream(recognizer, stream);
|
||||
result = SherpaOnnxGetOfflineStreamResult(stream);
|
||||
|
||||
fprintf(stderr, "Decoded text (German): %s\n", result->text);
|
||||
|
||||
SherpaOnnxDestroyOfflineRecognizerResult(result);
|
||||
SherpaOnnxDestroyOfflineStream(stream);
|
||||
|
||||
SherpaOnnxDestroyOfflineRecognizer(recognizer);
|
||||
SherpaOnnxFreeWave(wave);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -54,7 +54,7 @@ int32_t main() {
|
||||
"DEV_T0000000000.wav";
|
||||
const char *model_filename =
|
||||
"sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/"
|
||||
"ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx";
|
||||
"ctc-epoch-20-avg-1-chunk-16-left-128.onnx";
|
||||
const char *tokens_filename =
|
||||
"sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt";
|
||||
const char *provider = "cpu";
|
||||
|
||||
@@ -27,6 +27,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
|
||||
add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
|
||||
target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
|
||||
|
||||
add_executable(nemo-canary-cxx-api ./nemo-canary-cxx-api.cc)
|
||||
target_link_libraries(nemo-canary-cxx-api sherpa-onnx-cxx-api)
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_PORTAUDIO)
|
||||
add_executable(sense-voice-simulate-streaming-microphone-cxx-api
|
||||
./sense-voice-simulate-streaming-microphone-cxx-api.cc
|
||||
|
||||
101
cxx-api-examples/nemo-canary-cxx-api.cc
Normal file
101
cxx-api-examples/nemo-canary-cxx-api.cc
Normal file
@@ -0,0 +1,101 @@
|
||||
// cxx-api-examples/nemo-canary-cxx-api.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
//
|
||||
// This file demonstrates how to use NeMo Canary models with
|
||||
// sherpa-onnx's C++ API.
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
// tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
// rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
//
|
||||
// clang-format on
|
||||
//
|
||||
// see https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html
|
||||
// for details
|
||||
|
||||
#include <chrono> // NOLINT
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/c-api/cxx-api.h"
|
||||
|
||||
int32_t main() {
|
||||
using namespace sherpa_onnx::cxx; // NOLINT
|
||||
OfflineRecognizerConfig config;
|
||||
|
||||
config.model_config.canary.encoder =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx";
|
||||
config.model_config.canary.decoder =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx";
|
||||
|
||||
// our input audio is German, so we set src_lang to "de"
|
||||
config.model_config.canary.src_lang = "de";
|
||||
|
||||
// we can set tgt_lang either to de or en in this specific case
|
||||
config.model_config.canary.tgt_lang = "en";
|
||||
config.model_config.tokens =
|
||||
"sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt";
|
||||
|
||||
config.model_config.num_threads = 1;
|
||||
|
||||
std::cout << "Loading model\n";
|
||||
OfflineRecognizer recognizer = OfflineRecognizer::Create(config);
|
||||
if (!recognizer.Get()) {
|
||||
std::cerr << "Please check your config\n";
|
||||
return -1;
|
||||
}
|
||||
std::cout << "Loading model done\n";
|
||||
|
||||
std::string wave_filename =
|
||||
"./sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/de.wav";
|
||||
|
||||
Wave wave = ReadWave(wave_filename);
|
||||
if (wave.samples.empty()) {
|
||||
std::cerr << "Failed to read: '" << wave_filename << "'\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::cout << "Start recognition\n";
|
||||
const auto begin = std::chrono::steady_clock::now();
|
||||
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
|
||||
wave.samples.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
OfflineRecognizerResult result = recognizer.GetResult(&stream);
|
||||
|
||||
const auto end = std::chrono::steady_clock::now();
|
||||
const float elapsed_seconds =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
|
||||
.count() /
|
||||
1000.;
|
||||
float duration = wave.samples.size() / static_cast<float>(wave.sample_rate);
|
||||
float rtf = elapsed_seconds / duration;
|
||||
|
||||
std::cout << "text (English): " << result.text << "\n";
|
||||
printf("Number of threads: %d\n", config.model_config.num_threads);
|
||||
printf("Duration: %.3fs\n", duration);
|
||||
printf("Elapsed seconds: %.3fs\n", elapsed_seconds);
|
||||
printf("(Real time factor) RTF = %.3f / %.3f = %.3f\n", elapsed_seconds,
|
||||
duration, rtf);
|
||||
|
||||
// now output text in German
|
||||
config.model_config.canary.tgt_lang = "de";
|
||||
recognizer.SetConfig(config);
|
||||
stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(wave.sample_rate, wave.samples.data(),
|
||||
wave.samples.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
result = recognizer.GetResult(&stream);
|
||||
std::cout << "text (German): " << result.text << "\n";
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -7,6 +7,7 @@ export { Samples,
|
||||
OfflineStream,
|
||||
FeatureConfig,
|
||||
HomophoneReplacerConfig,
|
||||
OfflineCanaryModelConfig,
|
||||
OfflineDolphinModelConfig,
|
||||
OfflineTransducerModelConfig,
|
||||
OfflineParaformerModelConfig,
|
||||
|
||||
@@ -93,6 +93,27 @@ static SherpaOnnxOfflineNemoEncDecCtcModelConfig GetOfflineNeMoCtcModelConfig(
|
||||
return c;
|
||||
}
|
||||
|
||||
// Builds the C-level Canary model config from the optional `canary` member of
// the JS object `obj`.
//
// When `obj` has no `canary` member, or it is not an object, the returned
// struct is all zeros except use_pnc, which is 1.
//
// NOTE: the locals must stay named `c` and `o`. The SHERPA_ONNX_ASSIGN_ATTR_*
// macros are defined elsewhere and receive only field names, so they
// presumably read from `o` and write into `c` -- confirm against the macro
// definitions before renaming.
static SherpaOnnxOfflineCanaryModelConfig GetOfflineCanaryModelConfig(
    Napi::Object obj) {
  SherpaOnnxOfflineCanaryModelConfig c;
  memset(&c, 0, sizeof(c));
  c.use_pnc = 1;  // Align default with JS default

  const bool has_canary = obj.Has("canary") && obj.Get("canary").IsObject();
  if (has_canary) {
    Napi::Object o = obj.Get("canary").As<Napi::Object>();

    SHERPA_ONNX_ASSIGN_ATTR_INT32(use_pnc, usePnc);

    SHERPA_ONNX_ASSIGN_ATTR_STR(src_lang, srcLang);
    SHERPA_ONNX_ASSIGN_ATTR_STR(tgt_lang, tgtLang);

    SHERPA_ONNX_ASSIGN_ATTR_STR(encoder, encoder);
    SHERPA_ONNX_ASSIGN_ATTR_STR(decoder, decoder);
  }

  return c;
}
|
||||
|
||||
static SherpaOnnxOfflineWhisperModelConfig GetOfflineWhisperModelConfig(
|
||||
Napi::Object obj) {
|
||||
SherpaOnnxOfflineWhisperModelConfig c;
|
||||
@@ -203,6 +224,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
|
||||
c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o);
|
||||
c.dolphin = GetOfflineDolphinModelConfig(o);
|
||||
c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o);
|
||||
c.canary = GetOfflineCanaryModelConfig(o);
|
||||
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
|
||||
@@ -241,6 +263,78 @@ static SherpaOnnxOfflineLMConfig GetOfflineLMConfig(Napi::Object obj) {
|
||||
return c;
|
||||
}
|
||||
|
||||
// Converts the JS recognizer config object `o` into a
// SherpaOnnxOfflineRecognizerConfig.
//
// The result should be handed to FreeConfig() once it is no longer needed;
// both call sites in this file do so.
//
// NOTE: the parameter and local must stay named `o` and `c`. The
// SHERPA_ONNX_ASSIGN_ATTR_* macros are defined elsewhere and receive only
// field names, so they presumably read from `o` and write into `c`.
static SherpaOnnxOfflineRecognizerConfig ParseConfig(Napi::Object o) {
  SherpaOnnxOfflineRecognizerConfig c;
  memset(&c, 0, sizeof(c));

  // Top-level attributes of the recognizer config.
  SHERPA_ONNX_ASSIGN_ATTR_STR(decoding_method, decodingMethod);
  SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile);
  SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
  SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);

  SHERPA_ONNX_ASSIGN_ATTR_INT32(max_active_paths, maxActivePaths);

  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore);
  SHERPA_ONNX_ASSIGN_ATTR_FLOAT(blank_penalty, blankPenalty);

  // Nested sub-configs, each parsed by its own helper.
  c.hr = GetHomophoneReplacerConfig(o);
  c.lm_config = GetOfflineLMConfig(o);
  c.model_config = GetOfflineModelConfig(o);
  c.feat_config = GetFeatureConfig(o);

  return c;
}
|
||||
|
||||
static void FreeConfig(const SherpaOnnxOfflineRecognizerConfig &c) {
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.joiner);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.language);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.task);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.tdnn.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.model);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.language);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.preprocessor);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.uncached_decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.cached_decoder);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.src_lang);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.canary.tgt_lang);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.modeling_unit);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.bpe_vocab);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.telespeech_ctc);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.lm_config.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.decoding_method);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hotwords_file);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.rule_fsts);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.rule_fars);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.dict_dir);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.lexicon);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.rule_fsts);
|
||||
}
|
||||
|
||||
static Napi::External<SherpaOnnxOfflineRecognizer>
|
||||
CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
|
||||
Napi::Env env = info.Env();
|
||||
@@ -274,20 +368,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
|
||||
|
||||
Napi::Object o = info[0].As<Napi::Object>();
|
||||
|
||||
SherpaOnnxOfflineRecognizerConfig c;
|
||||
memset(&c, 0, sizeof(c));
|
||||
c.feat_config = GetFeatureConfig(o);
|
||||
c.model_config = GetOfflineModelConfig(o);
|
||||
c.lm_config = GetOfflineLMConfig(o);
|
||||
c.hr = GetHomophoneReplacerConfig(o);
|
||||
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(decoding_method, decodingMethod);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_INT32(max_active_paths, maxActivePaths);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(blank_penalty, blankPenalty);
|
||||
SherpaOnnxOfflineRecognizerConfig c = ParseConfig(o);
|
||||
|
||||
#if __OHOS__
|
||||
std::unique_ptr<NativeResourceManager,
|
||||
@@ -302,51 +383,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
|
||||
SherpaOnnxCreateOfflineRecognizer(&c);
|
||||
#endif
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.transducer.joiner);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.paraformer.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.nemo_ctc.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.language);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.whisper.task);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.tdnn.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.model);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.sense_voice.language);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.preprocessor);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.uncached_decoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.moonshine.cached_decoder);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.encoder);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.model_type);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.modeling_unit);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.bpe_vocab);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.telespeech_ctc);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.lm_config.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.decoding_method);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hotwords_file);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.rule_fsts);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.rule_fars);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.dict_dir);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.lexicon);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.hr.rule_fsts);
|
||||
FreeConfig(c);
|
||||
|
||||
if (!recognizer) {
|
||||
Napi::TypeError::New(env, "Please check your config!")
|
||||
@@ -470,6 +507,43 @@ static void AcceptWaveformOfflineWrapper(const Napi::CallbackInfo &info) {
|
||||
#endif
|
||||
}
|
||||
|
||||
// N-API entry point behind `offlineRecognizerSetConfig(handle, config)`.
//
// Expects exactly two arguments:
//   info[0]  an external wrapping a SherpaOnnxOfflineRecognizer pointer
//   info[1]  a JS object describing the new recognizer config
//
// On a bad argument a JavaScript TypeError is raised and nothing else
// happens. Otherwise the JS config is converted with ParseConfig(), applied
// with SherpaOnnxOfflineRecognizerSetConfig(), and released with FreeConfig().
static void OfflineRecognizerSetConfigWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  // Raises a JS TypeError carrying `message`.
  auto throw_type_error = [&env](const std::string &message) {
    Napi::TypeError::New(env, message).ThrowAsJavaScriptException();
  };

  if (info.Length() != 2) {
    std::ostringstream msg;
    msg << "Expect only 2 arguments. Given: " << info.Length();
    throw_type_error(msg.str());
    return;
  }

  if (!info[0].IsExternal()) {
    throw_type_error("Argument 0 should be an offline recognizer pointer.");
    return;
  }

  if (!info[1].IsObject()) {
    throw_type_error("Expect an object as the second argument");
    return;
  }

  const SherpaOnnxOfflineRecognizer *handle =
      info[0].As<Napi::External<SherpaOnnxOfflineRecognizer>>().Data();

  SherpaOnnxOfflineRecognizerConfig parsed =
      ParseConfig(info[1].As<Napi::Object>());

  SherpaOnnxOfflineRecognizerSetConfig(handle, &parsed);

  // The parsed config is released as soon as the call above returns.
  FreeConfig(parsed);
}
|
||||
|
||||
static void DecodeOfflineStreamWrapper(const Napi::CallbackInfo &info) {
|
||||
Napi::Env env = info.Env();
|
||||
if (info.Length() != 2) {
|
||||
@@ -548,6 +622,9 @@ void InitNonStreamingAsr(Napi::Env env, Napi::Object exports) {
|
||||
exports.Set(Napi::String::New(env, "decodeOfflineStream"),
|
||||
Napi::Function::New(env, DecodeOfflineStreamWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "offlineRecognizerSetConfig"),
|
||||
Napi::Function::New(env, OfflineRecognizerSetConfigWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "getOfflineStreamResultAsJson"),
|
||||
Napi::Function::New(env, GetOfflineStreamResultAsJsonWrapper));
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ export const voiceActivityDetectorFlush: (handle: object) => void;
|
||||
|
||||
export const createOfflineRecognizer: (config: object, mgr?: object) => object;
|
||||
export const createOfflineStream: (handle: object) => object;
|
||||
export const offlineRecognizerSetConfig: (handle: object, config: object) => void;
|
||||
export const acceptWaveformOffline: (handle: object, audio: object) => void;
|
||||
export const decodeOfflineStream: (handle: object, streamHandle: object) => void;
|
||||
export const getOfflineStreamResultAsJson: (streamHandle: object) => string;
|
||||
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
createOfflineStream,
|
||||
decodeOfflineStream,
|
||||
getOfflineStreamResultAsJson,
|
||||
offlineRecognizerSetConfig,
|
||||
} from 'libsherpa_onnx.so';
|
||||
|
||||
export interface Samples {
|
||||
@@ -67,6 +68,14 @@ export class OfflineWhisperModelConfig {
|
||||
public tailPaddings: number = -1;
|
||||
}
|
||||
|
||||
// Configuration for a NeMo Canary offline (non-streaming) model.
// An instance is stored in OfflineModelConfig.canary; the native binding reads
// these fields by name (encoder, decoder, srcLang, tgtLang, usePnc).
export class OfflineCanaryModelConfig {
|
||||
// Path to the encoder model file, e.g. ".../encoder.int8.onnx".
public encoder: string = '';
|
||||
// Path to the decoder model file, e.g. ".../decoder.int8.onnx".
public decoder: string = '';
|
||||
// Language of the input audio; the examples in this change use codes such as 'en' and 'de'.
public srcLang: string = '';
|
||||
// Language of the output text; the examples in this change use codes such as 'en' and 'de'.
public tgtLang: string = '';
|
||||
// Defaults to 1; per the C example, 1 makes the model output punctuation and casing.
public usePnc: number = 1;
|
||||
}
|
||||
|
||||
export class OfflineTdnnModelConfig {
|
||||
public model: string = '';
|
||||
}
|
||||
@@ -102,6 +111,7 @@ export class OfflineModelConfig {
|
||||
public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig();
|
||||
public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig();
|
||||
public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig();
|
||||
public canary: OfflineCanaryModelConfig = new OfflineCanaryModelConfig();
|
||||
}
|
||||
|
||||
export class OfflineLMConfig {
|
||||
@@ -151,6 +161,10 @@ export class OfflineRecognizer {
|
||||
this.config = config
|
||||
}
|
||||
|
||||
// Passes `config` to the native recognizer behind this.handle via
// offlineRecognizerSetConfig.
// NOTE(review): this.config is not reassigned here, so it only reflects the
// new settings when the caller passes (a mutated) this.config -- confirm
// whether that is intended.
setConfig(config: OfflineRecognizerConfig) {
|
||||
offlineRecognizerSetConfig(this.handle, config);
|
||||
}
|
||||
|
||||
createStream(): OfflineStream {
|
||||
const handle: object = createOfflineStream(this.handle);
|
||||
return new OfflineStream(handle);
|
||||
|
||||
@@ -123,6 +123,7 @@ The following tables list the examples in this folder.
|
||||
|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
|
||||
|[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
|
||||
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_nemo_canary.js](./test_asr_non_streaming_nemo_canary.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [Canary](https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html#sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8-english-spanish-german-french) model|
|
||||
|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search|
|
||||
|[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
|
||||
@@ -389,6 +390,16 @@ npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with NeMo Canary models
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
node ./test_asr_non_streaming_nemo_canary.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with NeMo CTC models
|
||||
|
||||
```bash
|
||||
|
||||
62
nodejs-addon-examples/test_asr_non_streaming_nemo_canary.js
Normal file
62
nodejs-addon-examples/test_asr_non_streaming_nemo_canary.js
Normal file
@@ -0,0 +1,62 @@
|
||||
// Copyright (c) 2024 Xiaomi Corporation
//
// Decodes one wave file with a NeMo Canary model twice: first with English as
// the target language, then with German after switching tgtLang at runtime
// through recognizer.setConfig().
const sherpa_onnx = require('sherpa-onnx-node');

// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const canary = {
  encoder:
      './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx',
  decoder:
      './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx',
  srcLang: 'en',
  tgtLang: 'en',
  usePnc: 1,
};

const config = {
  featConfig: {
    sampleRate: 16000,
    featureDim: 80,
  },
  modelConfig: {
    canary: canary,
    tokens: './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
    debug: 0,
  },
};

const waveFilename =
    './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);

// Creates a fresh stream and feeds it every sample of `audio`.
function createLoadedStream(audio) {
  const s = recognizer.createStream();
  s.acceptWaveform({sampleRate: audio.sampleRate, samples: audio.samples});
  return s;
}

// ---- First pass: English output, timed end to end (including file read) ----
console.log('Started')
const start = Date.now();
const wave = sherpa_onnx.readWave(waveFilename);
const englishStream = createLoadedStream(wave);
recognizer.decode(englishStream);
const englishResult = recognizer.getResult(englishStream);
const stop = Date.now();
console.log('Done')

const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;

console.log('Wave duration', duration.toFixed(3), 'seconds')
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3))
console.log(waveFilename)
console.log('result (English)\n', englishResult)

// ---- Second pass: same audio, German output ----
// The stream is created and filled first; the target language is changed
// afterwards, right before decoding.
const germanStream = createLoadedStream(wave);
recognizer.config.modelConfig.canary.tgtLang = 'de';
recognizer.setConfig(recognizer.config);

recognizer.decode(germanStream);
const germanResult = recognizer.getResult(germanStream);
console.log('result (German)\n', germanResult)
|
||||
@@ -63,7 +63,7 @@ for text-to-speech.
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
rm kokoro-en-v0_19.tar.bz2
|
||||
|
||||
@@ -154,6 +154,22 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||
node ./test-offline-dolphin-ctc.js
|
||||
```
|
||||
|
||||
## ./test-offline-nemo-canary.js
|
||||
|
||||
[./test-offline-nemo-canary.js](./test-offline-nemo-canary.js) demonstrates
|
||||
how to decode a file with a NeMo Canary model. In the code we use
|
||||
[sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8](https://k2-fsa.github.io/sherpa/onnx/nemo/canary.html#sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8-english-spanish-german-french).
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
rm sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8.tar.bz2
|
||||
|
||||
node ./test-offline-nemo-canary.js
|
||||
```
|
||||
|
||||
## ./test-offline-zipformer-ctc.js
|
||||
|
||||
[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates
|
||||
|
||||
56
nodejs-examples/test-offline-nemo-canary.js
Normal file
56
nodejs-examples/test-offline-nemo-canary.js
Normal file
@@ -0,0 +1,56 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
/**
 * Builds an offline recognizer backed by the NeMo Canary int8 model.
 *
 * Source and target languages both start as English; the target language can
 * be changed later through `recognizer.setConfig()`.
 *
 * @returns the object returned by `sherpa_onnx.createOfflineRecognizer()`.
 */
function createOfflineRecognizer() {
  const canary = {
    encoder:
        './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/encoder.int8.onnx',
    decoder:
        './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/decoder.int8.onnx',
    srcLang: 'en',
    tgtLang: 'en',
    usePnc: 1,
  };

  return sherpa_onnx.createOfflineRecognizer({
    modelConfig: {
      canary: canary,
      debug: 0,
      tokens:
          './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/tokens.txt',
    },
  });
}
|
||||
|
||||
const recognizer = createOfflineRecognizer();

const waveFilename =
    './sherpa-onnx-nemo-canary-180m-flash-en-es-de-fr-int8/test_wavs/en.wav';
const wave = sherpa_onnx.readWave(waveFilename);

// Decodes the whole wave on a fresh stream, releases the stream and returns
// the recognized text.
function decodeWave() {
  const s = recognizer.createStream();
  s.acceptWaveform(wave.sampleRate, wave.samples);
  recognizer.decode(s);
  const recognized = recognizer.getResult(s).text;
  s.free();
  return recognized;
}

let text = decodeWave();
console.log(`text in English: ${text}`);

// now output German text
recognizer.config.modelConfig.canary.tgtLang = 'de';
recognizer.setConfig(recognizer.config);

text = decodeWave();
console.log(`text in German: ${text}`);

recognizer.free();
|
||||
@@ -24,6 +24,10 @@ class OfflineRecognizer {
|
||||
return new OfflineStream(handle);
|
||||
}
|
||||
|
||||
setConfig(config) {
|
||||
addon.offlineRecognizerSetConfig(this.handle, config);
|
||||
}
|
||||
|
||||
decode(stream) {
|
||||
addon.decodeOfflineStream(this.handle, stream.handle);
|
||||
}
|
||||
|
||||
@@ -487,6 +487,21 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
|
||||
recognizer_config.model_config.zipformer_ctc.model =
|
||||
SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
|
||||
|
||||
recognizer_config.model_config.canary.encoder =
|
||||
SHERPA_ONNX_OR(config->model_config.canary.encoder, "");
|
||||
|
||||
recognizer_config.model_config.canary.decoder =
|
||||
SHERPA_ONNX_OR(config->model_config.canary.decoder, "");
|
||||
|
||||
recognizer_config.model_config.canary.src_lang =
|
||||
SHERPA_ONNX_OR(config->model_config.canary.src_lang, "");
|
||||
|
||||
recognizer_config.model_config.canary.tgt_lang =
|
||||
SHERPA_ONNX_OR(config->model_config.canary.tgt_lang, "");
|
||||
|
||||
recognizer_config.model_config.canary.use_pnc =
|
||||
config->model_config.canary.use_pnc;
|
||||
|
||||
recognizer_config.lm_config.model =
|
||||
SHERPA_ONNX_OR(config->lm_config.model, "");
|
||||
recognizer_config.lm_config.scale =
|
||||
|
||||
@@ -420,6 +420,14 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig {
|
||||
int32_t tail_paddings;
|
||||
} SherpaOnnxOfflineWhisperModelConfig;
|
||||
|
||||
// Configuration of an offline NeMo Canary model.
// NOTE: the WASM JavaScript binding fills this struct at fixed byte offsets
// (four pointers followed by one int32), so the field order and types must
// stay in sync with it.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineCanaryModelConfig {
|
||||
const char *encoder;  // path to the encoder model file, e.g. encoder.int8.onnx
|
||||
const char *decoder;  // path to the decoder model file, e.g. decoder.int8.onnx
|
||||
const char *src_lang;  // source language code, e.g. "en"
|
||||
const char *tgt_lang;  // language code of the output text, e.g. "en" or "de"
|
||||
int32_t use_pnc;  // presumably non-zero enables punctuation and capitalization - TODO confirm
|
||||
} SherpaOnnxOfflineCanaryModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineFireRedAsrModelConfig {
|
||||
const char *encoder;
|
||||
const char *decoder;
|
||||
@@ -479,6 +487,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
|
||||
SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
|
||||
SherpaOnnxOfflineDolphinModelConfig dolphin;
|
||||
SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
SherpaOnnxOfflineCanaryModelConfig canary;
|
||||
} SherpaOnnxOfflineModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
|
||||
|
||||
@@ -193,7 +193,7 @@ void OfflineStream::AcceptWaveform(int32_t sample_rate, const float *samples,
|
||||
SherpaOnnxAcceptWaveformOffline(p_, sample_rate, samples, n);
|
||||
}
|
||||
|
||||
OfflineRecognizer OfflineRecognizer::Create(
|
||||
static SherpaOnnxOfflineRecognizerConfig Convert(
|
||||
const OfflineRecognizerConfig &config) {
|
||||
struct SherpaOnnxOfflineRecognizerConfig c;
|
||||
memset(&c, 0, sizeof(c));
|
||||
@@ -256,6 +256,12 @@ OfflineRecognizer OfflineRecognizer::Create(
|
||||
c.model_config.zipformer_ctc.model =
|
||||
config.model_config.zipformer_ctc.model.c_str();
|
||||
|
||||
c.model_config.canary.encoder = config.model_config.canary.encoder.c_str();
|
||||
c.model_config.canary.decoder = config.model_config.canary.decoder.c_str();
|
||||
c.model_config.canary.src_lang = config.model_config.canary.src_lang.c_str();
|
||||
c.model_config.canary.tgt_lang = config.model_config.canary.tgt_lang.c_str();
|
||||
c.model_config.canary.use_pnc = config.model_config.canary.use_pnc;
|
||||
|
||||
c.lm_config.model = config.lm_config.model.c_str();
|
||||
c.lm_config.scale = config.lm_config.scale;
|
||||
|
||||
@@ -273,10 +279,22 @@ OfflineRecognizer OfflineRecognizer::Create(
|
||||
c.hr.lexicon = config.hr.lexicon.c_str();
|
||||
c.hr.rule_fsts = config.hr.rule_fsts.c_str();
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
// Creates a recognizer from `config`.
//
// The C struct produced by Convert() stores c_str() pointers into `config`,
// so it is only used while `config` is alive, i.e. within this call.
OfflineRecognizer OfflineRecognizer::Create(
    const OfflineRecognizerConfig &config) {
  SherpaOnnxOfflineRecognizerConfig c_config = Convert(config);
  return OfflineRecognizer(SherpaOnnxCreateOfflineRecognizer(&c_config));
}
|
||||
|
||||
void OfflineRecognizer::SetConfig(const OfflineRecognizerConfig &config) const {
|
||||
auto c = Convert(config);
|
||||
SherpaOnnxOfflineRecognizerSetConfig(p_, &c);
|
||||
}
|
||||
|
||||
OfflineRecognizer::OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p)
|
||||
: MoveOnly<OfflineRecognizer, SherpaOnnxOfflineRecognizer>(p) {}
|
||||
|
||||
|
||||
@@ -223,6 +223,14 @@ struct SHERPA_ONNX_API OfflineWhisperModelConfig {
|
||||
int32_t tail_paddings = -1;
|
||||
};
|
||||
|
||||
// C++ counterpart of SherpaOnnxOfflineCanaryModelConfig: settings of an
// offline NeMo Canary model. The string members are passed to the C API
// via c_str().
struct SHERPA_ONNX_API OfflineCanaryModelConfig {
|
||||
std::string encoder;  // path to the encoder model file
|
||||
std::string decoder;  // path to the decoder model file
|
||||
std::string src_lang;  // source language code, e.g. "en"
|
||||
std::string tgt_lang;  // language code of the output text, e.g. "en" or "de"
|
||||
bool use_pnc = true;  // enabled by default; presumably punctuation and capitalization - TODO confirm
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineFireRedAsrModelConfig {
|
||||
std::string encoder;
|
||||
std::string decoder;
|
||||
@@ -273,6 +281,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
|
||||
OfflineFireRedAsrModelConfig fire_red_asr;
|
||||
OfflineDolphinModelConfig dolphin;
|
||||
OfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
OfflineCanaryModelConfig canary;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineLMConfig {
|
||||
@@ -335,6 +344,8 @@ class SHERPA_ONNX_API OfflineRecognizer
|
||||
|
||||
OfflineRecognizerResult GetResult(const OfflineStream *s) const;
|
||||
|
||||
void SetConfig(const OfflineRecognizerConfig &config) const;
|
||||
|
||||
private:
|
||||
explicit OfflineRecognizer(const SherpaOnnxOfflineRecognizer *p);
|
||||
};
|
||||
|
||||
@@ -45,7 +45,7 @@ Usage:
|
||||
|
||||
./bin/sherpa-onnx \
|
||||
--debug=1 \
|
||||
--zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
|
||||
--zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
|
||||
--tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
|
||||
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav \
|
||||
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000001.wav \
|
||||
|
||||
@@ -12,7 +12,6 @@ set(exported_functions
|
||||
SherpaOnnxCreateOnlineRecognizer
|
||||
SherpaOnnxCreateOnlineStream
|
||||
SherpaOnnxDecodeOnlineStream
|
||||
SherpaOnnxDestroyOfflineStreamResultJson
|
||||
SherpaOnnxDestroyOnlineRecognizer
|
||||
SherpaOnnxDestroyOnlineRecognizerResult
|
||||
SherpaOnnxDestroyOnlineStream
|
||||
|
||||
@@ -59,6 +59,10 @@ function freeConfig(config, Module) {
|
||||
freeConfig(config.senseVoice, Module)
|
||||
}
|
||||
|
||||
if ('canary' in config) {
|
||||
freeConfig(config.canary, Module)
|
||||
}
|
||||
|
||||
if ('lm' in config) {
|
||||
freeConfig(config.lm, Module)
|
||||
}
|
||||
@@ -246,7 +250,7 @@ function initSherpaOnnxOnlineModelConfig(config, Module) {
|
||||
Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(ptr + offset, config.debug || 0, 'i32');
|
||||
Module.setValue(ptr + offset, config.debug ?? 1, 'i32');
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(
|
||||
@@ -692,6 +696,51 @@ function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes a Canary model config into the WASM heap.
 *
 * Two allocations are made:
 *  - `buffer`: the four NUL-terminated UTF-8 strings (encoder, decoder,
 *    srcLang, tgtLang) stored back to back;
 *  - `ptr`: a 20-byte record of four 4-byte pointers into `buffer`
 *    followed by one int32 (usePnc).
 *
 * Missing or empty string fields are written as empty strings; a missing
 * `usePnc` (null/undefined) is written as 1.
 *
 * @param config object with optional encoder, decoder, srcLang, tgtLang and
 *     usePnc fields.
 * @param Module object providing lengthBytesUTF8, stringToUTF8, setValue
 *     and _malloc.
 * @returns {{buffer: number, ptr: number, len: number}} the two heap
 *     addresses and the record size in bytes; presumably released later by
 *     freeConfig() - confirm against the caller.
 */
function initSherpaOnnxOfflineCanaryModelConfig(config, Module) {
  // Order matters: it defines the order of the pointers in the record.
  const strings = [
    config.encoder || '',
    config.decoder || '',
    config.srcLang || '',
    config.tgtLang || '',
  ];

  // +1 for the terminating NUL of each string.
  const sizes = strings.map((s) => Module.lengthBytesUTF8(s) + 1);
  const totalBytes = sizes.reduce((sum, size) => sum + size, 0);
  const buffer = Module._malloc(totalBytes);

  const len = 5 * 4;  // 4 pointers + 1 int32
  const ptr = Module._malloc(len);

  // Copy the strings into `buffer`, remembering where each one starts.
  const starts = [];
  let cursor = 0;
  strings.forEach((s, i) => {
    Module.stringToUTF8(s, buffer + cursor, sizes[i]);
    starts.push(cursor);
    cursor += sizes[i];
  });

  // Fill in the record: one pointer per string, then the flag.
  starts.forEach((start, i) => {
    Module.setValue(ptr + 4 * i, buffer + start, 'i8*');
  });
  Module.setValue(ptr + 16, config.usePnc ?? 1, 'i32');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
|
||||
|
||||
function initSherpaOnnxOfflineMoonshineModelConfig(config, Module) {
|
||||
const preprocessorLen = Module.lengthBytesUTF8(config.preprocessor || '') + 1;
|
||||
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
|
||||
@@ -811,7 +860,7 @@ function initSherpaOnnxOfflineSenseVoiceModelConfig(config, Module) {
|
||||
Module.setValue(ptr + 4, buffer + offset, 'i8*');
|
||||
offset += languageLen;
|
||||
|
||||
Module.setValue(ptr + 8, config.useInverseTextNormalization || 0, 'i32');
|
||||
Module.setValue(ptr + 8, config.useInverseTextNormalization ?? 0, 'i32');
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len,
|
||||
@@ -907,6 +956,16 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
};
|
||||
}
|
||||
|
||||
if (!('canary' in config)) {
|
||||
config.canary = {
|
||||
encoder: '',
|
||||
decoder: '',
|
||||
srcLang: '',
|
||||
tgtLang: '',
|
||||
usePnc: 1,
|
||||
};
|
||||
}
|
||||
|
||||
const transducer =
|
||||
initSherpaOnnxOfflineTransducerModelConfig(config.transducer, Module);
|
||||
|
||||
@@ -936,9 +995,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
const zipformerCtc =
|
||||
initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module);
|
||||
|
||||
const canary = initSherpaOnnxOfflineCanaryModelConfig(config.canary, Module);
|
||||
|
||||
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
|
||||
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
|
||||
dolphin.len + zipformerCtc.len;
|
||||
dolphin.len + zipformerCtc.len + canary.len;
|
||||
|
||||
const ptr = Module._malloc(len);
|
||||
|
||||
@@ -1000,7 +1061,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(ptr + offset, config.debug || 0, 'i32');
|
||||
Module.setValue(ptr + offset, config.debug ?? 1, 'i32');
|
||||
offset += 4;
|
||||
|
||||
Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider
|
||||
@@ -1043,11 +1104,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset);
|
||||
offset += zipformerCtc.len;
|
||||
|
||||
Module._CopyHeap(canary.ptr, canary.len, ptr + offset);
|
||||
offset += canary.len;
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
|
||||
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
|
||||
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
|
||||
dolphin: dolphin, zipformerCtc: zipformerCtc
|
||||
dolphin: dolphin, zipformerCtc: zipformerCtc, canary: canary,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1189,6 +1253,13 @@ class OfflineRecognizer {
|
||||
this.Module = Module;
|
||||
}
|
||||
|
||||
setConfig(configObj) {
|
||||
const config =
|
||||
initSherpaOnnxOfflineRecognizerConfig(configObj, this.Module);
|
||||
this.Module._SherpaOnnxOfflineRecognizerSetConfig(this.handle, config.ptr);
|
||||
freeConfig(config, this.Module);
|
||||
}
|
||||
|
||||
free() {
|
||||
this.Module._SherpaOnnxDestroyOfflineRecognizer(this.handle);
|
||||
this.handle = 0
|
||||
|
||||
@@ -41,6 +41,7 @@ set(exported_functions
|
||||
SherpaOnnxDestroyOfflineStreamResultJson
|
||||
SherpaOnnxGetOfflineStreamResult
|
||||
SherpaOnnxGetOfflineStreamResultAsJson
|
||||
SherpaOnnxOfflineRecognizerSetConfig
|
||||
# online kws
|
||||
SherpaOnnxCreateKeywordSpotter
|
||||
SherpaOnnxCreateKeywordStream
|
||||
|
||||
@@ -21,6 +21,7 @@ static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineMoonshineModelConfig) == 4 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineTdnnModelConfig) == 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) == 3 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineCanaryModelConfig) == 5 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineLMConfig) == 2 * 4, "");
|
||||
|
||||
static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
|
||||
@@ -33,7 +34,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
|
||||
sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineDolphinModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig),
|
||||
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineCanaryModelConfig),
|
||||
|
||||
"");
|
||||
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
|
||||
@@ -80,6 +82,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
|
||||
auto fire_red_asr = &model_config->fire_red_asr;
|
||||
auto dolphin = &model_config->dolphin;
|
||||
auto zipformer_ctc = &model_config->zipformer_ctc;
|
||||
auto canary = &model_config->canary;
|
||||
|
||||
fprintf(stdout, "----------offline transducer model config----------\n");
|
||||
fprintf(stdout, "encoder: %s\n", transducer->encoder);
|
||||
@@ -123,6 +126,13 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
|
||||
fprintf(stdout, "----------offline zipformer ctc model config----------\n");
|
||||
fprintf(stdout, "model: %s\n", zipformer_ctc->model);
|
||||
|
||||
fprintf(stdout, "----------offline NeMo Canary model config----------\n");
|
||||
fprintf(stdout, "encoder: %s\n", canary->encoder);
|
||||
fprintf(stdout, "decoder: %s\n", canary->decoder);
|
||||
fprintf(stdout, "src_lang: %s\n", canary->src_lang);
|
||||
fprintf(stdout, "tgt_lang: %s\n", canary->tgt_lang);
|
||||
fprintf(stdout, "use_pnc: %d\n", canary->use_pnc);
|
||||
|
||||
fprintf(stdout, "tokens: %s\n", model_config->tokens);
|
||||
fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
|
||||
fprintf(stdout, "provider: %s\n", model_config->provider);
|
||||
|
||||
Reference in New Issue
Block a user