diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index 088961f3..45975393 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -28,9 +28,13 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PAT ``` ## Streaming speech recognition with zipformer transducer + ```bash wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +node ./test_asr_streaming_transducer.js + +node ./test_asr_streaming_transducer_microphone.js ``` diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index d69336d1..82958bd7 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,6 +1,7 @@ { "dependencies": { - "sherpa-onnx-node": "*", - "perf_hooks": "*" + "naudiodon2": "^2.4.0", + "perf_hooks": "*", + "sherpa-onnx-node": "*" } } diff --git a/nodejs-addon-examples/test_asr_streaming_transducer_microphone.js b/nodejs-addon-examples/test_asr_streaming_transducer_microphone.js new file mode 100644 index 00000000..c8523191 --- /dev/null +++ b/nodejs-addon-examples/test_asr_streaming_transducer_microphone.js @@ -0,0 +1,92 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) +// +const portAudio = require('naudiodon2'); +// console.log(portAudio.getDevices()); + +const sherpa_onnx = require('sherpa-onnx-node'); + +function createOnlineRecognizer() { + const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'transducer': { + 'encoder': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx', + 'decoder': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + 'joiner': + 
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx', + }, + 'tokens': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + 'modelType': 'zipformer', + }, + 'decodingMethod': 'greedy_search', + 'maxActivePaths': 4, + 'enableEndpoint': true, + 'rule1MinTrailingSilence': 2.4, + 'rule2MinTrailingSilence': 1.2, + 'rule3MinUtteranceLength': 20 + }; + + return new sherpa_onnx.OnlineRecognizer(config); +} + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); + +let lastText = ''; +let segmentIndex = 0; + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: recognizer.config.featConfig.sampleRate + } +}); + +const display = new sherpa_onnx.Display(50); + +ai.on('data', data => { + const samples = new Float32Array(data.buffer, data.byteOffset, data.length / Float32Array.BYTES_PER_ELEMENT); // view only this chunk; data.buffer may be a larger shared ArrayBuffer + + stream.acceptWaveform(samples, recognizer.config.featConfig.sampleRate); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + const isEndpoint = recognizer.isEndpoint(stream); + const text = recognizer.getResult(stream).text; + + if (text.length > 0 && lastText != text) { + lastText = text; + display.print(segmentIndex, lastText); + } + if (isEndpoint) { + if (text.length > 0) { + lastText = text; + segmentIndex += 1; + } + recognizer.reset(stream) + } +}); + +ai.on('close', () => { + console.log('Free resources'); + stream.free(); + recognizer.free(); +}); + +ai.start(); +console.log('Started!
Please speak') diff --git a/scripts/node-addon-api/lib/sherpa-onnx.js b/scripts/node-addon-api/lib/sherpa-onnx.js index 407118ec..b9b343f7 100644 --- a/scripts/node-addon-api/lib/sherpa-onnx.js +++ b/scripts/node-addon-api/lib/sherpa-onnx.js @@ -4,4 +4,5 @@ const streaming_asr = require('./streaming-asr.js'); module.exports = { OnlineRecognizer: streaming_asr.OnlineRecognizer, readWave: addon.readWave, + Display: streaming_asr.Display, } diff --git a/scripts/node-addon-api/lib/streaming-asr.js b/scripts/node-addon-api/lib/streaming-asr.js index e08752d9..21c8632e 100644 --- a/scripts/node-addon-api/lib/streaming-asr.js +++ b/scripts/node-addon-api/lib/streaming-asr.js @@ -1,5 +1,15 @@ const addon = require('./addon.js'); +class Display { + constructor(maxWordPerline) { + this.handle = addon.createDisplay(maxWordPerline); + } + + print(idx, text) { + addon.print(this.handle, idx, text) + } +} + class OnlineStream { constructor(handle) { this.handle = handle; @@ -10,11 +20,16 @@ class OnlineStream { addon.acceptWaveformOnline( this.handle, {samples: samples, sampleRate: sampleRate}) } + + inputFinished() { + addon.inputFinished(this.handle) + } } class OnlineRecognizer { constructor(config) { this.handle = addon.createOnlineRecognizer(config); + this.config = config } createStream() { @@ -30,6 +45,14 @@ class OnlineRecognizer { addon.decodeOnlineStream(this.handle, stream.handle); } + isEndpoint(stream) { + return addon.isEndpoint(this.handle, stream.handle); + } + + reset(stream) { + addon.reset(this.handle, stream.handle); + } + getResult(stream) { const jsonStr = addon.getOnlineStreamResultAsJson(this.handle, stream.handle); @@ -38,4 +61,7 @@ class OnlineRecognizer { } } -module.exports = {OnlineRecognizer} +module.exports = { + OnlineRecognizer, + Display +} diff --git a/scripts/node-addon-api/src/streaming-asr.cc b/scripts/node-addon-api/src/streaming-asr.cc index 66e025cf..f1aee13b 100644 --- a/scripts/node-addon-api/src/streaming-asr.cc +++ 
b/scripts/node-addon-api/src/streaming-asr.cc @@ -166,6 +166,69 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper( memset(&c, 0, sizeof(c)); c.feat_config = GetFeatureConfig(config); c.model_config = GetOnlineModelConfig(config); + + if (config.Has("decodingMethod") && config.Get("decodingMethod").IsString()) { + Napi::String decoding_method = + config.Get("decodingMethod").As<Napi::String>(); + std::string s = decoding_method.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + + c.decoding_method = p; + } + + if (config.Has("maxActivePaths") && config.Get("maxActivePaths").IsNumber()) { + c.max_active_paths = + config.Get("maxActivePaths").As<Napi::Number>().Int32Value(); + } + + // enableEndpoint can be either a boolean or an integer + if (config.Has("enableEndpoint") && + (config.Get("enableEndpoint").IsNumber() || + config.Get("enableEndpoint").IsBoolean())) { + if (config.Get("enableEndpoint").IsNumber()) { + c.enable_endpoint = + config.Get("enableEndpoint").As<Napi::Number>().Int32Value(); + } else { + c.enable_endpoint = + config.Get("enableEndpoint").As<Napi::Boolean>().Value(); + } + } + + if (config.Has("rule1MinTrailingSilence") && + config.Get("rule1MinTrailingSilence").IsNumber()) { + c.rule1_min_trailing_silence = + config.Get("rule1MinTrailingSilence").As<Napi::Number>().FloatValue(); + } + + if (config.Has("rule2MinTrailingSilence") && + config.Get("rule2MinTrailingSilence").IsNumber()) { + c.rule2_min_trailing_silence = + config.Get("rule2MinTrailingSilence").As<Napi::Number>().FloatValue(); + } + + if (config.Has("rule3MinUtteranceLength") && + config.Get("rule3MinUtteranceLength").IsNumber()) { + c.rule3_min_utterance_length = + config.Get("rule3MinUtteranceLength").As<Napi::Number>().FloatValue(); + } + + if (config.Has("hotwordsFile") && config.Get("hotwordsFile").IsString()) { + Napi::String hotwords_file = config.Get("hotwordsFile").As<Napi::String>(); + std::string s = hotwords_file.Utf8Value(); + char *p = new char[s.size() + 1]; + std::copy(s.begin(), s.end(), p); + p[s.size()] = 0; + +
c.hotwords_file = p; + } + + if (config.Has("hotwordsScore") && config.Get("hotwordsScore").IsNumber()) { + c.hotwords_score = + config.Get("hotwordsScore").As<Napi::Number>().FloatValue(); + } + #if 0 printf("encoder: %s\n", c.model_config.transducer.encoder ? c.model_config.transducer.encoder @@ -184,6 +247,15 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper( printf("debug: %d\n", c.model_config.debug); printf("model_type: %s\n", c.model_config.model_type ? c.model_config.model_type : "no"); + + printf("decoding_method: %s\n", c.decoding_method ? c.decoding_method : "no"); + printf("max_active_paths: %d\n", c.max_active_paths); + printf("enable_endpoint: %d\n", c.enable_endpoint); + printf("rule1_min_trailing_silence: %.3f\n", c.rule1_min_trailing_silence); + printf("rule2_min_trailing_silence: %.3f\n", c.rule2_min_trailing_silence); + printf("rule3_min_utterance_length: %.3f\n", c.rule3_min_utterance_length); + printf("hotwords_file: %s\n", c.hotwords_file ? c.hotwords_file : "no"); + printf("hotwords_score: %.3f\n", c.hotwords_score); #endif SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&c); @@ -212,6 +284,14 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper( delete[] c.model_config.model_type; } + if (c.decoding_method) { + delete[] c.decoding_method; + } + + if (c.hotwords_file) { + delete[] c.hotwords_file; + } + if (!recognizer) { Napi::TypeError::New(env, "Please check your config!") .ThrowAsJavaScriptException(); @@ -270,7 +350,7 @@ static void AcceptWaveformWrapper(const Napi::CallbackInfo &info) { } if (!info[0].IsExternal()) { - Napi::TypeError::New(env, "Argument 0 should be a online stream pointer.") + Napi::TypeError::New(env, "Argument 0 should be an online stream pointer.") .ThrowAsJavaScriptException(); return; @@ -337,15 +417,14 @@ static Napi::Boolean IsOnlineStreamReadyWrapper( if (!info[0].IsExternal()) { Napi::TypeError::New(env, - "Argument 0 should be a online recognizer pointer.") + "Argument 0 should be an online
recognizer pointer.") .ThrowAsJavaScriptException(); return {}; } if (!info[1].IsExternal()) { - Napi::TypeError::New(env, - "Argument 1 should be a online recognizer pointer.") + Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.") .ThrowAsJavaScriptException(); return {}; @@ -375,15 +454,14 @@ static void DecodeOnlineStreamWrapper(const Napi::CallbackInfo &info) { if (!info[0].IsExternal()) { Napi::TypeError::New(env, - "Argument 0 should be a online recognizer pointer.") + "Argument 0 should be an online recognizer pointer.") .ThrowAsJavaScriptException(); return; } if (!info[1].IsExternal()) { - Napi::TypeError::New(env, - "Argument 1 should be a online recognizer pointer.") + Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.") .ThrowAsJavaScriptException(); return; @@ -412,15 +490,14 @@ static Napi::String GetOnlineStreamResultAsJsonWrapper( if (!info[0].IsExternal()) { Napi::TypeError::New(env, - "Argument 0 should be a online recognizer pointer.") + "Argument 0 should be an online recognizer pointer.") .ThrowAsJavaScriptException(); return {}; } if (!info[1].IsExternal()) { - Napi::TypeError::New(env, - "Argument 1 should be a online recognizer pointer.") + Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.") .ThrowAsJavaScriptException(); return {}; @@ -440,6 +517,175 @@ static Napi::String GetOnlineStreamResultAsJsonWrapper( return s; } +static void InputFinishedWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 arguments. 
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be an online stream pointer.") + .ThrowAsJavaScriptException(); + + return; + } + + SherpaOnnxOnlineStream *stream = + info[0].As<Napi::External<SherpaOnnxOnlineStream>>().Data(); + + InputFinished(stream); +} + +static void ResetOnlineStreamWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 2) { + std::ostringstream os; + os << "Expect only 2 arguments. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, + "Argument 0 should be an online recognizer pointer.") + .ThrowAsJavaScriptException(); + + return; + } + + if (!info[1].IsExternal()) { + Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.") + .ThrowAsJavaScriptException(); + + return; + } + + SherpaOnnxOnlineRecognizer *recognizer = + info[0].As<Napi::External<SherpaOnnxOnlineRecognizer>>().Data(); + + SherpaOnnxOnlineStream *stream = + info[1].As<Napi::External<SherpaOnnxOnlineStream>>().Data(); + + Reset(recognizer, stream); +} + +static Napi::Boolean IsEndpointWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 2) { + std::ostringstream os; + os << "Expect only 2 arguments.
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, + "Argument 0 should be an online recognizer pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[1].IsExternal()) { + Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + SherpaOnnxOnlineRecognizer *recognizer = + info[0].As<Napi::External<SherpaOnnxOnlineRecognizer>>().Data(); + + SherpaOnnxOnlineStream *stream = + info[1].As<Napi::External<SherpaOnnxOnlineStream>>().Data(); + + int32_t is_endpoint = IsEndpoint(recognizer, stream); + + return Napi::Boolean::New(env, is_endpoint); +} + +static Napi::External<SherpaOnnxDisplay> CreateDisplayWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsNumber()) { + Napi::TypeError::New(env, "Expect a number as the argument") + .ThrowAsJavaScriptException(); + + return {}; + } + int32_t max_word_per_line = info[0].As<Napi::Number>().Int32Value(); + + const SherpaOnnxDisplay *display = CreateDisplay(max_word_per_line); + + return Napi::External<SherpaOnnxDisplay>::New( + env, const_cast<SherpaOnnxDisplay *>(display), + [](Napi::Env env, SherpaOnnxDisplay *display) { + DestroyDisplay(display); + }); +} + +static void PrintWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 3) { + std::ostringstream os; + os << "Expect only 3 arguments.
Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New(env, "Argument 0 should be a display pointer.") + .ThrowAsJavaScriptException(); + + return; + } + + if (!info[1].IsNumber()) { + Napi::TypeError::New(env, "Argument 1 should be a number.") + .ThrowAsJavaScriptException(); + + return; + } + + if (!info[2].IsString()) { + Napi::TypeError::New(env, "Argument 2 should be a string.") + .ThrowAsJavaScriptException(); + + return; + } + + SherpaOnnxDisplay *display = + info[0].As<Napi::External<SherpaOnnxDisplay>>().Data(); + + int32_t idx = info[1].As<Napi::Number>().Int32Value(); + + Napi::String text = info[2].As<Napi::String>(); + std::string s = text.Utf8Value(); + SherpaOnnxPrint(display, idx, s.c_str()); +} + void InitStreamingAsr(Napi::Env env, Napi::Object exports) { exports.Set(Napi::String::New(env, "createOnlineRecognizer"), Napi::Function::New(env, CreateOnlineRecognizerWrapper)); @@ -458,4 +704,19 @@ void InitStreamingAsr(Napi::Env env, Napi::Object exports) { exports.Set(Napi::String::New(env, "getOnlineStreamResultAsJson"), Napi::Function::New(env, GetOnlineStreamResultAsJsonWrapper)); + + exports.Set(Napi::String::New(env, "inputFinished"), + Napi::Function::New(env, InputFinishedWrapper)); + + exports.Set(Napi::String::New(env, "reset"), + Napi::Function::New(env, ResetOnlineStreamWrapper)); + + exports.Set(Napi::String::New(env, "isEndpoint"), + Napi::Function::New(env, IsEndpointWrapper)); + + exports.Set(Napi::String::New(env, "createDisplay"), + Napi::Function::New(env, CreateDisplayWrapper)); + + exports.Set(Napi::String::New(env, "print"), + Napi::Function::New(env, PrintWrapper)); }