Add more streaming ASR methods for node-addon-api (#860)
This commit is contained in:
@@ -28,9 +28,13 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PAT
|
||||
```
|
||||
|
||||
## Streaming speech recognition with zipformer transducer
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
|
||||
|
||||
node ./test_asr_streaming_transducer.js
|
||||
|
||||
node ./test_asr_streaming_transducer_microphone.js
|
||||
```
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"sherpa-onnx-node": "*",
|
||||
"perf_hooks": "*"
|
||||
"naudiodon2": "^2.4.0",
|
||||
"perf_hooks": "*",
|
||||
"sherpa-onnx-node": "*"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const portAudio = require('naudiodon2');
|
||||
// console.log(portAudio.getDevices());
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx-node');
|
||||
|
||||
/**
 * Build the streaming recognizer used by this example.
 *
 * Every model file is read from
 * ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/, so that
 * archive must be unpacked in the current working directory first.
 *
 * @returns {sherpa_onnx.OnlineRecognizer} A recognizer built from the
 *     configuration assembled below.
 */
function createOnlineRecognizer() {
  // Feature extraction settings.
  const featConfig = {
    sampleRate: 16000,
    featureDim: 80,
  };

  // The three ONNX files that make up the transducer model.
  const transducer = {
    encoder:
        './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx',
    decoder:
        './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx',
    joiner:
        './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx',
  };

  const modelConfig = {
    transducer,
    tokens:
        './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
    debug: 1,
    modelType: 'zipformer',
  };

  // Decoding options plus endpoint detection (enabled) and its rule values.
  return new sherpa_onnx.OnlineRecognizer({
    featConfig,
    modelConfig,
    decodingMethod: 'greedy_search',
    maxActivePaths: 4,
    enableEndpoint: true,
    rule1MinTrailingSilence: 2.4,
    rule2MinTrailingSilence: 1.2,
    rule3MinUtteranceLength: 20,
  });
}
|
||||
|
||||
// Main script: capture microphone audio with naudiodon2, feed it to the
// streaming recognizer, and print the running transcript segment by segment.
const recognizer = createOnlineRecognizer();
const stream = recognizer.createStream();

// The microphone is opened at the same rate the recognizer was configured
// with, so the value is read once and used for both.
const sampleRate = recognizer.config.featConfig.sampleRate;

// Text most recently shown; used to avoid reprinting an unchanged result.
let lastText = '';
// Index of the segment currently being recognized; bumped at each endpoint
// that produced text.
let segmentIndex = 0;

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: sampleRate,
  },
});

const display = new sherpa_onnx.Display(50);

/**
 * Reinterpret the bytes of a Node.js Buffer as 32-bit float samples.
 *
 * A Buffer may be a view into a larger ArrayBuffer, so the view's own
 * byteOffset and byteLength have to be honoured; wrapping `chunk.buffer`
 * directly could expose bytes that do not belong to this chunk.
 *
 * @param {Buffer} chunk - Raw bytes delivered by the audio input.
 * @returns {Float32Array} The samples contained in `chunk`. Trailing bytes
 *     that do not form a whole sample are ignored.
 */
function bufferToFloat32Samples(chunk) {
  const bytesPerSample = Float32Array.BYTES_PER_ELEMENT;
  const numSamples = Math.floor(chunk.byteLength / bytesPerSample);

  if (chunk.byteOffset % bytesPerSample === 0) {
    return new Float32Array(chunk.buffer, chunk.byteOffset, numSamples);
  }

  // A Float32Array view needs a 4-byte aligned offset; copy the bytes out
  // when the Buffer does not start on such a boundary.
  const start = chunk.byteOffset;
  const end = start + numSamples * bytesPerSample;
  return new Float32Array(chunk.buffer.slice(start, end));
}

ai.on('data', (data) => {
  const samples = bufferToFloat32Samples(data);

  stream.acceptWaveform(samples, sampleRate);

  // Decode everything that is ready before looking at the result.
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }

  const isEndpoint = recognizer.isEndpoint(stream);
  const text = recognizer.getResult(stream).text;

  if (text.length > 0 && lastText !== text) {
    lastText = text;
    display.print(segmentIndex, lastText);
  }

  if (isEndpoint) {
    // Only move on to a new segment when the finished one produced text.
    if (text.length > 0) {
      lastText = text;
      segmentIndex += 1;
    }
    recognizer.reset(stream);
  }
});

ai.on('close', () => {
  console.log('Free resources');
  // NOTE(review): free() is not defined in the parts of the wrapper classes
  // visible alongside this script; confirm both objects provide it.
  stream.free();
  recognizer.free();
});

ai.start();
console.log('Started! Please speak');
|
||||
@@ -4,4 +4,5 @@ const streaming_asr = require('./streaming-asr.js');
|
||||
module.exports = {
|
||||
OnlineRecognizer: streaming_asr.OnlineRecognizer,
|
||||
readWave: addon.readWave,
|
||||
Display: streaming_asr.Display,
|
||||
}
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
const addon = require('./addon.js');
|
||||
|
||||
/**
 * JavaScript wrapper around the display helper exposed by the native addon.
 * It owns the native handle and forwards print requests to it.
 */
class Display {
  /**
   * @param {number} maxWordPerline - Passed unchanged to
   *     addon.createDisplay(); the addon rejects non-number values.
   */
  constructor(maxWordPerline) {
    this.handle = addon.createDisplay(maxWordPerline);
  }

  /**
   * Forward one (index, text) pair to the native print function.
   *
   * @param {number} idx - Number passed through to the addon.
   * @param {string} text - String passed through to the addon.
   */
  print(idx, text) {
    const {handle} = this;
    addon.print(handle, idx, text);
  }
}
|
||||
|
||||
class OnlineStream {
|
||||
constructor(handle) {
|
||||
this.handle = handle;
|
||||
@@ -10,11 +20,16 @@ class OnlineStream {
|
||||
addon.acceptWaveformOnline(
|
||||
this.handle, {samples: samples, sampleRate: sampleRate})
|
||||
}
|
||||
|
||||
inputFinished() {
|
||||
addon.inputFinished(this.handle)
|
||||
}
|
||||
}
|
||||
|
||||
class OnlineRecognizer {
|
||||
constructor(config) {
|
||||
this.handle = addon.createOnlineRecognizer(config);
|
||||
this.config = config
|
||||
}
|
||||
|
||||
createStream() {
|
||||
@@ -30,6 +45,14 @@ class OnlineRecognizer {
|
||||
addon.decodeOnlineStream(this.handle, stream.handle);
|
||||
}
|
||||
|
||||
isEndpoint(stream) {
|
||||
return addon.isEndpoint(this.handle, stream.handle);
|
||||
}
|
||||
|
||||
reset(stream) {
|
||||
addon.reset(this.handle, stream.handle);
|
||||
}
|
||||
|
||||
getResult(stream) {
|
||||
const jsonStr =
|
||||
addon.getOnlineStreamResultAsJson(this.handle, stream.handle);
|
||||
@@ -38,4 +61,7 @@ class OnlineRecognizer {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {OnlineRecognizer}
|
||||
module.exports = {
|
||||
OnlineRecognizer,
|
||||
Display
|
||||
}
|
||||
|
||||
@@ -166,6 +166,69 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
|
||||
memset(&c, 0, sizeof(c));
|
||||
c.feat_config = GetFeatureConfig(config);
|
||||
c.model_config = GetOnlineModelConfig(config);
|
||||
|
||||
if (config.Has("decodingMethod") && config.Get("decodingMethod").IsString()) {
|
||||
Napi::String decoding_method =
|
||||
config.Get("decodingMethod").As<Napi::String>();
|
||||
std::string s = decoding_method.Utf8Value();
|
||||
char *p = new char[s.size() + 1];
|
||||
std::copy(s.begin(), s.end(), p);
|
||||
p[s.size()] = 0;
|
||||
|
||||
c.decoding_method = p;
|
||||
}
|
||||
|
||||
if (config.Has("maxActivePaths") && config.Get("maxActivePaths").IsNumber()) {
|
||||
c.max_active_paths =
|
||||
config.Get("maxActivePaths").As<Napi::Number>().Int32Value();
|
||||
}
|
||||
|
||||
// enableEndpoint can be either a boolean or an integer
|
||||
if (config.Has("enableEndpoint") &&
|
||||
(config.Get("enableEndpoint").IsNumber() ||
|
||||
config.Get("enableEndpoint").IsBoolean())) {
|
||||
if (config.Get("enableEndpoint").IsNumber()) {
|
||||
c.enable_endpoint =
|
||||
config.Get("enableEndpoint").As<Napi::Number>().Int32Value();
|
||||
} else {
|
||||
c.enable_endpoint =
|
||||
config.Get("enableEndpoint").As<Napi::Boolean>().Value();
|
||||
}
|
||||
}
|
||||
|
||||
if (config.Has("rule1MinTrailingSilence") &&
|
||||
config.Get("rule1MinTrailingSilence").IsNumber()) {
|
||||
c.rule1_min_trailing_silence =
|
||||
config.Get("rule1MinTrailingSilence").As<Napi::Number>().FloatValue();
|
||||
}
|
||||
|
||||
if (config.Has("rule2MinTrailingSilence") &&
|
||||
config.Get("rule2MinTrailingSilence").IsNumber()) {
|
||||
c.rule2_min_trailing_silence =
|
||||
config.Get("rule2MinTrailingSilence").As<Napi::Number>().FloatValue();
|
||||
}
|
||||
|
||||
if (config.Has("rule3MinUtteranceLength") &&
|
||||
config.Get("rule3MinUtteranceLength").IsNumber()) {
|
||||
c.rule3_min_utterance_length =
|
||||
config.Get("rule3MinUtteranceLength").As<Napi::Number>().FloatValue();
|
||||
}
|
||||
|
||||
if (config.Has("hotwordsFile") && config.Get("hotwordsFile").IsString()) {
|
||||
Napi::String hotwords_file = config.Get("hotwordsFile").As<Napi::String>();
|
||||
std::string s = hotwords_file.Utf8Value();
|
||||
char *p = new char[s.size() + 1];
|
||||
std::copy(s.begin(), s.end(), p);
|
||||
p[s.size()] = 0;
|
||||
|
||||
c.hotwords_file = p;
|
||||
}
|
||||
|
||||
if (config.Has("hotwordsScore") && config.Get("hotwordsScore").IsNumber()) {
|
||||
c.hotwords_score =
|
||||
config.Get("hotwordsScore").As<Napi::Number>().FloatValue();
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("encoder: %s\n", c.model_config.transducer.encoder
|
||||
? c.model_config.transducer.encoder
|
||||
@@ -184,6 +247,15 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
|
||||
printf("debug: %d\n", c.model_config.debug);
|
||||
printf("model_type: %s\n",
|
||||
c.model_config.model_type ? c.model_config.model_type : "no");
|
||||
|
||||
printf("decoding_method: %s\n", c.decoding_method ? c.decoding_method : "no");
|
||||
printf("max_active_paths: %d\n", c.max_active_paths);
|
||||
printf("enable_endpoint: %d\n", c.enable_endpoint);
|
||||
printf("rule1_min_trailing_silence: %.3f\n", c.rule1_min_trailing_silence);
|
||||
printf("rule2_min_trailing_silence: %.3f\n", c.rule2_min_trailing_silence);
|
||||
printf("rule3_min_utterance_length: %.3f\n", c.rule3_min_utterance_length);
|
||||
printf("hotwords_file: %s\n", c.hotwords_file ? c.hotwords_file : "no");
|
||||
printf("hotwords_score: %.3f\n", c.hotwords_score);
|
||||
#endif
|
||||
|
||||
SherpaOnnxOnlineRecognizer *recognizer = CreateOnlineRecognizer(&c);
|
||||
@@ -212,6 +284,14 @@ static Napi::External<SherpaOnnxOnlineRecognizer> CreateOnlineRecognizerWrapper(
|
||||
delete[] c.model_config.model_type;
|
||||
}
|
||||
|
||||
if (c.decoding_method) {
|
||||
delete[] c.decoding_method;
|
||||
}
|
||||
|
||||
if (c.hotwords_file) {
|
||||
delete[] c.hotwords_file;
|
||||
}
|
||||
|
||||
if (!recognizer) {
|
||||
Napi::TypeError::New(env, "Please check your config!")
|
||||
.ThrowAsJavaScriptException();
|
||||
@@ -270,7 +350,7 @@ static void AcceptWaveformWrapper(const Napi::CallbackInfo &info) {
|
||||
}
|
||||
|
||||
if (!info[0].IsExternal()) {
|
||||
Napi::TypeError::New(env, "Argument 0 should be a online stream pointer.")
|
||||
Napi::TypeError::New(env, "Argument 0 should be an online stream pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return;
|
||||
@@ -337,15 +417,14 @@ static Napi::Boolean IsOnlineStreamReadyWrapper(
|
||||
|
||||
if (!info[0].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 0 should be a online recognizer pointer.")
|
||||
"Argument 0 should be an online recognizer pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
if (!info[1].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 1 should be a online recognizer pointer.")
|
||||
Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return {};
|
||||
@@ -375,15 +454,14 @@ static void DecodeOnlineStreamWrapper(const Napi::CallbackInfo &info) {
|
||||
|
||||
if (!info[0].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 0 should be a online recognizer pointer.")
|
||||
"Argument 0 should be an online recognizer pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (!info[1].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 1 should be a online recognizer pointer.")
|
||||
Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return;
|
||||
@@ -412,15 +490,14 @@ static Napi::String GetOnlineStreamResultAsJsonWrapper(
|
||||
|
||||
if (!info[0].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 0 should be a online recognizer pointer.")
|
||||
"Argument 0 should be an online recognizer pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
if (!info[1].IsExternal()) {
|
||||
Napi::TypeError::New(env,
|
||||
"Argument 1 should be a online recognizer pointer.")
|
||||
Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.")
|
||||
.ThrowAsJavaScriptException();
|
||||
|
||||
return {};
|
||||
@@ -440,6 +517,175 @@ static Napi::String GetOnlineStreamResultAsJsonWrapper(
|
||||
return s;
|
||||
}
|
||||
|
||||
// JavaScript binding that forwards to InputFinished() for an online stream.
//
// Expected JavaScript arguments:
//   info[0]: external wrapping a SherpaOnnxOnlineStream.
//
// Throws a JavaScript TypeError and returns without calling InputFinished()
// when the argument count is not exactly 1 or info[0] is not an external.
static void InputFinishedWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 1) {
    std::ostringstream os;
    // Wording matches the single-argument message used by
    // CreateDisplayWrapper ("1 argument", singular).
    os << "Expect only 1 argument. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return;
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(env, "Argument 0 should be an online stream pointer.")
        .ThrowAsJavaScriptException();

    return;
  }

  SherpaOnnxOnlineStream *stream =
      info[0].As<Napi::External<SherpaOnnxOnlineStream>>().Data();

  InputFinished(stream);
}
|
||||
|
||||
// JavaScript binding that forwards to Reset() for a recognizer/stream pair.
//
// Expected JavaScript arguments:
//   info[0]: external wrapping a SherpaOnnxOnlineRecognizer.
//   info[1]: external wrapping a SherpaOnnxOnlineStream.
//
// Any validation failure raises a JavaScript TypeError and returns before
// Reset() is called.
static void ResetOnlineStreamWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const auto num_args = info.Length();
  if (num_args != 2) {
    std::ostringstream msg;
    msg << "Expect only 2 arguments. Given: " << num_args;
    Napi::TypeError::New(env, msg.str()).ThrowAsJavaScriptException();
    return;
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(env,
                         "Argument 0 should be an online recognizer pointer.")
        .ThrowAsJavaScriptException();
    return;
  }

  if (!info[1].IsExternal()) {
    Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.")
        .ThrowAsJavaScriptException();
    return;
  }

  // Both handles were validated above; unwrap them and delegate.
  auto *rec = info[0].As<Napi::External<SherpaOnnxOnlineRecognizer>>().Data();
  auto *s = info[1].As<Napi::External<SherpaOnnxOnlineStream>>().Data();

  Reset(rec, s);
}
|
||||
|
||||
// JavaScript binding that reports the result of IsEndpoint() as a boolean.
//
// Expected JavaScript arguments:
//   info[0]: external wrapping a SherpaOnnxOnlineRecognizer.
//   info[1]: external wrapping a SherpaOnnxOnlineStream.
//
// Returns a Napi::Boolean that is true when IsEndpoint() returns a non-zero
// value. On a validation failure a JavaScript TypeError is raised and an
// empty Napi::Boolean is returned.
static Napi::Boolean IsEndpointWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const auto num_args = info.Length();
  if (num_args != 2) {
    std::ostringstream msg;
    msg << "Expect only 2 arguments. Given: " << num_args;
    Napi::TypeError::New(env, msg.str()).ThrowAsJavaScriptException();
    return {};
  }

  if (!info[0].IsExternal()) {
    Napi::TypeError::New(env,
                         "Argument 0 should be an online recognizer pointer.")
        .ThrowAsJavaScriptException();
    return {};
  }

  if (!info[1].IsExternal()) {
    Napi::TypeError::New(env, "Argument 1 should be an online stream pointer.")
        .ThrowAsJavaScriptException();
    return {};
  }

  auto *rec = info[0].As<Napi::External<SherpaOnnxOnlineRecognizer>>().Data();
  auto *s = info[1].As<Napi::External<SherpaOnnxOnlineStream>>().Data();

  // The result is held as int32_t first, then converted to bool explicitly.
  const int32_t flag = IsEndpoint(rec, s);
  return Napi::Boolean::New(env, flag != 0);
}
|
||||
|
||||
// JavaScript binding that creates a display object via CreateDisplay().
//
// Expected JavaScript arguments:
//   info[0]: number, read as int32 and passed to CreateDisplay().
//
// Returns an external wrapping the new SherpaOnnxDisplay. The external's
// finalizer calls DestroyDisplay() on the pointer. On a validation failure a
// JavaScript TypeError is raised and an empty external is returned.
static Napi::External<SherpaOnnxDisplay> CreateDisplayWrapper(
    const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const auto num_args = info.Length();
  if (num_args != 1) {
    std::ostringstream msg;
    msg << "Expect only 1 argument. Given: " << num_args;
    Napi::TypeError::New(env, msg.str()).ThrowAsJavaScriptException();
    return {};
  }

  if (!info[0].IsNumber()) {
    Napi::TypeError::New(env, "Expect a number as the argument")
        .ThrowAsJavaScriptException();
    return {};
  }

  const int32_t words_per_line = info[0].As<Napi::Number>().Int32Value();

  // The const_cast yields the non-const pointer type that
  // Napi::External<SherpaOnnxDisplay> stores.
  auto *handle = const_cast<SherpaOnnxDisplay *>(CreateDisplay(words_per_line));

  return Napi::External<SherpaOnnxDisplay>::New(
      env, handle,
      [](Napi::Env /*env*/, SherpaOnnxDisplay *p) { DestroyDisplay(p); });
}
|
||||
|
||||
// JavaScript binding that forwards an (index, text) pair to SherpaOnnxPrint().
//
// Expected JavaScript arguments:
//   info[0]: external wrapping a SherpaOnnxDisplay.
//   info[1]: number, read as int32.
//   info[2]: string, converted to UTF-8 before being passed on.
//
// Any validation failure raises a JavaScript TypeError and returns before
// SherpaOnnxPrint() is called.
static void PrintWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  if (info.Length() != 3) {
    std::ostringstream os;
    os << "Expect only 3 arguments. Given: " << info.Length();

    Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException();

    return;
  }

  if (!info[0].IsExternal()) {
    // Argument 0 is unwrapped as a SherpaOnnxDisplay below, so the message
    // names a display rather than an online stream.
    Napi::TypeError::New(env, "Argument 0 should be a display pointer.")
        .ThrowAsJavaScriptException();

    return;
  }

  if (!info[1].IsNumber()) {
    Napi::TypeError::New(env, "Argument 1 should be a number.")
        .ThrowAsJavaScriptException();

    return;
  }

  if (!info[2].IsString()) {
    Napi::TypeError::New(env, "Argument 2 should be a string.")
        .ThrowAsJavaScriptException();

    return;
  }

  SherpaOnnxDisplay *display =
      info[0].As<Napi::External<SherpaOnnxDisplay>>().Data();

  int32_t idx = info[1].As<Napi::Number>().Int32Value();

  // Keep the UTF-8 copy in a local std::string so the c_str() pointer stays
  // valid for the duration of the call.
  std::string s = info[2].As<Napi::String>().Utf8Value();
  SherpaOnnxPrint(display, idx, s.c_str());
}
|
||||
|
||||
void InitStreamingAsr(Napi::Env env, Napi::Object exports) {
|
||||
exports.Set(Napi::String::New(env, "createOnlineRecognizer"),
|
||||
Napi::Function::New(env, CreateOnlineRecognizerWrapper));
|
||||
@@ -458,4 +704,19 @@ void InitStreamingAsr(Napi::Env env, Napi::Object exports) {
|
||||
|
||||
exports.Set(Napi::String::New(env, "getOnlineStreamResultAsJson"),
|
||||
Napi::Function::New(env, GetOnlineStreamResultAsJsonWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "inputFinished"),
|
||||
Napi::Function::New(env, InputFinishedWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "reset"),
|
||||
Napi::Function::New(env, ResetOnlineStreamWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "isEndpoint"),
|
||||
Napi::Function::New(env, IsEndpointWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "createDisplay"),
|
||||
Napi::Function::New(env, CreateDisplayWrapper));
|
||||
|
||||
exports.Set(Napi::String::New(env, "print"),
|
||||
Napi::Function::New(env, PrintWrapper));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user