Add streaming CTC ASR APIs for node-addon-api (#867)

2024-05-13 11:58:25 +08:00
parent db85b2c1d8
commit 384f96c40f
15 changed files with 443 additions and 29 deletions
--- a/nodejs-addon-examples/README.md
+++ b/nodejs-addon-examples/README.md
@@ -27,6 +27,18 @@ export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-x64:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$PWD/node_modules/sherpa-onnx-linux-arm64:$LD_LIBRARY_PATH
 ```

+## Voice activity detection (VAD)
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+
+# To run the test with a microphone, you need to install the package naudiodon2
+npm install naudiodon2
+
+node ./test_vad_microphone.js
+```
+
 ## Streaming speech recognition with zipformer transducer

 ```bash
@@ -36,21 +48,27 @@ rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2

 node ./test_asr_streaming_transducer.js

-# To run the test with microphone, you need to install the package naudiodon2
+# To run the test with a microphone, you need to install the package naudiodon2
 npm install naudiodon2

 node ./test_asr_streaming_transducer_microphone.js
 ```

-# VAD
+## Streaming speech recognition with zipformer CTC

 ```bash
-wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
+rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2

+node ./test_asr_streaming_ctc.js

-# To run the test with microphone, you need to install the package naudiodon2
+# To decode with HLG.fst
+node ./test_asr_streaming_ctc_hlg.js
+
+# To run the test with a microphone, you need to install the package naudiodon2
 npm install naudiodon2

-node ./test_vad_microphone.js
+node ./test_asr_streaming_ctc_microphone.js
+node ./test_asr_streaming_ctc_hlg_microphone.js
 ```
-
--- a/nodejs-addon-examples/test_asr_streaming_ctc.js
+++ b/nodejs-addon-examples/test_asr_streaming_ctc.js
@@ -0,0 +1,55 @@
+// Copyright (c)  2024  Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Decode one wave file with a streaming zipformer2 CTC model and print timing.
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename = './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OnlineRecognizer(config);
+console.log('Started');
+const start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+// Follow the audio with 0.4 s of zero samples (presumably so the streaming
+// model also emits the frames at the very end of the wave -- confirm).
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+while (recognizer.isReady(stream)) {
+  recognizer.decode(stream);
+}
+const result = recognizer.getResult(stream);
+const stop = performance.now();
+console.log('Done');
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds');
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
+const rtfFormula = `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`;
+console.log(rtfFormula, real_time_factor.toFixed(3));
+console.log(waveFilename);
+console.log('result\n', result);
--- a/nodejs-addon-examples/test_asr_streaming_ctc_hlg.js
+++ b/nodejs-addon-examples/test_asr_streaming_ctc_hlg.js
@@ -0,0 +1,58 @@
+// Copyright (c)  2024  Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+const performance = require('perf_hooks').performance;
+
+
+// Decode one wave file with a streaming zipformer2 CTC model plus an HLG.fst
+// decoding graph and print timing. Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  },
+  'ctcFstDecoderConfig': {
+    'graph': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
+  },
+};
+
+const waveFilename = './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/1.wav';
+
+const recognizer = new sherpa_onnx.OnlineRecognizer(config);
+console.log('Started');
+const start = performance.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+// Follow the audio with 0.4 s of zero samples (presumably so the streaming
+// model also emits the frames at the very end of the wave -- confirm).
+const tailPadding = new Float32Array(wave.sampleRate * 0.4);
+stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate});
+
+while (recognizer.isReady(stream)) {
+  recognizer.decode(stream);
+}
+const result = recognizer.getResult(stream);
+const stop = performance.now();
+console.log('Done');
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds');
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
+const rtfFormula = `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`;
+console.log(rtfFormula, real_time_factor.toFixed(3));
+console.log(waveFilename);
+console.log('result\n', result);
--- a/nodejs-addon-examples/test_asr_streaming_ctc_hlg_microphone.js
+++ b/nodejs-addon-examples/test_asr_streaming_ctc_hlg_microphone.js
@@ -0,0 +1,89 @@
+// Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Creates the streaming CTC recognizer; decoding goes through the HLG graph.
+function createOnlineRecognizer() {
+  const featConfig = {'sampleRate': 16000, 'featureDim': 80};
+  const modelConfig = {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  };
+  const ctcFstDecoderConfig = {
+    'graph': './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst',
+  };
+
+  return new sherpa_onnx.OnlineRecognizer({
+    featConfig,
+    modelConfig,
+    ctcFstDecoderConfig,
+    'enableEndpoint': true,
+    'rule1MinTrailingSilence': 2.4,
+    'rule2MinTrailingSilence': 1.2,
+    'rule3MinUtteranceLength': 20,
+  });
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+const sampleRate = recognizer.config.featConfig.sampleRate;
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close on audio errors; if false, just log them
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate,
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+// Feed each microphone chunk to the recognizer and show the running text.
+ai.on('data', data => {
+  // Buffers may be views into a larger ArrayBuffer; wrap only this chunk.
+  const numSamples = Math.floor(data.byteLength / Float32Array.BYTES_PER_ELEMENT);
+  const samples = new Float32Array(data.buffer, data.byteOffset, numSamples);
+  stream.acceptWaveform({sampleRate, samples});
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text.toLowerCase();
+
+  if (text.length > 0 && lastText !== text) {
+    lastText = text;
+    display.print(segmentIndex, lastText);
+  }
+  if (isEndpoint) {
+    if (text.length > 0) {
+      segmentIndex += 1;
+    }
+    recognizer.reset(stream);
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+  stream.free();
+  recognizer.free();
+});
+
+ai.start();
+console.log('Started! Please speak');
--- a/nodejs-addon-examples/test_asr_streaming_ctc_microphone.js
+++ b/nodejs-addon-examples/test_asr_streaming_ctc_microphone.js
@@ -0,0 +1,88 @@
+// Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Creates the streaming zipformer2 CTC recognizer used by this example, with
+// greedy-search decoding and endpoint detection switched on.
+function createOnlineRecognizer() {
+  const featConfig = {'sampleRate': 16000, 'featureDim': 80};
+  const modelConfig = {
+    'zipformer2Ctc': {
+      'model':
+          './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx',
+    },
+    'tokens':
+        './sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  };
+
+  return new sherpa_onnx.OnlineRecognizer({
+    featConfig,
+    modelConfig,
+    'decodingMethod': 'greedy_search',
+    'maxActivePaths': 4,
+    'enableEndpoint': true,
+    'rule1MinTrailingSilence': 2.4,
+    'rule2MinTrailingSilence': 1.2,
+    'rule3MinUtteranceLength': 20,
+  });
+}
+
+const recognizer = createOnlineRecognizer();
+const stream = recognizer.createStream();
+const sampleRate = recognizer.config.featConfig.sampleRate;
+
+let lastText = '';
+let segmentIndex = 0;
+
+const ai = new portAudio.AudioIO({
+  inOptions: {
+    channelCount: 1,
+    closeOnError: true,  // Close on audio errors; if false, just log them
+    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
+    sampleFormat: portAudio.SampleFormatFloat32,
+    sampleRate,
+  }
+});
+
+const display = new sherpa_onnx.Display(50);
+
+// Feed each microphone chunk to the recognizer and show the running text.
+ai.on('data', data => {
+  // Buffers may be views into a larger ArrayBuffer; wrap only this chunk.
+  const numSamples = Math.floor(data.byteLength / Float32Array.BYTES_PER_ELEMENT);
+  const samples = new Float32Array(data.buffer, data.byteOffset, numSamples);
+  stream.acceptWaveform({sampleRate, samples});
+
+  while (recognizer.isReady(stream)) {
+    recognizer.decode(stream);
+  }
+
+  const isEndpoint = recognizer.isEndpoint(stream);
+  const text = recognizer.getResult(stream).text.toLowerCase();
+
+  if (text.length > 0 && lastText !== text) {
+    lastText = text;
+    display.print(segmentIndex, lastText);
+  }
+  if (isEndpoint) {
+    if (text.length > 0) {
+      segmentIndex += 1;
+    }
+    recognizer.reset(stream);
+  }
+});
+
+ai.on('close', () => {
+  console.log('Free resources');
+  stream.free();
+  recognizer.free();
+});
+
+ai.start();
+console.log('Started! Please speak');
--- a/nodejs-addon-examples/test_asr_streaming_transducer.js
+++ b/nodejs-addon-examples/test_asr_streaming_transducer.js
@@ -24,7 +24,6 @@ const config = {
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
-    'modelType': 'zipformer',
  }
 };

@@ -53,5 +52,8 @@ const duration = wave.samples.length / wave.sampleRate;
 const real_time_factor = elapsed_seconds / duration;
 console.log('Wave duration', duration.toFixed(3), 'secodns')
 console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns')
-console.log('RTF', real_time_factor.toFixed(3))
-console.log('result', result.text)
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
--- a/nodejs-addon-examples/test_asr_streaming_transducer_microphone.js
+++ b/nodejs-addon-examples/test_asr_streaming_transducer_microphone.js
@@ -25,7 +25,6 @@ function createOnlineRecognizer() {
      'numThreads': 2,
      'provider': 'cpu',
      'debug': 1,
-      'modelType': 'zipformer',
    },
    'decodingMethod': 'greedy_search',
    'maxActivePaths': 4,
@@ -68,7 +67,7 @@ ai.on('data', data => {
  }

  const isEndpoint = recognizer.isEndpoint(stream);
-  const text = recognizer.getResult(stream).text;
+  const text = recognizer.getResult(stream).text.toLowerCase();

  if (text.length > 0 && lastText != text) {
    lastText = text;