Add JavaScript API for Moonshine models (#1480)

2024-10-27 11:31:01 +08:00
parent 3d3edabb5f
commit 6f261d39f3
13 changed files with 719 additions and 88 deletions
--- a/nodejs-examples/README.md
+++ b/nodejs-examples/README.md
@@ -133,7 +133,25 @@ tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
 node ./test-offline-transducer.js
 ```

+## ./test-vad-with-non-streaming-asr-whisper.js
+
+[./test-vad-with-non-streaming-asr-whisper.js](./test-vad-with-non-streaming-asr-whisper.js)
+shows how to use VAD + whisper to decode a very long file.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test-vad-with-non-streaming-asr-whisper.js
+```
+
 ## ./test-offline-whisper.js
+
 [./test-offline-whisper.js](./test-offline-whisper.js) demonstrates
 how to decode a file with a Whisper model. In the code we use
 [sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html).
@@ -146,7 +164,40 @@ tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
 node ./test-offline-whisper.js
 ```

+## ./test-offline-moonshine.js
+
+[./test-offline-moonshine.js](./test-offline-moonshine.js) demonstrates
+how to decode a file with a Moonshine model. In the code we use
+[sherpa-onnx-moonshine-tiny-en-int8](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2).
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+node ./test-offline-moonshine.js
+```
+
+## ./test-vad-with-non-streaming-asr-moonshine.js
+
+[./test-vad-with-non-streaming-asr-moonshine.js](./test-vad-with-non-streaming-asr-moonshine.js)
+shows how to use VAD + Moonshine to decode a very long file.
+
+You can use the following command to run it:
+
+```bash
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+node ./test-vad-with-non-streaming-asr-moonshine.js
+```
+
 ## ./test-online-paraformer-microphone.js
+
 [./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js)
 demonstrates how to do real-time speech recognition from microphone
 with a streaming Paraformer model. In the code we use
--- a/nodejs-examples/test-offline-moonshine.js
+++ b/nodejs-examples/test-offline-moonshine.js
@@ -0,0 +1,37 @@
+// Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const sherpa_onnx = require('sherpa-onnx');
+
+function createOfflineRecognizer() {
+  let modelConfig = {
+    moonshine: {
+      preprocessor: './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+      encoder: './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+      uncachedDecoder:
+          './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+      cachedDecoder:
+          './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+    },
+    tokens: './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+  };
+
+  let config = {
+    modelConfig: modelConfig,
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+recognizer = createOfflineRecognizer();
+stream = recognizer.createStream();
+
+const waveFilename = './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform(wave.sampleRate, wave.samples);
+
+recognizer.decode(stream);
+const text = recognizer.getResult(stream).text;
+console.log(text);
+
+stream.free();
+recognizer.free();
--- a/nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js
+++ b/nodejs-examples/test-vad-with-non-streaming-asr-moonshine.js
@@ -0,0 +1,128 @@
+// Copyright (c)  2023-2024  Xiaomi Corporation (authors: Fangjun Kuang)
+
+const sherpa_onnx = require('sherpa-onnx');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'modelConfig': {
+      'moonshine': {
+        'preprocessor': './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx',
+        'encoder': './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx',
+        'uncachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx',
+        'cachedDecoder':
+            './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt',
+      'debug': 0,
+    }
+  };
+
+  return sherpa_onnx.createOfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      maxSpeechDuration: 5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+    bufferSizeInSeconds: 60,
+  };
+
+  return sherpa_onnx.createVad(config);
+}
+
+const recognizer = createRecognizer();
+const vad = createVad();
+
+// please download ./Obama.wav from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const waveFilename = './Obama.wav';
+const wave = sherpa_onnx.readWave(waveFilename);
+
+if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
+  throw new Error(
+      'Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${wave.sampleRate}');
+}
+
+console.log('Started')
+let start = Date.now();
+
+const windowSize = vad.config.sileroVad.windowSize;
+for (let i = 0; i < wave.samples.length; i += windowSize) {
+  const thisWindow = wave.samples.subarray(i, i + windowSize);
+  vad.acceptWaveform(thisWindow);
+
+  while (!vad.isEmpty()) {
+    const segment = vad.front();
+    vad.pop();
+
+    let start_time = segment.start / wave.sampleRate;
+    let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+    start_time = start_time.toFixed(2);
+    end_time = end_time.toFixed(2);
+
+    const stream = recognizer.createStream();
+    stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+    recognizer.decode(stream);
+    const r = recognizer.getResult(stream);
+    if (r.text.length > 0) {
+      const text = r.text.toLowerCase().trim();
+      console.log(`${start_time} -- ${end_time}: ${text}`);
+    }
+
+    stream.free();
+  }
+}
+
+vad.flush();
+
+while (!vad.isEmpty()) {
+  const segment = vad.front();
+  vad.pop();
+
+  let start_time = segment.start / wave.sampleRate;
+  let end_time = start_time + segment.samples.length / wave.sampleRate;
+
+  start_time = start_time.toFixed(2);
+  end_time = end_time.toFixed(2);
+
+  const stream = recognizer.createStream();
+  stream.acceptWaveform(wave.sampleRate, segment.samples);
+
+  recognizer.decode(stream);
+  const r = recognizer.getResult(stream);
+  if (r.text.length > 0) {
+    const text = r.text.toLowerCase().trim();
+    console.log(`${start_time} -- ${end_time}: ${text}`);
+  }
+}
+
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+
+vad.free();
+recognizer.free();