Add VAD + Non-streaming ASR example for JavaScript API. (#1170)
This commit is contained in:
13
.github/scripts/test-nodejs-addon-npm.sh
vendored
13
.github/scripts/test-nodejs-addon-npm.sh
vendored
@@ -10,6 +10,19 @@ arch=$(node -p "require('os').arch()")
|
||||
platform=$(node -p "require('os').platform()")
|
||||
node_version=$(node -p "process.versions.node.split('.')[0]")
|
||||
|
||||
echo "----------non-streaming asr + vad----------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
rm sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
|
||||
node ./test_vad_with_non_streaming_asr_whisper.js
|
||||
rm -rf sherpa-onnx-whisper*
|
||||
rm *.wav
|
||||
rm *.onnx
|
||||
|
||||
echo "----------asr----------"
|
||||
|
||||
if [[ $arch != "ia32" && $platform != "win32" ]]; then
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -112,3 +112,4 @@ sherpa-onnx-telespeech-ctc-*
|
||||
.ccache
|
||||
lib*.a
|
||||
sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
|
||||
*.bak
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
## 1.10.18
|
||||
|
||||
* Fix the case when recognition results contain the symbol `"`. It caused
|
||||
issues when converting results to a json string.
|
||||
|
||||
## 1.10.17
|
||||
|
||||
* Support SenseVoice CTC models.
|
||||
|
||||
@@ -11,7 +11,7 @@ project(sherpa-onnx)
|
||||
# ./nodejs-addon-examples
|
||||
# ./dart-api-examples/
|
||||
# ./CHANGELOG.md
|
||||
set(SHERPA_ONNX_VERSION "1.10.17")
|
||||
set(SHERPA_ONNX_VERSION "1.10.18")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
|
||||
@@ -9,7 +9,7 @@ environment:
|
||||
sdk: ^3.4.0
|
||||
|
||||
dependencies:
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
# sherpa_onnx:
|
||||
# path: ../../flutter/sherpa_onnx
|
||||
path: ^1.9.0
|
||||
|
||||
@@ -10,7 +10,7 @@ environment:
|
||||
|
||||
# Add regular dependencies here.
|
||||
dependencies:
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
path: ^1.9.0
|
||||
args: ^2.5.0
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ environment:
|
||||
|
||||
# Add regular dependencies here.
|
||||
dependencies:
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
path: ^1.9.0
|
||||
args: ^2.5.0
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ environment:
|
||||
|
||||
# Add regular dependencies here.
|
||||
dependencies:
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
path: ^1.9.0
|
||||
args: ^2.5.0
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ environment:
|
||||
sdk: ^3.4.0
|
||||
|
||||
dependencies:
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
path: ^1.9.0
|
||||
args: ^2.5.0
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ description: >
|
||||
|
||||
publish_to: 'none'
|
||||
|
||||
version: 1.10.17
|
||||
version: 1.10.18
|
||||
|
||||
topics:
|
||||
- speech-recognition
|
||||
@@ -30,7 +30,7 @@ dependencies:
|
||||
record: ^5.1.0
|
||||
url_launcher: ^6.2.6
|
||||
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
# sherpa_onnx:
|
||||
# path: ../../flutter/sherpa_onnx
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ description: >
|
||||
|
||||
publish_to: 'none' # Remove this line if you wish to publish to pub.dev
|
||||
|
||||
version: 1.10.17
|
||||
version: 1.10.18
|
||||
|
||||
environment:
|
||||
sdk: '>=3.4.0 <4.0.0'
|
||||
@@ -17,7 +17,7 @@ dependencies:
|
||||
cupertino_icons: ^1.0.6
|
||||
path_provider: ^2.1.3
|
||||
path: ^1.9.0
|
||||
sherpa_onnx: ^1.10.17
|
||||
sherpa_onnx: ^1.10.18
|
||||
url_launcher: ^6.2.6
|
||||
audioplayers: ^5.0.0
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ topics:
|
||||
- voice-activity-detection
|
||||
|
||||
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
|
||||
version: 1.10.17
|
||||
version: 1.10.18
|
||||
|
||||
homepage: https://github.com/k2-fsa/sherpa-onnx
|
||||
|
||||
@@ -30,23 +30,23 @@ dependencies:
|
||||
flutter:
|
||||
sdk: flutter
|
||||
|
||||
sherpa_onnx_android: ^1.10.17
|
||||
sherpa_onnx_android: ^1.10.18
|
||||
# sherpa_onnx_android:
|
||||
# path: ../sherpa_onnx_android
|
||||
|
||||
sherpa_onnx_macos: ^1.10.17
|
||||
sherpa_onnx_macos: ^1.10.18
|
||||
# sherpa_onnx_macos:
|
||||
# path: ../sherpa_onnx_macos
|
||||
|
||||
sherpa_onnx_linux: ^1.10.17
|
||||
sherpa_onnx_linux: ^1.10.18
|
||||
# sherpa_onnx_linux:
|
||||
# path: ../sherpa_onnx_linux
|
||||
#
|
||||
sherpa_onnx_windows: ^1.10.17
|
||||
sherpa_onnx_windows: ^1.10.18
|
||||
# sherpa_onnx_windows:
|
||||
# path: ../sherpa_onnx_windows
|
||||
|
||||
sherpa_onnx_ios: ^1.10.17
|
||||
sherpa_onnx_ios: ^1.10.18
|
||||
# sherpa_onnx_ios:
|
||||
# path: ../sherpa_onnx_ios
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
# https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
|
||||
Pod::Spec.new do |s|
|
||||
s.name = 'sherpa_onnx_ios'
|
||||
s.version = '1.10.17'
|
||||
s.version = '1.10.18'
|
||||
s.summary = 'A new Flutter FFI plugin project.'
|
||||
s.description = <<-DESC
|
||||
A new Flutter FFI plugin project.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#
|
||||
Pod::Spec.new do |s|
|
||||
s.name = 'sherpa_onnx_macos'
|
||||
s.version = '1.10.17'
|
||||
s.version = '1.10.18'
|
||||
s.summary = 'sherpa-onnx Flutter FFI plugin project.'
|
||||
s.description = <<-DESC
|
||||
sherpa-onnx Flutter FFI plugin project.
|
||||
|
||||
@@ -93,6 +93,7 @@ The following tables list the examples in this folder.
|
||||
|---|---|
|
||||
|[./test_asr_non_streaming_transducer.js](./test_asr_non_streaming_transducer.js)|Non-streaming speech recognition from a file with a Zipformer transducer model|
|
||||
|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
|
||||
|[./test_vad_with_non_streaming_asr_whisper.js](./test_vad_with_non_streaming_asr_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper) + [Silero VAD](https://github.com/snakers4/silero-vad)|
|
||||
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|
||||
|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
|
||||
@@ -221,11 +222,24 @@ rm sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
|
||||
node ./test_asr_non_streaming_whisper.js
|
||||
|
||||
# To run VAD + non-streaming ASR with Paraformer using a microphone
|
||||
# To run VAD + non-streaming ASR with Whisper using a microphone
|
||||
npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_whisper_microphone.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with Whisper + VAD
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
rm sherpa-onnx-whisper-tiny.en.tar.bz2
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
|
||||
node ./test_vad_with_non_streaming_asr_whisper.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with NeMo CTC models
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"sherpa-onnx-node": "^1.10.17"
|
||||
"sherpa-onnx-node": "^1.10.18"
|
||||
}
|
||||
}
|
||||
|
||||
127
nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
Normal file
127
nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js
Normal file
@@ -0,0 +1,127 @@
|
||||
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx-node');
|
||||
|
||||
// Build a non-streaming (offline) speech recognizer backed by the
// Whisper tiny.en int8 model.
//
// Please download the model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
// and untar sherpa-onnx-whisper-tiny.en.tar.bz2 next to this script.
function createRecognizer() {
  // Feature extraction must match what Whisper was trained with:
  // 16 kHz audio and 80-dim fbank features.
  const recognizerConfig = {
    'featConfig': {
      'sampleRate': 16000,
      'featureDim': 80,
    },
    'modelConfig': {
      'whisper': {
        'encoder': './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx',
        'decoder': './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx',
      },
      'tokens': './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt',
      'numThreads': 2,
      'provider': 'cpu',
      'debug': 1,
    }
  };

  return new sherpa_onnx.OfflineRecognizer(recognizerConfig);
}
|
||||
|
||||
// Build a Silero voice-activity detector.
//
// Please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
function createVad() {
  const vadConfig = {
    sileroVad: {
      model: './silero_vad.onnx',
      threshold: 0.5,            // speech probability threshold
      minSpeechDuration: 0.25,   // seconds; shorter bursts are dropped
      minSilenceDuration: 0.5,   // seconds of silence that ends a segment
      windowSize: 512,           // samples fed to the model per call
    },
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  // Internal circular buffer capacity, in seconds of audio.
  const bufferSizeInSeconds = 60;

  return new sherpa_onnx.Vad(vadConfig, bufferSizeInSeconds);
}
|
||||
|
||||
// Entry point: run Silero VAD over a wave file and decode every detected
// speech segment with the non-streaming Whisper recognizer, printing the
// time range and recognized text of each segment, then report timing.
const recognizer = createRecognizer();
const vad = createVad();

// please download ./Obama.wav from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const waveFilename = './Obama.wav';
const wave = sherpa_onnx.readWave(waveFilename);

if (wave.sampleRate != recognizer.config.featConfig.sampleRate) {
  // Fix: the original used single quotes here, so the ${...} placeholders
  // were printed literally. Interpolation requires a backtick template
  // literal.
  throw new Error(
      `Expected sample rate: ${recognizer.config.featConfig.sampleRate}. Given: ${
          wave.sampleRate}`);
}

console.log('Started')
let start = Date.now();

// Decode one finalized VAD segment and print `start -- end: text`.
// Extracted to remove the verbatim duplication between the per-window
// loop and the post-flush drain below.
function decodeSegment(segment) {
  let start_time = segment.start / wave.sampleRate;
  let end_time = start_time + segment.samples.length / wave.sampleRate;

  start_time = start_time.toFixed(2);
  end_time = end_time.toFixed(2);

  const stream = recognizer.createStream();
  stream.acceptWaveform(
      {samples: segment.samples, sampleRate: wave.sampleRate});

  recognizer.decode(stream);
  const r = recognizer.getResult(stream);
  if (r.text.length > 0) {
    const text = r.text.toLowerCase().trim();
    console.log(`${start_time} -- ${end_time}: ${text}`);
  }
}

// Feed the waveform to the VAD one window at a time; subarray() is a view,
// so no audio is copied here. Drain and decode any segments the VAD has
// finalized after each window.
const windowSize = vad.config.sileroVad.windowSize;
for (let i = 0; i < wave.samples.length; i += windowSize) {
  const thisWindow = wave.samples.subarray(i, i + windowSize);
  vad.acceptWaveform(thisWindow);

  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    decodeSegment(segment);
  }
}

// Flush so a trailing speech segment that never hit minSilenceDuration is
// still emitted, then drain it the same way.
vad.flush();

while (!vad.isEmpty()) {
  const segment = vad.front();
  vad.pop();
  decodeSegment(segment);
}

let stop = Date.now();
console.log('Done')

const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
// Fix: corrected the "secodns" typo in the two lines below.
console.log('Wave duration', duration.toFixed(3), 'seconds')
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3))
|
||||
@@ -9,7 +9,7 @@ environment:
|
||||
sdk: ^3.4.0
|
||||
|
||||
dependencies:
|
||||
# sherpa_onnx: ^1.10.17
|
||||
# sherpa_onnx: ^1.10.18
|
||||
sherpa_onnx:
|
||||
path: ../../flutter/sherpa_onnx
|
||||
path: ^1.9.0
|
||||
|
||||
@@ -17,7 +17,7 @@ topics:
|
||||
- voice-activity-detection
|
||||
|
||||
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
|
||||
version: 1.10.17
|
||||
version: 1.10.18
|
||||
|
||||
homepage: https://github.com/k2-fsa/sherpa-onnx
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ config = {
|
||||
}
|
||||
|
||||
clear() {
|
||||
addon.VoiceActivityDetectorClearWrapper(this.handle);
|
||||
addon.voiceActivityDetectorClear(this.handle);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -79,11 +79,11 @@ config = {
|
||||
}
|
||||
|
||||
reset() {
|
||||
addon.VoiceActivityDetectorResetWrapper(this.handle);
|
||||
addon.voiceActivityDetectorReset(this.handle);
|
||||
}
|
||||
|
||||
flush() {
|
||||
addon.VoiceActivityDetectorFlushWrapper(this.handle);
|
||||
addon.voiceActivityDetectorFlush(this.handle);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -306,8 +306,7 @@ std::string OfflineRecognitionResult::AsJsonString() const {
|
||||
os << "{";
|
||||
os << "\"text\""
|
||||
<< ": ";
|
||||
os << "\"" << text << "\""
|
||||
<< ", ";
|
||||
os << std::quoted(text) << ", ";
|
||||
|
||||
os << "\""
|
||||
<< "timestamps"
|
||||
@@ -339,7 +338,7 @@ std::string OfflineRecognitionResult::AsJsonString() const {
|
||||
<< "\"";
|
||||
os.flags(oldFlags);
|
||||
} else {
|
||||
os << sep << "\"" << t << "\"";
|
||||
os << sep << std::quoted(t);
|
||||
}
|
||||
sep = ", ";
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ std::string VecToString<std::string>(const std::vector<std::string> &vec,
|
||||
oss << "[";
|
||||
std::string sep = "";
|
||||
for (const auto &item : vec) {
|
||||
oss << sep << "\"" << item << "\"";
|
||||
oss << sep << std::quoted(item);
|
||||
sep = ", ";
|
||||
}
|
||||
oss << "]";
|
||||
@@ -54,9 +54,7 @@ std::string VecToString<std::string>(const std::vector<std::string> &vec,
|
||||
std::string OnlineRecognizerResult::AsJsonString() const {
|
||||
std::ostringstream os;
|
||||
os << "{ ";
|
||||
os << "\"text\": "
|
||||
<< "\"" << text << "\""
|
||||
<< ", ";
|
||||
os << "\"text\": " << std::quoted(text) << ", ";
|
||||
os << "\"tokens\": " << VecToString(tokens) << ", ";
|
||||
os << "\"timestamps\": " << VecToString(timestamps, 2) << ", ";
|
||||
os << "\"ys_probs\": " << VecToString(ys_probs, 6) << ", ";
|
||||
|
||||
Reference in New Issue
Block a user