Add Flush to VAD so that the last segment can be detected. (#1099)

2024-07-09 16:15:56 +08:00
parent 3e4307e2fb
commit c2cc9dec58
35 changed files with 237 additions and 29 deletions
--- a/.github/workflows/dot-net.yaml
+++ b/.github/workflows/dot-net.yaml
@@ -52,11 +52,6 @@ jobs:
          cmake --build . --target install --config Release
          rm -rf install/pkgconfig
      - uses: actions/upload-artifact@v4
        with:
          name: windows-${{ matrix.arch }}
          path: ./build/install/lib/
      - name: Create tar file
        shell: bash
        run: |
@@ -72,6 +67,11 @@ jobs:
          ls -lh *.tar.bz2
          mv *.tar.bz2 ../
      - uses: actions/upload-artifact@v4
        with:
          name: windows-${{ matrix.arch }}
          path: ./*.tar.bz2
      # https://huggingface.co/docs/hub/spaces-github-actions
      - name: Publish to huggingface
        if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch')
@@ -88,7 +88,9 @@ jobs:
            rm -rf huggingface
            export GIT_CLONE_PROTECTION_ACTIVE=false
-            GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
            git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
            cd huggingface
            mkdir -p windows-for-dotnet
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
 ## 1.10.12
 * Add Flush to VAD so that the last speech segment can be detected. See also
  https://github.com/k2-fsa/sherpa-onnx/discussions/1077#discussioncomment-9979740
 ## 1.10.11
 * Support the iOS platform for Flutter.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ project(sherpa-onnx)
 # Remember to update
 # ./nodejs-addon-examples
 # ./dart-api-examples/
-# ./sherpa-onnx/flutter/CHANGELOG.md
+# ./CHANGELOG.md
-set(SHERPA_ONNX_VERSION "1.10.11")
+set(SHERPA_ONNX_VERSION "1.10.12")
 # Disable warning about
 #
--- a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart
+++ b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart
@@ -93,6 +93,28 @@ void main(List<String> arguments) async {
    }
  }
  vad.flush();
  while (!vad.isEmpty()) {
    final stream = recognizer.createStream();
    final segment = vad.front();
    stream.acceptWaveform(
        samples: segment.samples, sampleRate: waveData.sampleRate);
    recognizer.decode(stream);
    final result = recognizer.getResult(stream);
    final startTime = segment.start * 1.0 / waveData.sampleRate;
    final duration = segment.samples.length * 1.0 / waveData.sampleRate;
    final stopTime = startTime + duration;
    if (result.text != '') {
      print(
          '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}');
    }
    stream.free();
    vad.pop();
  }
  vad.free();
  recognizer.free();
 }
--- a/dart-api-examples/non-streaming-asr/pubspec.yaml
+++ b/dart-api-examples/non-streaming-asr/pubspec.yaml
@@ -10,7 +10,7 @@ environment:
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  path: ^1.9.0
  args: ^2.5.0
--- a/dart-api-examples/streaming-asr/pubspec.yaml
+++ b/dart-api-examples/streaming-asr/pubspec.yaml
@@ -11,7 +11,7 @@ environment:
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  path: ^1.9.0
  args: ^2.5.0
--- a/dart-api-examples/tts/pubspec.yaml
+++ b/dart-api-examples/tts/pubspec.yaml
@@ -8,7 +8,7 @@ environment:
 # Add regular dependencies here.
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  path: ^1.9.0
  args: ^2.5.0
--- a/dart-api-examples/vad/bin/vad.dart
+++ b/dart-api-examples/vad/bin/vad.dart
@@ -65,6 +65,12 @@ void main(List<String> arguments) async {
    }
  }
  vad.flush();
  while (!vad.isEmpty()) {
    allSamples.add(vad.front().samples);
    vad.pop();
  }
  vad.free();
  final s = Float32List.fromList(allSamples.expand((x) => x).toList());
--- a/dart-api-examples/vad/pubspec.yaml
+++ b/dart-api-examples/vad/pubspec.yaml
@@ -9,7 +9,7 @@ environment:
  sdk: ^3.4.0
 dependencies:
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  path: ^1.9.0
  args: ^2.5.0
--- a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs
+++ b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs
@@ -57,6 +57,26 @@ class VadNonStreamingAsrParaformer
        }
      }
    }
    vad.Flush();
    while (!vad.IsEmpty()) {
      SpeechSegment segment = vad.Front();
      float startTime = segment.Start / (float)sampleRate;
      float duration = segment.Samples.Length / (float)sampleRate;
      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(sampleRate, segment.Samples);
      recognizer.Decode(stream);
      String text = stream.Result.Text;
      if (!String.IsNullOrEmpty(text)) {
        Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
            String.Format("{0:0.00}", startTime+duration), text);
      }
      vad.Pop();
    }
  }
 }
--- a/flutter-examples/streaming_asr/pubspec.yaml
+++ b/flutter-examples/streaming_asr/pubspec.yaml
@@ -5,7 +5,7 @@ description: >
 publish_to: 'none'
-version: 1.10.11
+version: 1.10.12
 topics:
  - speech-recognition
@@ -30,7 +30,7 @@ dependencies:
  record: ^5.1.0
  url_launcher: ^6.2.6
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  # sherpa_onnx:
    # path: ../../flutter/sherpa_onnx
--- a/flutter-examples/tts/pubspec.yaml
+++ b/flutter-examples/tts/pubspec.yaml
@@ -17,7 +17,7 @@ dependencies:
  cupertino_icons: ^1.0.6
  path_provider: ^2.1.3
  path: ^1.9.0
-  sherpa_onnx: ^1.10.11
+  sherpa_onnx: ^1.10.12
  url_launcher: ^6.2.6
  audioplayers: ^5.0.0
--- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
+++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
@@ -491,6 +491,12 @@ typedef SherpaOnnxVoiceActivityDetectorResetNative = Void Function(
 typedef SherpaOnnxVoiceActivityDetectorReset = void Function(
    Pointer<SherpaOnnxVoiceActivityDetector>);
 typedef SherpaOnnxVoiceActivityDetectorFlushNative = Void Function(
    Pointer<SherpaOnnxVoiceActivityDetector>);
 typedef SherpaOnnxVoiceActivityDetectorFlush = void Function(
    Pointer<SherpaOnnxVoiceActivityDetector>);
 typedef SherpaOnnxVoiceActivityDetectorFrontNative
    = Pointer<SherpaOnnxSpeechSegment> Function(
        Pointer<SherpaOnnxVoiceActivityDetector>);
@@ -779,6 +785,8 @@ class SherpaOnnxBindings {
  static SherpaOnnxVoiceActivityDetectorReset? voiceActivityDetectorReset;
  static SherpaOnnxVoiceActivityDetectorFlush? voiceActivityDetectorFlush;
  static SherpaOnnxCreateCircularBuffer? createCircularBuffer;
  static SherpaOnnxDestroyCircularBuffer? destroyCircularBuffer;
@@ -1036,6 +1044,11 @@ class SherpaOnnxBindings {
            'SherpaOnnxVoiceActivityDetectorReset')
        .asFunction();
    voiceActivityDetectorFlush ??= dynamicLibrary
        .lookup<NativeFunction<SherpaOnnxVoiceActivityDetectorFlushNative>>(
            'SherpaOnnxVoiceActivityDetectorFlush')
        .asFunction();
    createCircularBuffer ??= dynamicLibrary
        .lookup<NativeFunction<SherpaOnnxCreateCircularBufferNative>>(
            'SherpaOnnxCreateCircularBuffer')
--- a/flutter/sherpa_onnx/lib/src/vad.dart
+++ b/flutter/sherpa_onnx/lib/src/vad.dart
@@ -207,6 +207,10 @@ class VoiceActivityDetector {
    SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr);
  }
  /// Flushes the detector through the native
  /// `SherpaOnnxVoiceActivityDetectorFlush` binding, passing [ptr].
  ///
  /// The binding is stored as a nullable static, so `?.call` makes this a
  /// silent no-op when the function has not been looked up yet.
  void flush() {
    SherpaOnnxBindings.voiceActivityDetectorFlush?.call(ptr);
  }
  Pointer<SherpaOnnxVoiceActivityDetector> ptr;
  final VadModelConfig config;
 }
--- a/flutter/sherpa_onnx/pubspec.yaml
+++ b/flutter/sherpa_onnx/pubspec.yaml
@@ -17,7 +17,7 @@ topics:
  - voice-activity-detection
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
-version: 1.10.11
+version: 1.10.12
 homepage: https://github.com/k2-fsa/sherpa-onnx
@@ -30,19 +30,19 @@ dependencies:
  flutter:
    sdk: flutter
-  sherpa_onnx_android: ^1.10.11
+  sherpa_onnx_android: ^1.10.12
    # path: ../sherpa_onnx_android
-  sherpa_onnx_macos: ^1.10.11
+  sherpa_onnx_macos: ^1.10.12
    # path: ../sherpa_onnx_macos
-  sherpa_onnx_linux: ^1.10.11
+  sherpa_onnx_linux: ^1.10.12
    # path: ../sherpa_onnx_linux
    #
-  sherpa_onnx_windows: ^1.10.11
+  sherpa_onnx_windows: ^1.10.12
    # path: ../sherpa_onnx_windows
-  sherpa_onnx_ios: ^1.10.11
+  sherpa_onnx_ios: ^1.10.12
  # sherpa_onnx_ios:
    # path: ../sherpa_onnx_ios
--- a/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
+++ b/flutter/sherpa_onnx_ios/ios/sherpa_onnx_ios.podspec
@@ -7,7 +7,7 @@
 # https://groups.google.com/g/dart-ffi/c/nUATMBy7r0c
 Pod::Spec.new do |s|
  s.name             = 'sherpa_onnx_ios'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
  s.summary          = 'A new Flutter FFI plugin project.'
  s.description      = <<-DESC
 A new Flutter FFI plugin project.
--- a/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
+++ b/flutter/sherpa_onnx_macos/macos/sherpa_onnx_macos.podspec
@@ -4,7 +4,7 @@
 #
 Pod::Spec.new do |s|
  s.name             = 'sherpa_onnx_macos'
-  s.version          = '1.10.11'
+  s.version          = '1.10.12'
  s.summary          = 'sherpa-onnx Flutter FFI plugin project.'
  s.description      = <<-DESC
 sherpa-onnx Flutter FFI plugin project.
--- a/java-api-examples/VadNonStreamingParaformer.java
+++ b/java-api-examples/VadNonStreamingParaformer.java
@@ -98,6 +98,25 @@ public class VadNonStreamingParaformer {
      }
    }
    vad.flush();
    while (!vad.empty()) {
      SpeechSegment segment = vad.front();
      float startTime = segment.getStart() / 16000.0f;
      float duration = segment.getSamples().length / 16000.0f;
      OfflineStream stream = recognizer.createStream();
      stream.acceptWaveform(segment.getSamples(), 16000);
      recognizer.decode(stream);
      String text = recognizer.getResult(stream).getText();
      stream.release();
      if (!text.isEmpty()) {
        System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
      }
      vad.pop();
    }
    vad.release();
    recognizer.release();
  }
--- a/java-api-examples/VadRemoveSilence.java
+++ b/java-api-examples/VadRemoveSilence.java
@@ -59,6 +59,16 @@ public class VadRemoveSilence {
      }
    }
    vad.flush();
    while (!vad.empty()) {
      // if you want to get the starting time of this segment, you can use
      /* float startTime = vad.front().getStart() / 16000.0f; */
      segments.add(vad.front().getSamples());
      vad.pop();
    }
    // get total number of samples
    int n = 0;
    for (float[] s : segments) {
--- a/nodejs-addon-examples/package.json
+++ b/nodejs-addon-examples/package.json
@@ -1,5 +1,5 @@
 {
  "dependencies": {
-    "sherpa-onnx-node": "^1.10.6"
+    "sherpa-onnx-node": "^1.10.12"
  }
 }
--- a/python-api-examples/vad-remove-non-speech-segments-from-file.py
+++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -105,6 +105,12 @@ def main():
            speech_samples.extend(vad.front.samples)
            vad.pop()
    vad.flush()
    while not vad.empty():
        speech_samples.extend(vad.front.samples)
        vad.pop()
    speech_samples = np.array(speech_samples, dtype=np.float32)
    sf.write(args.output, speech_samples, samplerate=sample_rate)
--- a/scripts/dart/sherpa-onnx-pubspec.yaml
+++ b/scripts/dart/sherpa-onnx-pubspec.yaml
@@ -17,7 +17,7 @@ topics:
  - voice-activity-detection
 # remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
-version: 1.10.6
+version: 1.10.12
 homepage: https://github.com/k2-fsa/sherpa-onnx
--- a/scripts/dotnet/VoiceActivityDetector.cs
+++ b/scripts/dotnet/VoiceActivityDetector.cs
@@ -53,6 +53,11 @@ namespace SherpaOnnx
            SherpaOnnxVoiceActivityDetectorReset(_handle.Handle);
        }
        /// <summary>
        /// Flushes the native detector by forwarding the wrapped handle to
        /// <c>SherpaOnnxVoiceActivityDetectorFlush</c>.
        /// </summary>
        public void Flush()
        {
            SherpaOnnxVoiceActivityDetectorFlush(_handle.Handle);
        }
        public void Dispose()
        {
            Cleanup();
@@ -106,5 +111,7 @@ namespace SherpaOnnx
        [DllImport(Dll.Filename)]
        private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle);
        [DllImport(Dll.Filename)]
        private static extern void SherpaOnnxVoiceActivityDetectorFlush(IntPtr handle);
    }
 }
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -856,6 +856,10 @@ func (vad *VoiceActivityDetector) Reset() {
 	C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
 // Flush forwards to the C function SherpaOnnxVoiceActivityDetectorFlush,
 // passing the detector's underlying C handle (vad.impl).
 func (vad *VoiceActivityDetector) Flush() {
 	C.SherpaOnnxVoiceActivityDetectorFlush(vad.impl)
 }
 // Spoken language identification
 type SpokenLanguageIdentificationWhisperConfig struct {
--- a/scripts/node-addon-api/lib/vad.js
+++ b/scripts/node-addon-api/lib/vad.js
@@ -29,7 +29,7 @@ class CircularBuffer {
  }
  reset() {
-    return addon.circularBufferReset(this.handle);
+    addon.circularBufferReset(this.handle);
  }
 }
@@ -79,7 +79,11 @@ config = {
  }
  reset() {
-    return addon.VoiceActivityDetectorResetWrapper(this.handle);
+    addon.VoiceActivityDetectorResetWrapper(this.handle);
  }
  flush() {
    addon.VoiceActivityDetectorFlushWrapper(this.handle);
  }
 }
--- a/scripts/node-addon-api/src/vad.cc
+++ b/scripts/node-addon-api/src/vad.cc
@@ -590,6 +590,31 @@ static void VoiceActivityDetectorResetWrapper(const Napi::CallbackInfo &info) {
  SherpaOnnxVoiceActivityDetectorReset(vad);
 }
 // N-API binding for flushing a VAD: checks that exactly one argument was
 // passed and that it is an external handle, then forwards the wrapped pointer
 // to SherpaOnnxVoiceActivityDetectorFlush(). On invalid input a JavaScript
 // TypeError is raised and the native call is skipped.
 static void VoiceActivityDetectorFlushWrapper(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  const size_t num_args = info.Length();
  if (num_args != 1) {
    std::ostringstream msg;
    msg << "Expect only 1 argument. Given: " << num_args;
    Napi::TypeError::New(env, msg.str()).ThrowAsJavaScriptException();
    return;
  }

  Napi::Value handle = info[0];
  if (!handle.IsExternal()) {
    Napi::TypeError::New(env, "Argument 0 should be a VAD pointer.")
        .ThrowAsJavaScriptException();
    return;
  }

  auto external = handle.As<Napi::External<SherpaOnnxVoiceActivityDetector>>();
  SherpaOnnxVoiceActivityDetectorFlush(external.Data());
 }
 void InitVad(Napi::Env env, Napi::Object exports) {
  exports.Set(Napi::String::New(env, "createCircularBuffer"),
              Napi::Function::New(env, CreateCircularBufferWrapper));
@@ -636,4 +661,7 @@ void InitVad(Napi::Env env, Napi::Object exports) {
  exports.Set(Napi::String::New(env, "voiceActivityDetectorReset"),
              Napi::Function::New(env, VoiceActivityDetectorResetWrapper));
  exports.Set(Napi::String::New(env, "voiceActivityDetectorFlush"),
              Napi::Function::New(env, VoiceActivityDetectorFlushWrapper));
 }
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Reset();
 }
 // C API entry point for flushing a VAD: forwards to Flush() on the wrapped
 // implementation object (p->impl). `p` is dereferenced without a null check.
 void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
  p->impl->Flush();
 }
 #if SHERPA_ONNX_ENABLE_TTS == 1
 struct SherpaOnnxOfflineTts {
  std::unique_ptr<sherpa_onnx::OfflineTts> impl;
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
    SherpaOnnxVoiceActivityDetector *p);
 // Flush the voice activity detector `p`. Intended to be called once no more
 // samples will be fed, so that the last speech segment can be detected.
 SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
    SherpaOnnxVoiceActivityDetector *p);
 // ============================================================
 // For offline Text-to-Speech (i.e., non-streaming TTS)
 // ============================================================
--- a/sherpa-onnx/csrc/voice-activity-detector.cc
+++ b/sherpa-onnx/csrc/voice-activity-detector.cc
@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
    start_ = -1;
  }
  void Flush() {
    if (start_ == -1 || buffer_.Size() == 0) {
      return;
    }
    int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
    if (end <= start_) {
      return;
    }
    std::vector<float> s = buffer_.Get(start_, end - start_);
    SpeechSegment segment;
    segment.start = start_;
    segment.samples = std::move(s);
    segments_.push(std::move(segment));
    buffer_.Pop(end - buffer_.Head());
    start_ = -1;
  }
  bool IsSpeechDetected() const { return start_ != -1; }
  const VadModelConfig &GetConfig() const { return config_; }
@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
  return impl_->Front();
 }
-void VoiceActivityDetector::Reset() { impl_->Reset(); }
+void VoiceActivityDetector::Reset() const { impl_->Reset(); }
 void VoiceActivityDetector::Flush() const { impl_->Flush(); }
 bool VoiceActivityDetector::IsSpeechDetected() const {
  return impl_->IsSpeechDetected();
--- a/sherpa-onnx/csrc/voice-activity-detector.h
+++ b/sherpa-onnx/csrc/voice-activity-detector.h
@@ -41,7 +41,11 @@ class VoiceActivityDetector {
  bool IsSpeechDetected() const;
-  void Reset();
+  void Reset() const;
  // At the end of the utterance, you can invoke this method so that
  // the last speech segment can be detected.
  void Flush() const;
  const VadModelConfig &GetConfig() const;
--- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java
+++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/Vad.java
@@ -46,6 +46,10 @@ public class Vad {
        reset(this.ptr);
    }
    /**
     * Flushes the native detector by passing {@code ptr} to the native
     * {@code flush(long)} method.
     */
    public void flush() {
        flush(this.ptr);
    }
    public SpeechSegment front() {
        Object[] arr = front(this.ptr);
        int start = (int) arr[0];
@@ -75,4 +79,6 @@ public class Vad {
    private native boolean isSpeechDetected(long ptr);
    private native void reset(long ptr);
    private native void flush(long ptr);
 }
--- a/sherpa-onnx/jni/voice-activity-detector.cc
+++ b/sherpa-onnx/jni/voice-activity-detector.cc
@@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
  auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
  model->Reset();
 }
 // JNI entry point for com.k2fsa.sherpa.onnx.Vad.flush(long ptr): treats `ptr`
 // as a sherpa_onnx::VoiceActivityDetector* and calls Flush() on it.
 SHERPA_ONNX_EXTERN_C
 JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
                                                             jobject /*obj*/,
                                                             jlong ptr) {
  reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr)->Flush();
 }
--- a/sherpa-onnx/kotlin-api/Vad.kt
+++ b/sherpa-onnx/kotlin-api/Vad.kt
@@ -52,6 +52,8 @@ class Vad(
    fun reset() = reset(ptr)
    /** Flushes the native detector by passing [ptr] to the external `flush` function. */
    fun flush() = flush(ptr)
    private external fun delete(ptr: Long)
    private external fun newFromAsset(
@@ -70,6 +72,7 @@ class Vad(
    private external fun front(ptr: Long): Array<Any>
    private external fun isSpeechDetected(ptr: Long): Boolean
    private external fun reset(ptr: Long)
    private external fun flush(ptr: Long)
    companion object {
        init {
--- a/sherpa-onnx/python/csrc/voice-activity-detector.cc
+++ b/sherpa-onnx/python/csrc/voice-activity-detector.cc
@@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
      .def("is_speech_detected", &PyClass::IsSpeechDetected,
           py::call_guard<py::gil_scoped_release>())
      .def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
      .def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
      .def_property_readonly("front", &PyClass::Front);
 }
--- a/swift-api-examples/SherpaOnnx.swift
+++ b/swift-api-examples/SherpaOnnx.swift
@@ -633,6 +633,10 @@ class SherpaOnnxVoiceActivityDetectorWrapper {
  func reset() {
    SherpaOnnxVoiceActivityDetectorReset(vad)
  }
  /// Flushes the detector by calling `SherpaOnnxVoiceActivityDetectorFlush`
  /// with the wrapped `vad` handle.
  func flush() {
    SherpaOnnxVoiceActivityDetectorFlush(vad)
  }
 }
 // offline tts