Add Flush to VAD so that the last segment can be detected. (#1099)

This commit is contained in:
Fangjun Kuang
2024-07-09 16:15:56 +08:00
committed by GitHub
parent 3e4307e2fb
commit c2cc9dec58
35 changed files with 237 additions and 29 deletions

View File

@@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Reset();
}
// C-API wrapper: forwards to VoiceActivityDetector::Flush() so that any
// speech still buffered at the end of an utterance is emitted as a final
// segment (instead of waiting for trailing silence that never arrives).
void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Flush();
}
#if SHERPA_ONNX_ENABLE_TTS == 1
struct SherpaOnnxOfflineTts {
std::unique_ptr<sherpa_onnx::OfflineTts> impl;

View File

@@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
// Resets the detector's internal state, discarding any buffered audio.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
SherpaOnnxVoiceActivityDetector *p);
// At the end of an utterance, call this so that the last, still-buffered
// speech segment can be detected and retrieved.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
SherpaOnnxVoiceActivityDetector *p);
// ============================================================
// For offline Text-to-Speech (i.e., non-streaming TTS)
// ============================================================

View File

@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
start_ = -1;
}
void Flush() {
if (start_ == -1 || buffer_.Size() == 0) {
return;
}
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
if (end <= start_) {
return;
}
std::vector<float> s = buffer_.Get(start_, end - start_);
SpeechSegment segment;
segment.start = start_;
segment.samples = std::move(s);
segments_.push(std::move(segment));
buffer_.Pop(end - buffer_.Head());
start_ = -1;
}
// True while a speech segment has started (start_ set) and not yet finalized.
bool IsSpeechDetected() const { return start_ != -1; }
// Read-only access to the configuration this detector was constructed with.
const VadModelConfig &GetConfig() const { return config_; }
@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
return impl_->Front();
}
// NOTE(review): both a non-const and a const Reset() definition appear below —
// this looks like the old/new pair of a rendered diff; confirm only the const
// overload exists in the actual file.
void VoiceActivityDetector::Reset() { impl_->Reset(); }
void VoiceActivityDetector::Reset() const { impl_->Reset(); }
// Forwards to Impl::Flush() so the last buffered speech segment is emitted.
void VoiceActivityDetector::Flush() const { impl_->Flush(); }
bool VoiceActivityDetector::IsSpeechDetected() const {
return impl_->IsSpeechDetected();

View File

@@ -41,7 +41,11 @@ class VoiceActivityDetector {
bool IsSpeechDetected() const;
// NOTE(review): both a non-const and a const Reset() are declared below —
// likely the old/new lines of a rendered diff; confirm only one remains.
void Reset();
void Reset() const;
// At the end of the utterance, you can invoke this method so that
// the last speech segment can be detected.
void Flush() const;
const VadModelConfig &GetConfig() const;

View File

@@ -46,6 +46,10 @@ public class Vad {
reset(this.ptr);
}
/**
 * Flushes the native detector so that speech still buffered at the end of
 * an utterance is emitted as a final segment.
 */
public void flush() {
flush(this.ptr);
}
public SpeechSegment front() {
Object[] arr = front(this.ptr);
int start = (int) arr[0];
@@ -75,4 +79,6 @@ public class Vad {
private native boolean isSpeechDetected(long ptr);
private native void reset(long ptr);
// Native counterpart of flush(); implemented in the JNI layer as
// Java_com_k2fsa_sherpa_onnx_Vad_flush.
private native void flush(long ptr);
}

View File

@@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
model->Reset();
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  // `ptr` carries the address of the native detector owned by the Java Vad
  // object; forward the call so buffered trailing speech becomes a segment.
  reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr)->Flush();
}

View File

@@ -52,6 +52,8 @@ class Vad(
// Resets the native detector state.
fun reset() = reset(ptr)
// Flushes buffered audio so the last speech segment can be detected.
fun flush() = flush(ptr)
private external fun delete(ptr: Long)
private external fun newFromAsset(
@@ -70,6 +72,7 @@ class Vad(
private external fun front(ptr: Long): Array<Any>
private external fun isSpeechDetected(ptr: Long): Boolean
private external fun reset(ptr: Long)
// JNI binding for flush(); implemented natively as
// Java_com_k2fsa_sherpa_onnx_Vad_flush.
private external fun flush(ptr: Long)
companion object {
init {

View File

@@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
.def("is_speech_detected", &PyClass::IsSpeechDetected,
py::call_guard<py::gil_scoped_release>())
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
.def_property_readonly("front", &PyClass::Front);
}