Add Flush to VAD so that the last segment can be detected. (#1099)

2024-07-09 16:15:56 +08:00
parent 3e4307e2fb
commit c2cc9dec58
35 changed files with 237 additions and 29 deletions
--- a/sherpa-onnx/csrc/voice-activity-detector.cc
+++ b/sherpa-onnx/csrc/voice-activity-detector.cc
@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
    start_ = -1;
  }

+  void Flush() {  // Queues the speech segment still in progress, if any.
+    if (start_ == -1 || buffer_.Size() == 0) {
+      return;  // No speech in progress (start_ == -1) or empty buffer.
+    }
+    // The segment ends MinSilenceDurationSamples() before the buffer tail.
+    int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
+    if (end <= start_) {
+      return;  // Nothing left after trimming; start_ is left unchanged.
+    }
+    // NOTE(review): assumes Get(start, n) copies n samples from `start`.
+    std::vector<float> s = buffer_.Get(start_, end - start_);
+
+    SpeechSegment segment;
+
+    segment.start = start_;
+    segment.samples = std::move(s);
+
+    segments_.push(std::move(segment));
+    // NOTE(review): presumably drops all buffered samples before `end`.
+    buffer_.Pop(end - buffer_.Head());
+    start_ = -1;  // IsSpeechDetected() is false again.
+  }
+
  bool IsSpeechDetected() const { return start_ != -1; }

  const VadModelConfig &GetConfig() const { return config_; }
@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
  return impl_->Front();
 }

-void VoiceActivityDetector::Reset() { impl_->Reset(); }
+void VoiceActivityDetector::Reset() const { impl_->Reset(); }
+// Like Reset() above, const-qualified; Impl::Flush() mutates Impl state.
+void VoiceActivityDetector::Flush() const { impl_->Flush(); }

 bool VoiceActivityDetector::IsSpeechDetected() const {
  return impl_->IsSpeechDetected();
--- a/sherpa-onnx/csrc/voice-activity-detector.h
+++ b/sherpa-onnx/csrc/voice-activity-detector.h
@@ -41,7 +41,11 @@ class VoiceActivityDetector {

  bool IsSpeechDetected() const;

-  void Reset();
+  void Reset() const;
+
+  // At the end of the utterance, call this so that speech still in progress
+  // can be queued as the last segment. No-op if no speech is in progress.
+  void Flush() const;

  const VadModelConfig &GetConfig() const;