// Patch summary (free-text preamble placed before the first "diff --git" header).
//
// Purpose: let VoiceActivityDetector change the VAD model's minimum-silence
// duration and threshold at run time, and use that to switch to a second set
// of parameters once the detector's internal buffer has grown past a limit.
//
// Files touched and what each hunk does:
//   * vad-model.h
//       Adds two pure-virtual setters to VadModel:
//         SetMinSilenceDuration(float s) and SetThreshold(float threshold).
//   * silero-vad-model.h / silero-vad-model.cc
//       SileroVadModel overrides both setters and forwards them to its Impl.
//       Impl::SetMinSilenceDuration stores sample_rate_ * s into
//       min_silence_samples_; Impl::SetThreshold overwrites
//       config_.silero_vad.threshold.
//   * voice-activity-detector.cc
//       At the top of every Impl::AcceptWaveform call:
//         - if buffer_.Size() > max_utterance_length_, the model is given
//           new_min_silence_duration_s_ and new_threshold_;
//         - otherwise the model is reset to config_.silero_vad.min_silence_duration
//           and config_.silero_vad.threshold.
//       New members: max_utterance_length_ = 16000 * 20 (annotated "in samples"),
//       new_min_silence_duration_s_ = 0.1, new_threshold_ = 1.10.
//
// NOTE(review): both setters are pure virtual, so every other VadModel subclass
//   (none are visible in this patch) would have to implement them - confirm.
// NOTE(review): 16000 * 20 looks like 20 s at a 16 kHz sample rate; presumably
//   it should follow the configured sample rate rather than a literal - confirm.
// NOTE(review): new_threshold_ = 1.10 - if the threshold is compared against a
//   probability in [0, 1], nothing can exceed it; presumably this is meant to
//   force the current segment to end - confirm and document at the member.
// NOTE(review): sample_rate_ * s is a float product assigned to
//   min_silence_samples_ (its type is not shown; presumably integral, so the
//   value would be truncated implicitly) - confirm.
// NOTE(review): the else branch re-applies the configured values on every
//   AcceptWaveform call, including calls where nothing has changed.
diff --git a/sherpa-onnx/csrc/silero-vad-model.cc b/sherpa-onnx/csrc/silero-vad-model.cc index 1f8957d4..a0c1e6c5 100644 --- a/sherpa-onnx/csrc/silero-vad-model.cc +++ b/sherpa-onnx/csrc/silero-vad-model.cc @@ -190,6 +190,14 @@ class SileroVadModel::Impl { int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } + void SetMinSilenceDuration(float s) { + min_silence_samples_ = sample_rate_ * s; + } + + void SetThreshold(float threshold) { + config_.silero_vad.threshold = threshold; + } + private: void Init(void *model_data, size_t model_data_length) { sess_ = std::make_unique(env_, model_data, model_data_length, @@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { return impl_->MinSpeechDurationSamples(); } +void SileroVadModel::SetMinSilenceDuration(float s) { + impl_->SetMinSilenceDuration(s); +} + +void SileroVadModel::SetThreshold(float threshold) { + impl_->SetThreshold(threshold); +} + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/silero-vad-model.h b/sherpa-onnx/csrc/silero-vad-model.h index 9539890a..d83e6832 100644 --- a/sherpa-onnx/csrc/silero-vad-model.h +++ b/sherpa-onnx/csrc/silero-vad-model.h @@ -42,6 +42,9 @@ class SileroVadModel : public VadModel { int32_t MinSilenceDurationSamples() const override; int32_t MinSpeechDurationSamples() const override; + void SetMinSilenceDuration(float s) override; + void SetThreshold(float threshold) override; + private: class Impl; std::unique_ptr impl_; diff --git a/sherpa-onnx/csrc/vad-model.h b/sherpa-onnx/csrc/vad-model.h index 8131b6af..f3b2aab0 100644 --- a/sherpa-onnx/csrc/vad-model.h +++ b/sherpa-onnx/csrc/vad-model.h @@ -42,6 +42,8 @@ class VadModel { virtual int32_t MinSilenceDurationSamples() const = 0; virtual int32_t MinSpeechDurationSamples() const = 0; + virtual void SetMinSilenceDuration(float s) = 0; + virtual void SetThreshold(float threshold) = 0; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc 
b/sherpa-onnx/csrc/voice-activity-detector.cc index 5f63acf1..9b2b1b87 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl { #endif void AcceptWaveform(const float *samples, int32_t n) { + if (buffer_.Size() > max_utterance_length_) { + model_->SetMinSilenceDuration(new_min_silence_duration_s_); + model_->SetThreshold(new_threshold_); + } else { + model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration); + model_->SetThreshold(config_.silero_vad.threshold); + } + int32_t window_size = model_->WindowSize(); // note n is usually window_size and there is no need to use @@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl { CircularBuffer buffer_; std::vector last_; + int max_utterance_length_ = 16000 * 20; // in samples + float new_min_silence_duration_s_ = 0.1; + float new_threshold_ = 1.10; + int32_t start_ = -1; };