Fix for silero vad v5. (#1065)
For 16kHz audio, the silero vad v5 network input is 64 + 512 = 576 samples (a 64-sample overlap with the previous window plus 512 new samples) instead of 512 samples.
This commit is contained in:
@@ -74,9 +74,8 @@ class SileroVadModel::Impl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool IsSpeech(const float *samples, int32_t n) {
|
bool IsSpeech(const float *samples, int32_t n) {
|
||||||
if (n != config_.silero_vad.window_size) {
|
if (n != WindowSize()) {
|
||||||
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
|
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
|
||||||
config_.silero_vad.window_size);
|
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,9 +145,11 @@ class SileroVadModel::Impl {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t WindowSize() const { return config_.silero_vad.window_size; }
|
int32_t WindowShift() const { return config_.silero_vad.window_size; }
|
||||||
|
|
||||||
int32_t WindowShift() const { return WindowSize() - window_shift_; }
|
int32_t WindowSize() const {
|
||||||
|
return config_.silero_vad.window_size + window_overlap_;
|
||||||
|
}
|
||||||
|
|
||||||
int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
|
int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }
|
||||||
|
|
||||||
@@ -177,9 +178,9 @@ class SileroVadModel::Impl {
|
|||||||
|
|
||||||
// 64 for 16kHz
|
// 64 for 16kHz
|
||||||
// 32 for 8kHz
|
// 32 for 8kHz
|
||||||
window_shift_ = 64;
|
window_overlap_ = 64;
|
||||||
|
|
||||||
if (WindowSize() != 512) {
|
if (config_.silero_vad.window_size != 512) {
|
||||||
SHERPA_ONNX_LOGE(
|
SHERPA_ONNX_LOGE(
|
||||||
"For silero_vad v5, we require window_size to be 512 for 16kHz");
|
"For silero_vad v5, we require window_size to be 512 for 16kHz");
|
||||||
exit(-1);
|
exit(-1);
|
||||||
@@ -423,7 +424,7 @@ class SileroVadModel::Impl {
|
|||||||
int32_t temp_start_ = 0;
|
int32_t temp_start_ = 0;
|
||||||
int32_t temp_end_ = 0;
|
int32_t temp_end_ = 0;
|
||||||
|
|
||||||
int32_t window_shift_ = 0;
|
int32_t window_overlap_ = 0;
|
||||||
|
|
||||||
bool is_v5_ = false;
|
bool is_v5_ = false;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
|
|||||||
*/
|
*/
|
||||||
bool IsSpeech(const float *samples, int32_t n) override;
|
bool IsSpeech(const float *samples, int32_t n) override;
|
||||||
|
|
||||||
|
// For silero vad V4, it is WindowShift().
|
||||||
|
// For silero vad V5, it is WindowShift()+64 for 16kHz and
|
||||||
|
// WindowShift()+32 for 8kHz
|
||||||
int32_t WindowSize() const override;
|
int32_t WindowSize() const override;
|
||||||
|
|
||||||
// For silero vad V4, it is WindowSize().
|
// It equals the configured window_size, e.g., 512 for silero vad V5 at 16kHz
|
||||||
// For silero vad V5, it is WindowSize()-64 for 16kHz and
|
|
||||||
// WindowSize()-32 for 8kHz
|
|
||||||
int32_t WindowShift() const override;
|
int32_t WindowShift() const override;
|
||||||
|
|
||||||
int32_t MinSilenceDurationSamples() const override;
|
int32_t MinSilenceDurationSamples() const override;
|
||||||
|
|||||||
@@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl {
|
|||||||
// an extra buffer here
|
// an extra buffer here
|
||||||
last_.insert(last_.end(), samples, samples + n);
|
last_.insert(last_.end(), samples, samples + n);
|
||||||
|
|
||||||
|
if (last_.size() < window_size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Note: For v4, window_shift == window_size
|
// Note: For v4, window_shift == window_size
|
||||||
int32_t k =
|
int32_t k =
|
||||||
(static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
|
(static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
|
||||||
const float *p = last_.data();
|
const float *p = last_.data();
|
||||||
bool is_speech = false;
|
bool is_speech = false;
|
||||||
|
|
||||||
for (int32_t i = 0; i != k; ++i, p += window_shift) {
|
for (int32_t i = 0; i < k; ++i, p += window_shift) {
|
||||||
buffer_.Push(p, window_shift);
|
buffer_.Push(p, window_shift);
|
||||||
// NOTE(fangjun): Please don't use a very large n.
|
// NOTE(fangjun): Please don't use a very large n.
|
||||||
bool this_window_is_speech = model_->IsSpeech(p, window_size);
|
bool this_window_is_speech = model_->IsSpeech(p, window_size);
|
||||||
|
|||||||
Reference in New Issue
Block a user