Fix for silero vad v5. (#1065)

The network input is 64 + 512 samples instead of 512 samples for 16kHz.
This commit is contained in:
Fangjun Kuang
2024-06-30 08:57:23 +08:00
committed by GitHub
parent 61c7eb3063
commit 6cb018184e
3 changed files with 18 additions and 12 deletions

View File

@@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
*/
bool IsSpeech(const float *samples, int32_t n) override;
// Window size in samples, expressed relative to WindowShift():
//   - silero vad V4: equal to WindowShift().
//   - silero vad V5: WindowShift() + 64 for 16kHz,
//                    WindowShift() + 32 for 8kHz.
// NOTE(review): the extra 64/32 samples presumably reflect the longer
// network input that V5 expects -- confirm against the implementation.
int32_t WindowSize() const override;
// Window shift in samples, expressed relative to WindowSize():
//   - silero vad V4: equal to WindowSize().
//   - silero vad V5: WindowSize() - 64 for 16kHz,
//                    WindowSize() - 32 for 8kHz.
// Example value: 512 (presumably the shift used at 16kHz -- TODO confirm).
int32_t WindowShift() const override;
int32_t MinSilenceDurationSamples() const override;