Support specifying max speech duration for VAD. (#1348)
This commit is contained in:
@@ -406,7 +406,14 @@ def main():
|
|||||||
|
|
||||||
config = sherpa_onnx.VadModelConfig()
|
config = sherpa_onnx.VadModelConfig()
|
||||||
config.silero_vad.model = args.silero_vad_model
|
config.silero_vad.model = args.silero_vad_model
|
||||||
config.silero_vad.min_silence_duration = 0.25
|
config.silero_vad.threshold = 0.5
|
||||||
|
config.silero_vad.min_silence_duration = 0.25 # seconds
|
||||||
|
config.silero_vad.min_speech_duration = 0.25 # seconds
|
||||||
|
|
||||||
|
# If the current segment is longer than this value, then it increases
|
||||||
|
# the threshold to 0.9 internally. After detecting this segment,
|
||||||
|
# it resets the threshold to its original value.
|
||||||
|
config.silero_vad.max_speech_duration = 5 # seconds
|
||||||
config.sample_rate = args.sample_rate
|
config.sample_rate = args.sample_rate
|
||||||
|
|
||||||
window_size = config.silero_vad.window_size
|
window_size = config.silero_vad.window_size
|
||||||
|
|||||||
@@ -28,6 +28,12 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
|
|||||||
"In seconds. In the end of each silence chunk wait for "
|
"In seconds. In the end of each silence chunk wait for "
|
||||||
"--silero-vad-min-speech-duration seconds before separating it");
|
"--silero-vad-min-speech-duration seconds before separating it");
|
||||||
|
|
||||||
|
po->Register(
|
||||||
|
"silero-vad-max-speech-duration", &max_speech_duration,
|
||||||
|
"In seconds. If a speech segment is longer than this value, then we "
|
||||||
|
"increase the threshold to 0.9. After finishing detecting the segment, "
|
||||||
|
"the threshold value is reset to its original value.");
|
||||||
|
|
||||||
po->Register(
|
po->Register(
|
||||||
"silero-vad-window-size", &window_size,
|
"silero-vad-window-size", &window_size,
|
||||||
"In samples. Audio chunks of --silero-vad-window-size samples are fed "
|
"In samples. Audio chunks of --silero-vad-window-size samples are fed "
|
||||||
@@ -63,6 +69,33 @@ bool SileroVadModelConfig::Validate() const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (min_silence_duration <= 0) {
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"Please use a larger value for --silero-vad-min-silence-duration. "
|
||||||
|
"Given: "
|
||||||
|
"%f",
|
||||||
|
min_silence_duration);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (min_speech_duration <= 0) {
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"Please use a larger value for --silero-vad-min-speech-duration. "
|
||||||
|
"Given: "
|
||||||
|
"%f",
|
||||||
|
min_speech_duration);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (max_speech_duration <= 0) {
|
||||||
|
SHERPA_ONNX_LOGE(
|
||||||
|
"Please use a larger value for --silero-vad-max-speech-duration. "
|
||||||
|
"Given: "
|
||||||
|
"%f",
|
||||||
|
max_speech_duration);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -74,6 +107,7 @@ std::string SileroVadModelConfig::ToString() const {
|
|||||||
os << "threshold=" << threshold << ", ";
|
os << "threshold=" << threshold << ", ";
|
||||||
os << "min_silence_duration=" << min_silence_duration << ", ";
|
os << "min_silence_duration=" << min_silence_duration << ", ";
|
||||||
os << "min_speech_duration=" << min_speech_duration << ", ";
|
os << "min_speech_duration=" << min_speech_duration << ", ";
|
||||||
|
os << "max_speech_duration=" << max_speech_duration << ", ";
|
||||||
os << "window_size=" << window_size << ")";
|
os << "window_size=" << window_size << ")";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
|
|||||||
@@ -27,6 +27,11 @@ struct SileroVadModelConfig {
|
|||||||
// 256, 512, 768 samples for 8000 Hz
|
// 256, 512, 768 samples for 8000 Hz
|
||||||
int32_t window_size = 512; // in samples
|
int32_t window_size = 512; // in samples
|
||||||
|
|
||||||
|
// If a speech segment is longer than this value, then we increase
|
||||||
|
// the threshold to 0.9. After finishing detecting the segment,
|
||||||
|
// the threshold value is reset to its original value.
|
||||||
|
float max_speech_duration = 20; // in seconds
|
||||||
|
|
||||||
SileroVadModelConfig() = default;
|
SileroVadModelConfig() = default;
|
||||||
|
|
||||||
void Register(ParseOptions *po);
|
void Register(ParseOptions *po);
|
||||||
|
|||||||
@@ -18,14 +18,18 @@ class VoiceActivityDetector::Impl {
|
|||||||
explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
|
explicit Impl(const VadModelConfig &config, float buffer_size_in_seconds = 60)
|
||||||
: model_(VadModel::Create(config)),
|
: model_(VadModel::Create(config)),
|
||||||
config_(config),
|
config_(config),
|
||||||
buffer_(buffer_size_in_seconds * config.sample_rate) {}
|
buffer_(buffer_size_in_seconds * config.sample_rate) {
|
||||||
|
Init();
|
||||||
|
}
|
||||||
|
|
||||||
#if __ANDROID_API__ >= 9
|
#if __ANDROID_API__ >= 9
|
||||||
Impl(AAssetManager *mgr, const VadModelConfig &config,
|
Impl(AAssetManager *mgr, const VadModelConfig &config,
|
||||||
float buffer_size_in_seconds = 60)
|
float buffer_size_in_seconds = 60)
|
||||||
: model_(VadModel::Create(mgr, config)),
|
: model_(VadModel::Create(mgr, config)),
|
||||||
config_(config),
|
config_(config),
|
||||||
buffer_(buffer_size_in_seconds * config.sample_rate) {}
|
buffer_(buffer_size_in_seconds * config.sample_rate) {
|
||||||
|
Init();
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void AcceptWaveform(const float *samples, int32_t n) {
|
void AcceptWaveform(const float *samples, int32_t n) {
|
||||||
@@ -145,6 +149,15 @@ class VoiceActivityDetector::Impl {
|
|||||||
|
|
||||||
const VadModelConfig &GetConfig() const { return config_; }
|
const VadModelConfig &GetConfig() const { return config_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
void Init() {
|
||||||
|
// TODO(fangjun): Currently, we support only one vad model.
|
||||||
|
// If a new vad model is added, we need to change the place
|
||||||
|
// where max_speech_duration is placed.
|
||||||
|
max_utterance_length_ =
|
||||||
|
config_.sample_rate * config_.silero_vad.max_speech_duration;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::queue<SpeechSegment> segments_;
|
std::queue<SpeechSegment> segments_;
|
||||||
|
|
||||||
@@ -153,9 +166,9 @@ class VoiceActivityDetector::Impl {
|
|||||||
CircularBuffer buffer_;
|
CircularBuffer buffer_;
|
||||||
std::vector<float> last_;
|
std::vector<float> last_;
|
||||||
|
|
||||||
int max_utterance_length_ = 16000 * 20; // in samples
|
int max_utterance_length_ = -1; // in samples
|
||||||
float new_min_silence_duration_s_ = 0.1;
|
float new_min_silence_duration_s_ = 0.1;
|
||||||
float new_threshold_ = 1.10;
|
float new_threshold_ = 0.90;
|
||||||
|
|
||||||
int32_t start_ = -1;
|
int32_t start_ = -1;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -17,7 +17,8 @@ void PybindSileroVadModelConfig(py::module *m) {
|
|||||||
.def(py::init<>())
|
.def(py::init<>())
|
||||||
.def(py::init([](const std::string &model, float threshold,
|
.def(py::init([](const std::string &model, float threshold,
|
||||||
float min_silence_duration, float min_speech_duration,
|
float min_silence_duration, float min_speech_duration,
|
||||||
int32_t window_size) -> std::unique_ptr<PyClass> {
|
int32_t window_size,
|
||||||
|
float max_speech_duration) -> std::unique_ptr<PyClass> {
|
||||||
auto ans = std::make_unique<PyClass>();
|
auto ans = std::make_unique<PyClass>();
|
||||||
|
|
||||||
ans->model = model;
|
ans->model = model;
|
||||||
@@ -25,17 +26,20 @@ void PybindSileroVadModelConfig(py::module *m) {
|
|||||||
ans->min_silence_duration = min_silence_duration;
|
ans->min_silence_duration = min_silence_duration;
|
||||||
ans->min_speech_duration = min_speech_duration;
|
ans->min_speech_duration = min_speech_duration;
|
||||||
ans->window_size = window_size;
|
ans->window_size = window_size;
|
||||||
|
ans->max_speech_duration = max_speech_duration;
|
||||||
|
|
||||||
return ans;
|
return ans;
|
||||||
}),
|
}),
|
||||||
py::arg("model"), py::arg("threshold") = 0.5,
|
py::arg("model"), py::arg("threshold") = 0.5,
|
||||||
py::arg("min_silence_duration") = 0.5,
|
py::arg("min_silence_duration") = 0.5,
|
||||||
py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512)
|
py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512,
|
||||||
|
py::arg("max_speech_duration") = 20)
|
||||||
.def_readwrite("model", &PyClass::model)
|
.def_readwrite("model", &PyClass::model)
|
||||||
.def_readwrite("threshold", &PyClass::threshold)
|
.def_readwrite("threshold", &PyClass::threshold)
|
||||||
.def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
|
.def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
|
||||||
.def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
|
.def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
|
||||||
.def_readwrite("window_size", &PyClass::window_size)
|
.def_readwrite("window_size", &PyClass::window_size)
|
||||||
|
.def_readwrite("max_speech_duration", &PyClass::max_speech_duration)
|
||||||
.def("__str__", &PyClass::ToString)
|
.def("__str__", &PyClass::ToString)
|
||||||
.def("validate", &PyClass::Validate);
|
.def("validate", &PyClass::Validate);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user