Add Silero VAD (#313)

2023-09-17 14:54:38 +08:00
parent 3a20e332bf
commit c471423125
36 changed files with 1683 additions and 16 deletions
--- a/sherpa-onnx/python/csrc/CMakeLists.txt
+++ b/sherpa-onnx/python/csrc/CMakeLists.txt
@@ -1,6 +1,7 @@
 include_directories(${CMAKE_SOURCE_DIR})

 pybind11_add_module(_sherpa_onnx
+  circular-buffer.cc
  display.cc
  endpoint.cc
  features.cc
@@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx
  online-stream.cc
  online-transducer-model-config.cc
  sherpa-onnx.cc
+  silero-vad-model-config.cc
+  vad-model-config.cc
+  vad-model.cc
+  voice-activity-detector.cc
 )

 if(APPLE)
--- a/sherpa-onnx/python/csrc/circular-buffer.cc
+++ b/sherpa-onnx/python/csrc/circular-buffer.cc
@@ -0,0 +1,31 @@
+// sherpa-onnx/python/csrc/circular-buffer.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/circular-buffer.h"
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/circular-buffer.h"
+
+namespace sherpa_onnx {
+
+void PybindCircularBuffer(py::module *m) {  // Binds CircularBuffer into *m as Python class "CircularBuffer".
+  using PyClass = CircularBuffer;
+  py::class_<PyClass>(*m, "CircularBuffer")
+      .def(py::init<int32_t>(), py::arg("capacity"))  // Python: CircularBuffer(capacity)
+      .def(
+          "push",
+          [](PyClass &self, const std::vector<float> &samples) {  // samples arrives as a std::vector<float>
+            self.Push(samples.data(), samples.size());  // forwards data pointer + element count to Push()
+          },
+          py::arg("samples"))
+      .def("get", &PyClass::Get, py::arg("start_index"), py::arg("n"))
+      .def("pop", &PyClass::Pop, py::arg("n"))
+      .def("reset", &PyClass::Reset)
+      .def_property_readonly("size", &PyClass::Size)  // size/head/tail are read-only properties, not methods
+      .def_property_readonly("head", &PyClass::Head)
+      .def_property_readonly("tail", &PyClass::Tail);
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/circular-buffer.h
+++ b/sherpa-onnx/python/csrc/circular-buffer.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/circular-buffer.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
+#define SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+void PybindCircularBuffer(py::module *m);  // Adds the Python class "CircularBuffer" to *m.
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
--- a/sherpa-onnx/python/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/python/csrc/sherpa-onnx.cc
@@ -4,6 +4,7 @@

 #include "sherpa-onnx/python/csrc/sherpa-onnx.h"

+#include "sherpa-onnx/python/csrc/circular-buffer.h"
 #include "sherpa-onnx/python/csrc/display.h"
 #include "sherpa-onnx/python/csrc/endpoint.h"
 #include "sherpa-onnx/python/csrc/features.h"
@@ -15,6 +16,9 @@
 #include "sherpa-onnx/python/csrc/online-model-config.h"
 #include "sherpa-onnx/python/csrc/online-recognizer.h"
 #include "sherpa-onnx/python/csrc/online-stream.h"
+#include "sherpa-onnx/python/csrc/vad-model-config.h"
+#include "sherpa-onnx/python/csrc/vad-model.h"
+#include "sherpa-onnx/python/csrc/voice-activity-detector.h"

 namespace sherpa_onnx {

@@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
  PybindOfflineLMConfig(&m);
  PybindOfflineModelConfig(&m);
  PybindOfflineRecognizer(&m);
+
+  PybindVadModelConfig(&m);
+  PybindVadModel(&m);
+  PybindCircularBuffer(&m);
+  PybindVoiceActivityDetector(&m);
 }

 }  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/silero-vad-model-config.cc
+++ b/sherpa-onnx/python/csrc/silero-vad-model-config.cc
@@ -0,0 +1,43 @@
+// sherpa-onnx/python/csrc/silero-vad-model-config.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
+
+#include <memory>
+#include <string>
+
+#include "sherpa-onnx/csrc/silero-vad-model-config.h"
+
+namespace sherpa_onnx {
+
+void PybindSileroVadModelConfig(py::module *m) {  // Binds SileroVadModelConfig into *m as "SileroVadModelConfig".
+  using PyClass = SileroVadModelConfig;
+  py::class_<PyClass>(*m, "SileroVadModelConfig")
+      .def(py::init<>())  // no-argument constructor
+      .def(py::init([](const std::string &model, float threshold,  // second constructor: one argument per field
+                       float min_silence_duration, float min_speech_duration,
+                       int32_t window_size) -> std::unique_ptr<PyClass> {
+             auto ans = std::make_unique<PyClass>();  // default-construct, then overwrite every field below
+
+             ans->model = model;
+             ans->threshold = threshold;
+             ans->min_silence_duration = min_silence_duration;
+             ans->min_speech_duration = min_speech_duration;
+             ans->window_size = window_size;
+
+             return ans;
+           }),
+           py::arg("model"), py::arg("threshold") = 0.5,  // "model" has no default; the other arguments do
+           py::arg("min_silence_duration") = 0.5,
+           py::arg("min_speech_duration") = 0.25, py::arg("window_size") = 512)
+      .def_readwrite("model", &PyClass::model)  // all five fields are readable and writable from Python
+      .def_readwrite("threshold", &PyClass::threshold)
+      .def_readwrite("min_silence_duration", &PyClass::min_silence_duration)
+      .def_readwrite("min_speech_duration", &PyClass::min_speech_duration)
+      .def_readwrite("window_size", &PyClass::window_size)
+      .def("__str__", &PyClass::ToString)  // str(config) calls ToString()
+      .def("validate", &PyClass::Validate);
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/silero-vad-model-config.h
+++ b/sherpa-onnx/python/csrc/silero-vad-model-config.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/silero-vad-model-config.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
+#define SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+void PybindSileroVadModelConfig(py::module *m);  // Adds the Python class "SileroVadModelConfig" to *m.
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
--- a/sherpa-onnx/python/csrc/vad-model-config.cc
+++ b/sherpa-onnx/python/csrc/vad-model-config.cc
@@ -0,0 +1,34 @@
+// sherpa-onnx/python/csrc/vad-model-config.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/vad-model-config.h"
+
+#include <string>
+
+#include "sherpa-onnx/csrc/vad-model-config.h"
+#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
+
+namespace sherpa_onnx {
+
+void PybindVadModelConfig(py::module *m) {  // Binds VadModelConfig into *m as Python class "VadModelConfig".
+  PybindSileroVadModelConfig(m);  // registers SileroVadModelConfig first; the constructor below takes one
+
+  using PyClass = VadModelConfig;
+  py::class_<PyClass>(*m, "VadModelConfig")
+      .def(py::init<>())  // no-argument constructor
+      .def(py::init<const SileroVadModelConfig &, int32_t, int32_t,
+                    const std::string &, bool>(),
+           py::arg("silero_vad"), py::arg("sample_rate") = 16000,  // only silero_vad lacks a default
+           py::arg("num_threads") = 1, py::arg("provider") = "cpu",
+           py::arg("debug") = false)
+      .def_readwrite("silero_vad", &PyClass::silero_vad)  // all five fields are readable and writable from Python
+      .def_readwrite("sample_rate", &PyClass::sample_rate)
+      .def_readwrite("num_threads", &PyClass::num_threads)
+      .def_readwrite("provider", &PyClass::provider)
+      .def_readwrite("debug", &PyClass::debug)
+      .def("__str__", &PyClass::ToString)  // str(config) calls ToString()
+      .def("validate", &PyClass::Validate);
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/vad-model-config.h
+++ b/sherpa-onnx/python/csrc/vad-model-config.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/vad-model-config.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
+#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+void PybindVadModelConfig(py::module *m);  // Adds "VadModelConfig" and "SileroVadModelConfig" to *m.
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
--- a/sherpa-onnx/python/csrc/vad-model.cc
+++ b/sherpa-onnx/python/csrc/vad-model.cc
@@ -0,0 +1,29 @@
+// sherpa-onnx/python/csrc/vad-model.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/vad-model.h"
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/vad-model.h"
+
+namespace sherpa_onnx {
+
+void PybindVadModel(py::module *m) {  // Binds VadModel into *m as Python class "VadModel".
+  using PyClass = VadModel;
+  py::class_<PyClass>(*m, "VadModel")  // no constructor is bound here; see the static create() below
+      .def_static("create", &PyClass::Create, py::arg("config"))  // Python: VadModel.create(config)
+      .def("reset", &PyClass::Reset)
+      .def(
+          "is_speech",
+          [](PyClass &self, const std::vector<float> &samples) -> bool {  // samples arrives as a std::vector<float>
+            return self.IsSpeech(samples.data(), samples.size());  // forwards data pointer + element count
+          },
+          py::arg("samples"))
+      .def("window_size", &PyClass::WindowSize)  // the three accessors below are bound as methods, not properties
+      .def("min_silence_duration_samples", &PyClass::MinSilenceDurationSamples)
+      .def("min_speech_duration_samples", &PyClass::MinSpeechDurationSamples);
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/vad-model.h
+++ b/sherpa-onnx/python/csrc/vad-model.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/vad-model.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
+#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+void PybindVadModel(py::module *m);  // Adds the Python class "VadModel" to *m.
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
--- a/sherpa-onnx/python/csrc/voice-activity-detector.cc
+++ b/sherpa-onnx/python/csrc/voice-activity-detector.cc
@@ -0,0 +1,41 @@
+// sherpa-onnx/python/csrc/voice-activity-detector.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/voice-activity-detector.h"
+
+namespace sherpa_onnx {
+
+void PybindSpeechSegment(py::module *m) {  // Binds SpeechSegment into *m as Python class "SpeechSegment".
+  using PyClass = SpeechSegment;
+  py::class_<PyClass>(*m, "SpeechSegment")  // no constructor is bound in this function
+      .def_property_readonly("start",  // start and samples are read-only: only getters are bound
+                             [](const PyClass &self) { return self.start; })
+      .def_property_readonly("samples",
+                             [](const PyClass &self) { return self.samples; });  // lambda returns by value
+}
+
+void PybindVoiceActivityDetector(py::module *m) {  // Binds VoiceActivityDetector into *m under the same name.
+  PybindSpeechSegment(m);  // registers SpeechSegment before the detector class
+  using PyClass = VoiceActivityDetector;
+  py::class_<PyClass>(*m, "VoiceActivityDetector")
+      .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
+           py::arg("buffer_size_in_seconds") = 60)  // buffer_size_in_seconds is optional and defaults to 60
+      .def(
+          "accept_waveform",
+          [](PyClass &self, const std::vector<float> &samples) {  // samples arrives as a std::vector<float>
+            self.AcceptWaveform(samples.data(), samples.size());  // forwards data pointer + element count
+          },
+          py::arg("samples"))
+      .def("empty", &PyClass::Empty)
+      .def("pop", &PyClass::Pop)
+      .def("is_speech_detected", &PyClass::IsSpeechDetected)
+      .def("reset", &PyClass::Reset)
+      .def_property_readonly("front", &PyClass::Front);  // a read-only property, unlike the methods above
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/python/csrc/voice-activity-detector.h
+++ b/sherpa-onnx/python/csrc/voice-activity-detector.h
@@ -0,0 +1,16 @@
+// sherpa-onnx/python/csrc/voice-activity-detector.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
+#define SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
+
+#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
+
+namespace sherpa_onnx {
+
+void PybindVoiceActivityDetector(py::module *m);  // Adds "VoiceActivityDetector" and "SpeechSegment" to *m.
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
--- a/sherpa-onnx/python/sherpa_onnx/__init__.py
+++ b/sherpa-onnx/python/sherpa_onnx/__init__.py
@@ -1,6 +1,16 @@
 from typing import Dict, List, Optional

-from _sherpa_onnx import Display, OfflineStream, OnlineStream
+from _sherpa_onnx import (
+    CircularBuffer,
+    Display,
+    OfflineStream,
+    OnlineStream,
+    SileroVadModelConfig,
+    SpeechSegment,
+    VadModel,
+    VadModelConfig,
+    VoiceActivityDetector,
+)

 from .offline_recognizer import OfflineRecognizer
 from .online_recognizer import OnlineRecognizer