Add Silero VAD (#313)

This commit is contained in:
Fangjun Kuang
2023-09-17 14:54:38 +08:00
committed by GitHub
parent 3a20e332bf
commit c471423125
36 changed files with 1683 additions and 16 deletions

View File

@@ -1,6 +1,7 @@
include_directories(${CMAKE_SOURCE_DIR})
pybind11_add_module(_sherpa_onnx
circular-buffer.cc
display.cc
endpoint.cc
features.cc
@@ -20,6 +21,10 @@ pybind11_add_module(_sherpa_onnx
online-stream.cc
online-transducer-model-config.cc
sherpa-onnx.cc
silero-vad-model-config.cc
vad-model-config.cc
vad-model.cc
voice-activity-detector.cc
)
if(APPLE)

View File

@@ -0,0 +1,31 @@
// sherpa-onnx/python/csrc/circular-buffer.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/circular-buffer.h"
#include <vector>
#include "sherpa-onnx/csrc/circular-buffer.h"
namespace sherpa_onnx {
// Registers the C++ CircularBuffer class on the Python module `m` under the
// name "CircularBuffer".
void PybindCircularBuffer(py::module *m) {
  using Buffer = CircularBuffer;

  py::class_<Buffer> cls(*m, "CircularBuffer");

  cls.def(py::init<int32_t>(), py::arg("capacity"));

  // Python hands over a sequence of floats; forward it to Push() as a
  // pointer + element-count pair.
  auto push_samples = [](Buffer &buffer, const std::vector<float> &samples) {
    buffer.Push(samples.data(), samples.size());
  };
  cls.def("push", push_samples, py::arg("samples"));

  cls.def("get", &Buffer::Get, py::arg("start_index"), py::arg("n"));
  cls.def("pop", &Buffer::Pop, py::arg("n"));
  cls.def("reset", &Buffer::Reset);

  // Exposed as read-only attributes rather than methods.
  cls.def_property_readonly("size", &Buffer::Size);
  cls.def_property_readonly("head", &Buffer::Head);
  cls.def_property_readonly("tail", &Buffer::Tail);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/circular-buffer.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
#define SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
// Registers the Python binding for CircularBuffer on the module `m`.
void PybindCircularBuffer(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_CIRCULAR_BUFFER_H_

View File

@@ -4,6 +4,7 @@
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
#include "sherpa-onnx/python/csrc/circular-buffer.h"
#include "sherpa-onnx/python/csrc/display.h"
#include "sherpa-onnx/python/csrc/endpoint.h"
#include "sherpa-onnx/python/csrc/features.h"
@@ -15,6 +16,9 @@
#include "sherpa-onnx/python/csrc/online-model-config.h"
#include "sherpa-onnx/python/csrc/online-recognizer.h"
#include "sherpa-onnx/python/csrc/online-stream.h"
#include "sherpa-onnx/python/csrc/vad-model-config.h"
#include "sherpa-onnx/python/csrc/vad-model.h"
#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
namespace sherpa_onnx {
@@ -34,6 +38,11 @@ PYBIND11_MODULE(_sherpa_onnx, m) {
PybindOfflineLMConfig(&m);
PybindOfflineModelConfig(&m);
PybindOfflineRecognizer(&m);
PybindVadModelConfig(&m);
PybindVadModel(&m);
PybindCircularBuffer(&m);
PybindVoiceActivityDetector(&m);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,43 @@
// sherpa-onnx/python/csrc/silero-vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
#include <memory>
#include <string>
#include "sherpa-onnx/csrc/silero-vad-model-config.h"
namespace sherpa_onnx {
// Registers SileroVadModelConfig on the Python module `m`.
//
// Two constructors are exposed: a default one, and a keyword-friendly one
// where only `model` is mandatory and the remaining fields have defaults.
void PybindSileroVadModelConfig(py::module *m) {
  using Config = SileroVadModelConfig;

  py::class_<Config> cls(*m, "SileroVadModelConfig");

  cls.def(py::init<>());

  // Factory-style constructor: start from a default-constructed config and
  // assign each field from the corresponding argument.
  cls.def(py::init([](const std::string &model, float threshold,
                      float min_silence_duration, float min_speech_duration,
                      int32_t window_size) -> std::unique_ptr<Config> {
            std::unique_ptr<Config> config = std::make_unique<Config>();

            config->model = model;
            config->threshold = threshold;
            config->min_silence_duration = min_silence_duration;
            config->min_speech_duration = min_speech_duration;
            config->window_size = window_size;

            return config;
          }),
          py::arg("model"), py::arg("threshold") = 0.5,
          py::arg("min_silence_duration") = 0.5,
          py::arg("min_speech_duration") = 0.25,
          py::arg("window_size") = 512);

  // Every field is readable and writable from Python.
  cls.def_readwrite("model", &Config::model);
  cls.def_readwrite("threshold", &Config::threshold);
  cls.def_readwrite("min_silence_duration", &Config::min_silence_duration);
  cls.def_readwrite("min_speech_duration", &Config::min_speech_duration);
  cls.def_readwrite("window_size", &Config::window_size);

  cls.def("__str__", &Config::ToString);
  cls.def("validate", &Config::Validate);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/silero-vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
// Registers the Python binding for SileroVadModelConfig on the module `m`.
void PybindSileroVadModelConfig(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_SILERO_VAD_MODEL_CONFIG_H_

View File

@@ -0,0 +1,34 @@
// sherpa-onnx/python/csrc/vad-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/vad-model-config.h"
#include <string>
#include "sherpa-onnx/csrc/vad-model-config.h"
#include "sherpa-onnx/python/csrc/silero-vad-model-config.h"
namespace sherpa_onnx {
// Registers VadModelConfig on the Python module `m`.
void PybindVadModelConfig(py::module *m) {
  // SileroVadModelConfig appears in the constructor signature and as the
  // `silero_vad` field below, so its binding is registered first.
  PybindSileroVadModelConfig(m);

  using Config = VadModelConfig;

  py::class_<Config> cls(*m, "VadModelConfig");

  cls.def(py::init<>());
  cls.def(py::init<const SileroVadModelConfig &, int32_t, int32_t,
                   const std::string &, bool>(),
          py::arg("silero_vad"), py::arg("sample_rate") = 16000,
          py::arg("num_threads") = 1, py::arg("provider") = "cpu",
          py::arg("debug") = false);

  // Every field is readable and writable from Python.
  cls.def_readwrite("silero_vad", &Config::silero_vad);
  cls.def_readwrite("sample_rate", &Config::sample_rate);
  cls.def_readwrite("num_threads", &Config::num_threads);
  cls.def_readwrite("provider", &Config::provider);
  cls.def_readwrite("debug", &Config::debug);

  cls.def("__str__", &Config::ToString);
  cls.def("validate", &Config::Validate);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/vad-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
// Registers the Python bindings for VadModelConfig (and the
// SileroVadModelConfig it contains) on the module `m`.
void PybindVadModelConfig(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_CONFIG_H_

View File

@@ -0,0 +1,29 @@
// sherpa-onnx/python/csrc/vad-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/vad-model.h"
#include <vector>
#include "sherpa-onnx/csrc/vad-model.h"
namespace sherpa_onnx {
// Registers VadModel on the Python module `m`.
//
// No constructor is bound; Python code obtains instances through the static
// `create(config)` factory.
void PybindVadModel(py::module *m) {
  using Model = VadModel;

  py::class_<Model> cls(*m, "VadModel");

  cls.def_static("create", &Model::Create, py::arg("config"));
  cls.def("reset", &Model::Reset);

  // Python hands over a sequence of floats; forward it to IsSpeech() as a
  // pointer + element-count pair.
  auto is_speech = [](Model &model,
                      const std::vector<float> &samples) -> bool {
    return model.IsSpeech(samples.data(), samples.size());
  };
  cls.def("is_speech", is_speech, py::arg("samples"));

  cls.def("window_size", &Model::WindowSize);
  cls.def("min_silence_duration_samples", &Model::MinSilenceDurationSamples);
  cls.def("min_speech_duration_samples", &Model::MinSpeechDurationSamples);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/vad-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
#define SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
// Registers the Python binding for VadModel on the module `m`.
void PybindVadModel(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_VAD_MODEL_H_

View File

@@ -0,0 +1,41 @@
// sherpa-onnx/python/csrc/voice-activity-detector.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/voice-activity-detector.h"
#include <vector>
#include "sherpa-onnx/csrc/voice-activity-detector.h"
namespace sherpa_onnx {
// Registers SpeechSegment on the Python module `m`.
//
// No constructor is bound, and both fields are exposed as read-only
// properties whose getters return the member by value.
void PybindSpeechSegment(py::module *m) {
  using Segment = SpeechSegment;

  auto get_start = [](const Segment &segment) { return segment.start; };
  auto get_samples = [](const Segment &segment) { return segment.samples; };

  py::class_<Segment> cls(*m, "SpeechSegment");
  cls.def_property_readonly("start", get_start);
  cls.def_property_readonly("samples", get_samples);
}
// Registers SpeechSegment and VoiceActivityDetector on the Python module `m`.
//
// `front` and `pop` first consult Empty() and raise IndexError in Python when
// the detector reports that it holds no segment, instead of forwarding the
// call to the C++ object unconditionally.
// NOTE(review): this assumes Front()/Pop() require a non-empty detector --
// confirm against sherpa-onnx/csrc/voice-activity-detector.h.
void PybindVoiceActivityDetector(py::module *m) {
  // SpeechSegment is what `front` hands back, so it must be registered too.
  PybindSpeechSegment(m);

  using PyClass = VoiceActivityDetector;
  py::class_<PyClass>(*m, "VoiceActivityDetector")
      .def(py::init<const VadModelConfig &, float>(), py::arg("config"),
           py::arg("buffer_size_in_seconds") = 60)
      .def(
          "accept_waveform",
          // Python hands over a sequence of floats; forward it as a
          // pointer + element-count pair.
          [](PyClass &self, const std::vector<float> &samples) {
            self.AcceptWaveform(samples.data(), samples.size());
          },
          py::arg("samples"))
      .def("empty", &PyClass::Empty)
      .def("pop",
           // decltype(auto) keeps whatever Pop() returns (including void).
           [](PyClass &self) -> decltype(auto) {
             if (self.Empty()) {
               throw py::index_error(
                   "pop from an empty VoiceActivityDetector");
             }
             return self.Pop();
           })
      .def("is_speech_detected", &PyClass::IsSpeechDetected)
      .def("reset", &PyClass::Reset)
      .def_property_readonly(
          "front",
          // decltype(auto) preserves the exact return type of Front(), so the
          // property behaves as before whenever a segment is available.
          [](PyClass &self) -> decltype(auto) {
            if (self.Empty()) {
              throw py::index_error(
                  "front of an empty VoiceActivityDetector");
            }
            return self.Front();
          });
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/voice-activity-detector.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#define SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
// Registers the Python bindings for VoiceActivityDetector (and the
// SpeechSegment type it exposes) on the module `m`.
void PybindVoiceActivityDetector(py::module *m);
}  // namespace sherpa_onnx
#endif // SHERPA_ONNX_PYTHON_CSRC_VOICE_ACTIVITY_DETECTOR_H_

View File

@@ -1,6 +1,16 @@
from typing import Dict, List, Optional
from _sherpa_onnx import Display, OfflineStream, OnlineStream
from _sherpa_onnx import (
CircularBuffer,
Display,
OfflineStream,
OnlineStream,
SileroVadModelConfig,
SpeechSegment,
VadModel,
VadModelConfig,
VoiceActivityDetector,
)
from .offline_recognizer import OfflineRecognizer
from .online_recognizer import OnlineRecognizer