Add Flush to VAD so that the last segment can be detected. (#1099)

This commit is contained in:
Fangjun Kuang
2024-07-09 16:15:56 +08:00
committed by GitHub
parent 3e4307e2fb
commit c2cc9dec58
35 changed files with 237 additions and 29 deletions

View File

@@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Reset();
}
// C-API wrapper: forwards to VoiceActivityDetector::Flush() so that any
// speech still buffered at the end of an utterance is emitted as a final
// segment (instead of waiting for trailing silence that never arrives).
void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
p->impl->Flush();
}
#if SHERPA_ONNX_ENABLE_TTS == 1
struct SherpaOnnxOfflineTts {
std::unique_ptr<sherpa_onnx::OfflineTts> impl;

View File

@@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
// Resets the detector's internal state, discarding any buffered audio.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
SherpaOnnxVoiceActivityDetector *p);
// At the end of an utterance, call this so that the last, still-buffered
// speech segment can be detected and retrieved.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
SherpaOnnxVoiceActivityDetector *p);
// ============================================================
// For offline Text-to-Speech (i.e., non-streaming TTS)
// ============================================================

View File

@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
start_ = -1;
}
void Flush() {
if (start_ == -1 || buffer_.Size() == 0) {
return;
}
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
if (end <= start_) {
return;
}
std::vector<float> s = buffer_.Get(start_, end - start_);
SpeechSegment segment;
segment.start = start_;
segment.samples = std::move(s);
segments_.push(std::move(segment));
buffer_.Pop(end - buffer_.Head());
start_ = -1;
}
// True while a speech segment has started (start_ set) and not yet finalized.
bool IsSpeechDetected() const { return start_ != -1; }
// Read-only access to the configuration this detector was constructed with.
const VadModelConfig &GetConfig() const { return config_; }
@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
return impl_->Front();
}
// NOTE(review): both a non-const and a const Reset() definition appear below —
// this looks like the old/new pair of a rendered diff; confirm only the const
// overload exists in the actual file.
void VoiceActivityDetector::Reset() { impl_->Reset(); }
void VoiceActivityDetector::Reset() const { impl_->Reset(); }
// Forwards to Impl::Flush() so the last buffered speech segment is emitted.
void VoiceActivityDetector::Flush() const { impl_->Flush(); }
bool VoiceActivityDetector::IsSpeechDetected() const {
return impl_->IsSpeechDetected();

View File

@@ -41,7 +41,11 @@ class VoiceActivityDetector {
bool IsSpeechDetected() const;
// NOTE(review): both a non-const and a const Reset() are declared below —
// likely the old/new lines of a rendered diff; confirm only one remains.
void Reset();
void Reset() const;
// At the end of the utterance, you can invoke this method so that
// the last speech segment can be detected.
void Flush() const;
const VadModelConfig &GetConfig() const;

View File

@@ -46,6 +46,10 @@ public class Vad {
reset(this.ptr);
}
/**
 * Flushes the native detector so that speech still buffered at the end of
 * an utterance is emitted as a final segment.
 */
public void flush() {
flush(this.ptr);
}
public SpeechSegment front() {
Object[] arr = front(this.ptr);
int start = (int) arr[0];
@@ -75,4 +79,6 @@ public class Vad {
private native boolean isSpeechDetected(long ptr);
private native void reset(long ptr);
// Native counterpart of flush(); implemented in the JNI layer as
// Java_com_k2fsa_sherpa_onnx_Vad_flush.
private native void flush(long ptr);
}

View File

@@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
model->Reset();
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  // `ptr` carries the address of the native detector owned by the Java Vad
  // object; forward the call so buffered trailing speech becomes a segment.
  reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr)->Flush();
}

View File

@@ -52,6 +52,8 @@ class Vad(
// Resets the native detector state.
fun reset() = reset(ptr)
// Flushes buffered audio so the last speech segment can be detected.
fun flush() = flush(ptr)
private external fun delete(ptr: Long)
private external fun newFromAsset(
@@ -70,6 +72,7 @@ class Vad(
private external fun front(ptr: Long): Array<Any>
private external fun isSpeechDetected(ptr: Long): Boolean
private external fun reset(ptr: Long)
// JNI binding for flush(); implemented natively as
// Java_com_k2fsa_sherpa_onnx_Vad_flush.
private external fun flush(ptr: Long)
companion object {
init {

View File

@@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
.def("is_speech_detected", &PyClass::IsSpeechDetected,
py::call_guard<py::gil_scoped_release>())
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
.def_property_readonly("front", &PyClass::Front);
}