Add Flush to VAD so that the last segment can be detected. (#1099)
This commit is contained in:
@@ -876,6 +876,10 @@ void SherpaOnnxVoiceActivityDetectorReset(SherpaOnnxVoiceActivityDetector *p) {
|
||||
p->impl->Reset();
|
||||
}
|
||||
|
||||
// Forwards to Flush() on the detector wrapped by `p`.
// `p` is dereferenced without a null check, so it must point to a live
// detector.
void SherpaOnnxVoiceActivityDetectorFlush(SherpaOnnxVoiceActivityDetector *p) {
  auto &detector = *p->impl;
  detector.Flush();
}
|
||||
|
||||
#if SHERPA_ONNX_ENABLE_TTS == 1
|
||||
struct SherpaOnnxOfflineTts {
|
||||
std::unique_ptr<sherpa_onnx::OfflineTts> impl;
|
||||
|
||||
@@ -815,6 +815,9 @@ SHERPA_ONNX_API void SherpaOnnxDestroySpeechSegment(
|
||||
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorReset(
|
||||
SherpaOnnxVoiceActivityDetector *p);
|
||||
|
||||
// Call this at the end of the utterance so that the last speech segment
// can be detected. `p` is dereferenced without a null check.
SHERPA_ONNX_API void SherpaOnnxVoiceActivityDetectorFlush(
|
||||
SherpaOnnxVoiceActivityDetector *p);
|
||||
|
||||
// ============================================================
|
||||
// For offline Text-to-Speech (i.e., non-streaming TTS)
|
||||
// ============================================================
|
||||
|
||||
@@ -118,6 +118,29 @@ class VoiceActivityDetector::Impl {
|
||||
start_ = -1;
|
||||
}
|
||||
|
||||
void Flush() {
|
||||
if (start_ == -1 || buffer_.Size() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t end = buffer_.Tail() - model_->MinSilenceDurationSamples();
|
||||
if (end <= start_) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<float> s = buffer_.Get(start_, end - start_);
|
||||
|
||||
SpeechSegment segment;
|
||||
|
||||
segment.start = start_;
|
||||
segment.samples = std::move(s);
|
||||
|
||||
segments_.push(std::move(segment));
|
||||
|
||||
buffer_.Pop(end - buffer_.Head());
|
||||
start_ = -1;
|
||||
}
|
||||
|
||||
// Reports whether a speech segment is currently open, i.e. start_ is not -1.
bool IsSpeechDetected() const {
  return !(start_ == -1);
}
|
||||
|
||||
// Read-only access to the configuration held in config_.
const VadModelConfig &GetConfig() const {
  return config_;
}
|
||||
@@ -164,7 +187,9 @@ const SpeechSegment &VoiceActivityDetector::Front() const {
|
||||
return impl_->Front();
|
||||
}
|
||||
|
||||
// Non-const overload: forwards to Reset() on the implementation object.
void VoiceActivityDetector::Reset() {
  impl_->Reset();
}
|
||||
// Const overload: forwards to Reset() on the implementation object.
void VoiceActivityDetector::Reset() const {
  impl_->Reset();
}
|
||||
|
||||
// Forwards to Flush() on the implementation object.
void VoiceActivityDetector::Flush() const {
  impl_->Flush();
}
|
||||
|
||||
bool VoiceActivityDetector::IsSpeechDetected() const {
|
||||
return impl_->IsSpeechDetected();
|
||||
|
||||
@@ -41,7 +41,11 @@ class VoiceActivityDetector {
|
||||
|
||||
bool IsSpeechDetected() const;
|
||||
|
||||
void Reset();
|
||||
void Reset() const;
|
||||
|
||||
// At the end of the utterance, you can invoke this method so that
|
||||
// the last speech segment can be detected.
// Although declared const, the call is forwarded to the implementation
// object, which may queue a new segment and clear the in-speech state.
|
||||
void Flush() const;
|
||||
|
||||
const VadModelConfig &GetConfig() const;
|
||||
|
||||
|
||||
@@ -46,6 +46,10 @@ public class Vad {
|
||||
reset(this.ptr);
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
flush(this.ptr);
|
||||
}
|
||||
|
||||
public SpeechSegment front() {
|
||||
Object[] arr = front(this.ptr);
|
||||
int start = (int) arr[0];
|
||||
@@ -75,4 +79,6 @@ public class Vad {
|
||||
private native boolean isSpeechDetected(long ptr);
|
||||
|
||||
private native void reset(long ptr);
|
||||
|
||||
private native void flush(long ptr);
|
||||
}
|
||||
|
||||
@@ -173,3 +173,11 @@ JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_reset(JNIEnv * /*env*/,
|
||||
auto model = reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr);
|
||||
model->Reset();
|
||||
}
|
||||
|
||||
// JNI entry point backing com.k2fsa.sherpa.onnx.Vad.flush(long).
// `ptr` is reinterpreted as a sherpa_onnx::VoiceActivityDetector pointer and
// Flush() is called on it; no null check is performed.
SHERPA_ONNX_EXTERN_C
JNIEXPORT void JNICALL Java_com_k2fsa_sherpa_onnx_Vad_flush(JNIEnv * /*env*/,
                                                            jobject /*obj*/,
                                                            jlong ptr) {
  reinterpret_cast<sherpa_onnx::VoiceActivityDetector *>(ptr)->Flush();
}
|
||||
|
||||
@@ -52,6 +52,8 @@ class Vad(
|
||||
|
||||
// Calls the native reset() with this object's ptr.
fun reset() {
    reset(ptr)
}
|
||||
|
||||
// Calls the native flush() with this object's ptr.
fun flush() {
    flush(ptr)
}
|
||||
|
||||
private external fun delete(ptr: Long)
|
||||
|
||||
private external fun newFromAsset(
|
||||
@@ -70,6 +72,7 @@ class Vad(
|
||||
private external fun front(ptr: Long): Array<Any>
|
||||
private external fun isSpeechDetected(ptr: Long): Boolean
|
||||
private external fun reset(ptr: Long)
|
||||
private external fun flush(ptr: Long)
|
||||
|
||||
companion object {
|
||||
init {
|
||||
|
||||
@@ -38,6 +38,7 @@ void PybindVoiceActivityDetector(py::module *m) {
|
||||
.def("is_speech_detected", &PyClass::IsSpeechDetected,
|
||||
py::call_guard<py::gil_scoped_release>())
|
||||
.def("reset", &PyClass::Reset, py::call_guard<py::gil_scoped_release>())
|
||||
.def("flush", &PyClass::Flush, py::call_guard<py::gil_scoped_release>())
|
||||
.def_property_readonly("front", &PyClass::Front);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user