Add CXX API for VAD (#2077)
This commit is contained in:
@@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
|
||||
// in seconds
|
||||
float min_speech_duration;
|
||||
|
||||
int window_size;
|
||||
int32_t window_size;
|
||||
|
||||
// If a speech segment is longer than this value, then we increase
|
||||
// the threshold to 0.9. After finishing detecting the segment,
|
||||
|
||||
@@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
|
||||
return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
|
||||
}
|
||||
|
||||
// Builds a CircularBuffer around the handle returned by
// SherpaOnnxCreateCircularBuffer(capacity).
// NOTE(review): the unit of `capacity` (presumably a number of float
// samples) is not visible here -- confirm against the C API.
CircularBuffer CircularBuffer::Create(int32_t capacity) {
  return CircularBuffer(SherpaOnnxCreateCircularBuffer(capacity));
}
|
||||
|
||||
// Wraps the C handle `p` by passing it to the MoveOnly base class.
CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)
    : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {}
|
||||
|
||||
// Releases the C handle `p` through SherpaOnnxDestroyCircularBuffer.
// NOTE(review): presumably called by the MoveOnly base when the wrapper is
// destroyed -- confirm in the base class.
void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {
  SherpaOnnxDestroyCircularBuffer(p);
}
|
||||
|
||||
// Forwards `n` floats starting at `samples` to SherpaOnnxCircularBufferPush
// for the wrapped handle.
void CircularBuffer::Push(const float *samples, int32_t n) const {
  SherpaOnnxCircularBufferPush(p_, samples, n);
}
|
||||
|
||||
std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
|
||||
const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n);
|
||||
std::vector<float> ans(n);
|
||||
std::copy(samples, samples + n, ans.begin());
|
||||
|
||||
SherpaOnnxCircularBufferFree(samples);
|
||||
return ans;
|
||||
}
|
||||
|
||||
// Forwards `n` to SherpaOnnxCircularBufferPop for the wrapped handle.
void CircularBuffer::Pop(int32_t n) const {
  SherpaOnnxCircularBufferPop(p_, n);
}
|
||||
|
||||
int32_t CircularBuffer::Size() const {
|
||||
return SherpaOnnxCircularBufferSize(p_);
|
||||
}
|
||||
|
||||
int32_t CircularBuffer::Head() const {
|
||||
return SherpaOnnxCircularBufferHead(p_);
|
||||
}
|
||||
|
||||
// Calls SherpaOnnxCircularBufferReset on the wrapped handle.
void CircularBuffer::Reset() const {
  SherpaOnnxCircularBufferReset(p_);
}
|
||||
|
||||
// Translates the C++ `config` into a zero-initialised
// SherpaOnnxVadModelConfig and creates a detector from it via
// SherpaOnnxCreateVoiceActivityDetector.
//
// The string fields of the C struct borrow the storage of `config`
// (c_str()); they are only guaranteed valid while `config` is alive, which
// covers the creation call below.
// NOTE(review): whether the C API copies those strings is not visible here.
VoiceActivityDetector VoiceActivityDetector::Create(
    const VadModelConfig &config, float buffer_size_in_seconds) {
  struct SherpaOnnxVadModelConfig c_config;
  memset(&c_config, 0, sizeof(c_config));

  // Silero VAD section, copied field by field.
  const auto &src = config.silero_vad;
  auto &dst = c_config.silero_vad;
  dst.model = src.model.c_str();
  dst.threshold = src.threshold;
  dst.min_silence_duration = src.min_silence_duration;
  dst.min_speech_duration = src.min_speech_duration;
  dst.window_size = src.window_size;
  dst.max_speech_duration = src.max_speech_duration;

  // Settings outside the silero_vad section.
  c_config.sample_rate = config.sample_rate;
  c_config.num_threads = config.num_threads;
  c_config.provider = config.provider.c_str();
  c_config.debug = config.debug;

  return VoiceActivityDetector(
      SherpaOnnxCreateVoiceActivityDetector(&c_config, buffer_size_in_seconds));
}
|
||||
|
||||
// Wraps the C handle `p` by passing it to the MoveOnly base class.
VoiceActivityDetector::VoiceActivityDetector(
    const SherpaOnnxVoiceActivityDetector *p)
    : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {}
|
||||
|
||||
void VoiceActivityDetector::Destroy(
|
||||
const SherpaOnnxVoiceActivityDetector *p) const {
|
||||
SherpaOnnxDestroyVoiceActivityDetector(p);
|
||||
}
|
||||
|
||||
void VoiceActivityDetector::AcceptWaveform(const float *samples,
|
||||
int32_t n) const {
|
||||
SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);
|
||||
}
|
||||
|
||||
bool VoiceActivityDetector::IsEmpty() const {
|
||||
return SherpaOnnxVoiceActivityDetectorEmpty(p_);
|
||||
}
|
||||
|
||||
bool VoiceActivityDetector ::IsDetected() const {
|
||||
return SherpaOnnxVoiceActivityDetectorDetected(p_);
|
||||
}
|
||||
|
||||
// Calls SherpaOnnxVoiceActivityDetectorPop on the wrapped handle.
void VoiceActivityDetector::Pop() const {
  SherpaOnnxVoiceActivityDetectorPop(p_);
}
|
||||
|
||||
// Calls SherpaOnnxVoiceActivityDetectorClear on the wrapped handle.
void VoiceActivityDetector::Clear() const {
  SherpaOnnxVoiceActivityDetectorClear(p_);
}
|
||||
|
||||
// Fetches a segment with SherpaOnnxVoiceActivityDetectorFront, copies its
// `start` value and its `n` samples into a SpeechSegment, then releases the
// C-side segment with SherpaOnnxDestroySpeechSegment.
// NOTE(review): the returned pointer is dereferenced unconditionally;
// presumably callers must check IsEmpty() first -- confirm against the C API.
SpeechSegment VoiceActivityDetector::Front() const {
  const auto *front = SherpaOnnxVoiceActivityDetectorFront(p_);

  SpeechSegment result;
  result.start = front->start;
  result.samples.assign(front->samples, front->samples + front->n);

  SherpaOnnxDestroySpeechSegment(front);
  return result;
}
|
||||
|
||||
// Calls SherpaOnnxVoiceActivityDetectorReset on the wrapped handle.
void VoiceActivityDetector::Reset() const {
  SherpaOnnxVoiceActivityDetectorReset(p_);
}
|
||||
|
||||
// Calls SherpaOnnxVoiceActivityDetectorFlush on the wrapped handle.
void VoiceActivityDetector::Flush() const {
  SherpaOnnxVoiceActivityDetectorFlush(p_);
}
|
||||
|
||||
} // namespace sherpa_onnx::cxx
|
||||
|
||||
@@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
|
||||
explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
|
||||
};
|
||||
|
||||
// ==============================
|
||||
// VAD
|
||||
// ==============================
|
||||
|
||||
// C++ counterpart of the silero_vad section of the C VAD config. Each field
// is copied to the C field of the same name by VoiceActivityDetector::Create.
struct SileroVadModelConfig {
  // Passed to the C API as a C string.
  // NOTE(review): presumably the path of the model file -- confirm.
  std::string model;

  float threshold{0.5f};

  // NOTE(review): presumably in seconds, like min_speech_duration -- confirm.
  float min_silence_duration{0.5f};

  // In seconds (per the comment on the corresponding C struct field).
  float min_speech_duration{0.25f};

  int32_t window_size{512};

  // NOTE(review): presumably in seconds -- confirm against the C API header.
  float max_speech_duration{20.0f};
};
|
||||
|
||||
struct VadModelConfig {
|
||||
SileroVadModelConfig silero_vad;
|
||||
|
||||
int32_t sample_rate = 16000;
|
||||
int32_t num_threads = 1;
|
||||
std::string provider = "cpu";
|
||||
bool debug = false;
|
||||
};
|
||||
|
||||
// One segment as returned by VoiceActivityDetector::Front(): the `start`
// value and a copy of the samples taken from the C-side segment.
struct SpeechSegment {
  // Copied from the `start` field of the C segment. Defaults to 0 so that a
  // default-constructed SpeechSegment never holds an indeterminate value.
  // NOTE(review): presumably a sample index -- confirm against the C API.
  int32_t start = 0;

  // Copy of the segment's samples.
  std::vector<float> samples;
};
|
||||
|
||||
class SHERPA_ONNX_API CircularBuffer
|
||||
: public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
|
||||
public:
|
||||
static CircularBuffer Create(int32_t capacity);
|
||||
|
||||
void Destroy(const SherpaOnnxCircularBuffer *p) const;
|
||||
|
||||
void Push(const float *p, int32_t n) const;
|
||||
|
||||
std::vector<float> Get(int32_t start_index, int32_t n) const;
|
||||
|
||||
void Pop(int32_t n) const;
|
||||
|
||||
int32_t Size() const;
|
||||
|
||||
int32_t Head() const;
|
||||
|
||||
void Reset() const;
|
||||
|
||||
private:
|
||||
explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
|
||||
};
|
||||
|
||||
class SHERPA_ONNX_API VoiceActivityDetector
|
||||
: public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
|
||||
public:
|
||||
static VoiceActivityDetector Create(const VadModelConfig &config,
|
||||
float buffer_size_in_seconds);
|
||||
|
||||
void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;
|
||||
|
||||
void AcceptWaveform(const float *samples, int32_t n) const;
|
||||
|
||||
bool IsEmpty() const;
|
||||
|
||||
bool IsDetected() const;
|
||||
|
||||
void Pop() const;
|
||||
|
||||
void Clear() const;
|
||||
|
||||
SpeechSegment Front() const;
|
||||
|
||||
void Reset() const;
|
||||
|
||||
void Flush() const;
|
||||
|
||||
private:
|
||||
explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx::cxx
|
||||
|
||||
#endif // SHERPA_ONNX_C_API_CXX_API_H_
|
||||
|
||||
Reference in New Issue
Block a user