Add CXX API for VAD (#2077)

This commit is contained in:
Fangjun Kuang
2025-04-01 14:51:43 +08:00
committed by GitHub
parent 6ef9aeb8d8
commit 0703bc1b86
10 changed files with 455 additions and 30 deletions

View File

@@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
// in seconds
float min_speech_duration;
int window_size;
int32_t window_size;
// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After finishing detecting the segment,

View File

@@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
}
// Factory: asks the C API for a circular buffer handle with the requested
// capacity and wraps that handle in a CircularBuffer.
CircularBuffer CircularBuffer::Create(int32_t capacity) {
  return CircularBuffer(SherpaOnnxCreateCircularBuffer(capacity));
}
// Hands the raw C handle to the MoveOnly base class.
CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)
    : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {
}
// Releases the given C handle via SherpaOnnxDestroyCircularBuffer().
void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {
  SherpaOnnxDestroyCircularBuffer(p);
}
// Forwards `n` floats starting at `samples` to SherpaOnnxCircularBufferPush()
// for the wrapped handle.
void CircularBuffer::Push(const float *samples, int32_t n) const {
  SherpaOnnxCircularBufferPush(p_, samples, n);
}
// Fetches `n` samples beginning at `start_index` through the C API and
// returns them as an owning std::vector<float>.
//
// The array obtained from SherpaOnnxCircularBufferGet() is copied into the
// vector and then handed back to SherpaOnnxCircularBufferFree().
std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
  const float *raw = SherpaOnnxCircularBufferGet(p_, start_index, n);

  // Copy [raw, raw + n) before the C-side array is released.
  std::vector<float> result(raw, raw + n);

  SherpaOnnxCircularBufferFree(raw);

  return result;
}
// Forwards the request to SherpaOnnxCircularBufferPop() with count `n`.
void CircularBuffer::Pop(int32_t n) const {
  SherpaOnnxCircularBufferPop(p_, n);
}
// Returns the value reported by SherpaOnnxCircularBufferSize() for the
// wrapped handle.
int32_t CircularBuffer::Size() const {
  const int32_t size = SherpaOnnxCircularBufferSize(p_);
  return size;
}
// Returns the value reported by SherpaOnnxCircularBufferHead() for the
// wrapped handle.
int32_t CircularBuffer::Head() const {
  const int32_t head = SherpaOnnxCircularBufferHead(p_);
  return head;
}
// Forwards to SherpaOnnxCircularBufferReset() for the wrapped handle.
void CircularBuffer::Reset() const {
  SherpaOnnxCircularBufferReset(p_);
}
// Factory: translates the C++ VadModelConfig into the C API's
// SherpaOnnxVadModelConfig and creates a detector from it.
//
// @param config                  C++ side configuration. The C struct stores
//                                the c_str() pointers of `config`'s strings,
//                                so `config` has to stay alive for the
//                                duration of this call.
// @param buffer_size_in_seconds  Passed through unchanged to
//                                SherpaOnnxCreateVoiceActivityDetector().
// @return A VoiceActivityDetector wrapping the handle returned by the C API.
VoiceActivityDetector VoiceActivityDetector::Create(
    const VadModelConfig &config, float buffer_size_in_seconds) {
  const auto &silero = config.silero_vad;

  // Start from an all-zero struct so that fields not set below are zero.
  struct SherpaOnnxVadModelConfig c_config;
  memset(&c_config, 0, sizeof(c_config));

  // Silero VAD specific settings.
  c_config.silero_vad.model = silero.model.c_str();
  c_config.silero_vad.threshold = silero.threshold;
  c_config.silero_vad.min_silence_duration = silero.min_silence_duration;
  c_config.silero_vad.min_speech_duration = silero.min_speech_duration;
  c_config.silero_vad.window_size = silero.window_size;
  c_config.silero_vad.max_speech_duration = silero.max_speech_duration;

  // General settings.
  c_config.sample_rate = config.sample_rate;
  c_config.num_threads = config.num_threads;
  c_config.provider = config.provider.c_str();
  c_config.debug = config.debug;

  return VoiceActivityDetector(SherpaOnnxCreateVoiceActivityDetector(
      &c_config, buffer_size_in_seconds));
}
// Hands the raw C handle to the MoveOnly base class.
VoiceActivityDetector::VoiceActivityDetector(
    const SherpaOnnxVoiceActivityDetector *p)
    : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {
}
// Releases the given C handle via SherpaOnnxDestroyVoiceActivityDetector().
void VoiceActivityDetector::Destroy(
    const SherpaOnnxVoiceActivityDetector *p) const {
  SherpaOnnxDestroyVoiceActivityDetector(p);
}
// Forwards `n` floats starting at `samples` to
// SherpaOnnxVoiceActivityDetectorAcceptWaveform() for the wrapped handle.
void VoiceActivityDetector::AcceptWaveform(const float *samples,
                                           int32_t n) const {
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);
}
// Returns true when SherpaOnnxVoiceActivityDetectorEmpty() reports a
// non-zero value for the wrapped handle.
bool VoiceActivityDetector::IsEmpty() const {
  return SherpaOnnxVoiceActivityDetectorEmpty(p_) != 0;
}
// Returns true when SherpaOnnxVoiceActivityDetectorDetected() reports a
// non-zero value for the wrapped handle.
bool VoiceActivityDetector::IsDetected() const {
  return SherpaOnnxVoiceActivityDetectorDetected(p_) != 0;
}
// Forwards to SherpaOnnxVoiceActivityDetectorPop() for the wrapped handle.
void VoiceActivityDetector::Pop() const {
  SherpaOnnxVoiceActivityDetectorPop(p_);
}
// Forwards to SherpaOnnxVoiceActivityDetectorClear() for the wrapped handle.
void VoiceActivityDetector::Clear() const {
  SherpaOnnxVoiceActivityDetectorClear(p_);
}
// Returns a copy of the segment obtained from
// SherpaOnnxVoiceActivityDetectorFront().
//
// The C segment's `start` and its `n` samples are copied into a C++
// SpeechSegment; the C segment is then released with
// SherpaOnnxDestroySpeechSegment().
// NOTE(review): the returned pointer is dereferenced unconditionally, so
// callers presumably must check IsEmpty() first -- confirm against the C API.
SpeechSegment VoiceActivityDetector::Front() const {
  auto front = SherpaOnnxVoiceActivityDetectorFront(p_);

  SpeechSegment segment;
  segment.start = front->start;
  segment.samples.assign(front->samples, front->samples + front->n);

  SherpaOnnxDestroySpeechSegment(front);

  return segment;
}
// Forwards to SherpaOnnxVoiceActivityDetectorReset() for the wrapped handle.
void VoiceActivityDetector::Reset() const {
  SherpaOnnxVoiceActivityDetectorReset(p_);
}
// Forwards to SherpaOnnxVoiceActivityDetectorFlush() for the wrapped handle.
void VoiceActivityDetector::Flush() const {
  SherpaOnnxVoiceActivityDetectorFlush(p_);
}
} // namespace sherpa_onnx::cxx

View File

@@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
};
// ==============================
// VAD
// ==============================
// Settings for the Silero VAD model.
//
// VoiceActivityDetector::Create() copies every field below into the
// `silero_vad` member of the C API configuration struct.
struct SileroVadModelConfig {
  // Passed to the C API as a C string; presumably the path of the model
  // file -- confirm against the C API documentation.
  std::string model;

  float threshold = 0.5f;

  float min_silence_duration = 0.5f;

  // In seconds, according to the comment on the matching C API field.
  float min_speech_duration = 0.25f;

  int32_t window_size = 512;

  float max_speech_duration = 20.0f;
};
// Top-level VAD configuration passed to VoiceActivityDetector::Create(),
// which copies each field into the C API's SherpaOnnxVadModelConfig.
struct VadModelConfig {
  // Model specific settings.
  SileroVadModelConfig silero_vad;

  int32_t sample_rate{16000};

  int32_t num_threads{1};

  // Passed to the C API as a C string.
  std::string provider{"cpu"};

  bool debug{false};
};
// One segment as returned by VoiceActivityDetector::Front().
struct SpeechSegment {
  // Copied from the `start` field of the C API segment; presumably a sample
  // index -- confirm against the C API documentation.
  //
  // Initialized to 0 so that a default-constructed SpeechSegment never
  // carries an indeterminate value.
  int32_t start = 0;

  // Copy of the segment's samples.
  std::vector<float> samples;
};
// Move-only C++ wrapper around the C API's SherpaOnnxCircularBuffer handle.
//
// Every member function forwards to the SherpaOnnxCircularBuffer* C function
// of the matching name. Instances are obtained through Create().
class SHERPA_ONNX_API CircularBuffer
    : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
 public:
  // Creates a buffer by passing `capacity` to the C API.
  static CircularBuffer Create(int32_t capacity);

  // Releases the given C handle.
  void Destroy(const SherpaOnnxCircularBuffer *p) const;

  // Pushes `n` floats starting at `samples` into the buffer.
  void Push(const float *samples, int32_t n) const;

  // Returns a copy of `n` samples starting at `start_index`.
  std::vector<float> Get(int32_t start_index, int32_t n) const;

  // Forwards `n` to the C API's pop function.
  void Pop(int32_t n) const;

  // Returns the size reported by the C API.
  int32_t Size() const;

  // Returns the head reported by the C API.
  int32_t Head() const;

  // Forwards to the C API's reset function.
  void Reset() const;

 private:
  // Use Create() to obtain an instance.
  explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
};
// Move-only C++ wrapper around the C API's SherpaOnnxVoiceActivityDetector
// handle.
//
// Every member function forwards to the corresponding
// SherpaOnnxVoiceActivityDetector* C function. Instances are obtained through
// Create().
class SHERPA_ONNX_API VoiceActivityDetector
    : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
 public:
  // Creates a detector from `config`; `buffer_size_in_seconds` is passed
  // through unchanged to the C API.
  static VoiceActivityDetector Create(const VadModelConfig &config,
                                      float buffer_size_in_seconds);

  // Releases the given C handle.
  void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;

  // Feeds `n` floats starting at `samples` to the detector.
  void AcceptWaveform(const float *samples, int32_t n) const;

  // Reports the result of the C API's "empty" query.
  bool IsEmpty() const;

  // Reports the result of the C API's "detected" query.
  bool IsDetected() const;

  // Forwards to the C API's pop function.
  void Pop() const;

  // Forwards to the C API's clear function.
  void Clear() const;

  // Returns a copy of the segment obtained from the C API's front function.
  SpeechSegment Front() const;

  // Forwards to the C API's reset function.
  void Reset() const;

  // Forwards to the C API's flush function.
  void Flush() const;

 private:
  // Use Create() to obtain an instance.
  explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
};
} // namespace sherpa_onnx::cxx
#endif // SHERPA_ONNX_C_API_CXX_API_H_