# Patch commentary (leading text like this, placed before the first
# "diff --git" header, is skipped by patch(1) and git apply):
#   * .github/workflows/cxx-api.yaml gains a "Test VAD" step that compiles
#     cxx-api-examples/vad-cxx-api.cc with g++ against ./build/install,
#     downloads lei-jun-test.wav and silero_vad.onnx, runs the example and
#     copies lei-jun-test*.wav into vad-test/, followed by a step that
#     uploads ./vad-test/*.wav as the artifact vad-test-wavs-cxx-<matrix.os>.
#   * cmake/cmake_extension.py adds "sherpa-onnx-offline-denoiser" and
#     "sherpa-onnx-vad" to the list inside get_binaries().
diff --git a/.github/workflows/cxx-api.yaml b/.github/workflows/cxx-api.yaml
index 519b5113..6e1e20f2 100644
--- a/.github/workflows/cxx-api.yaml
+++ b/.github/workflows/cxx-api.yaml
@@ -81,6 +81,45 @@ jobs:
             otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
           fi
 
+      - name: Test VAD
+        shell: bash
+        run: |
+          name=vad-cxx-api
+          g++ -std=c++17 -o $name ./cxx-api-examples/$name.cc \
+            -I ./build/install/include \
+            -L ./build/install/lib/ \
+            -l sherpa-onnx-cxx-api \
+            -l sherpa-onnx-c-api \
+            -l onnxruntime
+
+          ls -lh $name
+
+          export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
+          export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH
+
+          if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
+            ldd ./$name
+            echo "----"
+            readelf -d ./$name
+          fi
+
+          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+
+          ./$name
+
+          mkdir vad-test
+          cp -v lei-jun-test*.wav vad-test
+
+          ls -lh vad-test
+
+          rm $name
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: vad-test-wavs-cxx-${{ matrix.os }}
+          path: ./vad-test/*.wav
+
       - name: Test Speech Enhancement (GTCRN)
         shell: bash
         run: |
diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py
index 1337cf6a..0b3e6e75 100644
--- a/cmake/cmake_extension.py
+++ b/cmake/cmake_extension.py
@@ -53,6 +53,7 @@ def get_binaries():
         "sherpa-onnx-microphone-offline-speaker-identification",
         "sherpa-onnx-offline",
         "sherpa-onnx-offline-audio-tagging",
+        "sherpa-onnx-offline-denoiser",
         "sherpa-onnx-offline-language-identification",
         "sherpa-onnx-offline-punctuation",
         "sherpa-onnx-offline-speaker-diarization",
@@ -62,6 +63,7 @@ def get_binaries():
         "sherpa-onnx-online-punctuation",
         "sherpa-onnx-online-websocket-client",
         "sherpa-onnx-online-websocket-server",
+        "sherpa-onnx-vad",
         "sherpa-onnx-vad-microphone",
         "sherpa-onnx-vad-microphone-offline-asr",
"sherpa-onnx-vad-with-offline-asr",
diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt
index a59d280c..45b01189 100644
--- a/cxx-api-examples/CMakeLists.txt
+++ b/cxx-api-examples/CMakeLists.txt
@@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
 add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
 target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
 
+add_executable(vad-cxx-api ./vad-cxx-api.cc)
+target_link_libraries(vad-cxx-api sherpa-onnx-cxx-api)
+
 if(SHERPA_ONNX_ENABLE_TTS)
   add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc)
   target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api)
diff --git a/cxx-api-examples/vad-cxx-api.cc b/cxx-api-examples/vad-cxx-api.cc
new file mode 100644
index 00000000..d9d96bb2
--- /dev/null
+++ b/cxx-api-examples/vad-cxx-api.cc
@@ -0,0 +1,103 @@
+// cxx-api-examples/vad-cxx-api.cc
+//
+// Copyright (c) 2025 Xiaomi Corporation
+
+//
+// This file demonstrates how to use VAD to remove silences from a file
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+//
+// clang-format on
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "sherpa-onnx/c-api/cxx-api.h"
+
+// Runs the silero VAD over ./lei-jun-test.wav, prints the time span of every
+// detected speech segment, and saves the concatenated speech segments to
+// ./lei-jun-test-no-silence.wav.
+//
+// Returns 0 on success and -1 if the VAD cannot be created, the input wave
+// cannot be read, or the output wave cannot be written.
+int32_t main() {
+  using namespace sherpa_onnx::cxx;  // NOLINT
+
+  std::string wave_filename = "./lei-jun-test.wav";
+  std::string vad_filename = "./silero_vad.onnx";
+  const char *out_filename = "./lei-jun-test-no-silence.wav";
+
+  VadModelConfig config;
+  config.silero_vad.model = vad_filename;
+  config.silero_vad.threshold = 0.1;
+  config.silero_vad.min_silence_duration = 0.5;
+  config.silero_vad.min_speech_duration = 0.25;
+  config.silero_vad.max_speech_duration = 20;
+  config.sample_rate = 16000;
+  config.debug = true;
+
+  // The second argument is the detector's buffer size in seconds.
+  VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
+  if (!vad.Get()) {
+    std::cerr << "Failed to create VAD. Please check your config\n";
+    return -1;
+  }
+
+  Wave wave = ReadWave(wave_filename);
+  if (wave.samples.empty()) {
+    std::cerr << "Failed to read: '" << wave_filename << "'\n";
+    return -1;
+  }
+
+  int32_t window_size = config.silero_vad.window_size;
+  int32_t sample_rate = config.sample_rate;
+
+  // Keep the bound signed: comparing an int32_t against size_t would
+  // silently convert the left-hand side to unsigned.
+  int32_t num_samples = static_cast<int32_t>(wave.samples.size());
+
+  std::vector<float> samples_without_silence;
+
+  bool is_eof = false;
+  int32_t i = 0;
+  while (!is_eof) {
+    // "<=" (not "<") so that the final complete window is also fed when
+    // the number of samples is an exact multiple of window_size. Samples
+    // left over after the last complete window are not fed.
+    if (i + window_size <= num_samples) {
+      vad.AcceptWaveform(wave.samples.data() + i, window_size);
+      i += window_size;
+    } else {
+      is_eof = true;
+      vad.Flush();
+    }
+
+    // Drain all segments that are currently queued in the detector.
+    while (!vad.IsEmpty()) {
+      auto segment = vad.Front();
+      float start_time = segment.start / static_cast<float>(sample_rate);
+      float end_time =
+          start_time + segment.samples.size() / static_cast<float>(sample_rate);
+      printf("%.3f -- %.3f\n", start_time, end_time);
+
+      samples_without_silence.insert(samples_without_silence.end(),
+                                     segment.samples.begin(),
+                                     segment.samples.end());
+
+      vad.Pop();
+    }
+  }
+
+  bool ok = WriteWave(out_filename, {samples_without_silence, sample_rate});
+  if (!ok) {
+    // Report the failure to the caller instead of exiting with 0.
+    std::cerr << "Failed to write " << out_filename << "\n";
+    return -1;
+  }
+
+  std::cout << "Saved to " << out_filename << "\n";
+  return 0;
+}
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 3cd11a79..027364b9 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
   // in seconds
   float min_speech_duration;
 
-  int window_size;
+  int32_t window_size;
 
   // If a speech segment is longer than this value, then we increase
   // the threshold to 0.9.
After finishing detecting the segment,
diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc
index a8ba1958..45d30fa8 100644
--- a/sherpa-onnx/c-api/cxx-api.cc
+++ b/sherpa-onnx/c-api/cxx-api.cc
@@ -558,4 +558,140 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
   return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
 }
 
+// Creates a circular buffer via the C API; capacity is forwarded unchanged.
+CircularBuffer CircularBuffer::Create(int32_t capacity) {
+  auto p = SherpaOnnxCreateCircularBuffer(capacity);
+  return CircularBuffer(p);
+}
+
+CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)
+    : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {}
+
+void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {
+  SherpaOnnxDestroyCircularBuffer(p);
+}
+
+void CircularBuffer::Push(const float *samples, int32_t n) const {
+  SherpaOnnxCircularBufferPush(p_, samples, n);
+}
+
+// Returns a copy of the n samples starting at start_index. Returns an empty
+// vector if n is not positive or if the C API returns a null pointer.
+std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
+  // A negative n would be converted to a huge unsigned size below.
+  if (n <= 0) {
+    return {};
+  }
+
+  const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n);
+  // NOTE(review): whether the C API can return nullptr is not visible in
+  // this patch; the check only avoids reading through a null pointer.
+  if (!samples) {
+    return {};
+  }
+
+  // Construct from the range directly instead of zero-filling and copying.
+  std::vector<float> ans(samples, samples + n);
+
+  SherpaOnnxCircularBufferFree(samples);
+  return ans;
+}
+
+void CircularBuffer::Pop(int32_t n) const {
+  SherpaOnnxCircularBufferPop(p_, n);
+}
+
+int32_t CircularBuffer::Size() const {
+  return SherpaOnnxCircularBufferSize(p_);
+}
+
+int32_t CircularBuffer::Head() const {
+  return SherpaOnnxCircularBufferHead(p_);
+}
+
+void CircularBuffer::Reset() const { SherpaOnnxCircularBufferReset(p_); }
+
+// Translates config into the C struct and creates the detector via the C API.
+// The C struct only borrows config's strings (c_str()); they are not copied.
+VoiceActivityDetector VoiceActivityDetector::Create(
+    const VadModelConfig &config, float buffer_size_in_seconds) {
+  struct SherpaOnnxVadModelConfig c;
+  memset(&c, 0, sizeof(c));
+
+  c.silero_vad.model = config.silero_vad.model.c_str();
+  c.silero_vad.threshold = config.silero_vad.threshold;
+  c.silero_vad.min_silence_duration = config.silero_vad.min_silence_duration;
+  c.silero_vad.min_speech_duration = config.silero_vad.min_speech_duration;
+  c.silero_vad.window_size = config.silero_vad.window_size;
+  c.silero_vad.max_speech_duration = config.silero_vad.max_speech_duration;
+
+  c.sample_rate = config.sample_rate;
+  c.num_threads = config.num_threads;
+  c.provider = config.provider.c_str();
+  c.debug = config.debug;
+
+  auto p = SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds);
+  return VoiceActivityDetector(p);
+}
+
+VoiceActivityDetector::VoiceActivityDetector(
+    const SherpaOnnxVoiceActivityDetector *p)
+    : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {}
+
+void VoiceActivityDetector::Destroy(
+    const SherpaOnnxVoiceActivityDetector *p) const {
+  SherpaOnnxDestroyVoiceActivityDetector(p);
+}
+
+void VoiceActivityDetector::AcceptWaveform(const float *samples,
+                                           int32_t n) const {
+  SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);
+}
+
+bool VoiceActivityDetector::IsEmpty() const {
+  return SherpaOnnxVoiceActivityDetectorEmpty(p_);
+}
+
+bool VoiceActivityDetector::IsDetected() const {
+  return SherpaOnnxVoiceActivityDetectorDetected(p_);
+}
+
+void VoiceActivityDetector::Pop() const {
+  SherpaOnnxVoiceActivityDetectorPop(p_);
+}
+
+void VoiceActivityDetector::Clear() const {
+  SherpaOnnxVoiceActivityDetectorClear(p_);
+}
+
+// Returns a copy of the segment reported by the C API's Front(); the C
+// object is destroyed before returning. Returns an empty segment if the C
+// API returns a null pointer.
+SpeechSegment VoiceActivityDetector::Front() const {
+  auto f = SherpaOnnxVoiceActivityDetectorFront(p_);
+
+  // Value-initialize so that start is 0 on the early return below.
+  SpeechSegment segment{};
+  // NOTE(review): whether Front() can return nullptr is not visible in this
+  // patch; the check only avoids dereferencing a null pointer.
+  if (!f) {
+    return segment;
+  }
+
+  segment.start = f->start;
+  segment.samples.assign(f->samples, f->samples + f->n);
+
+  SherpaOnnxDestroySpeechSegment(f);
+
+  return segment;
+}
+
+void VoiceActivityDetector::Reset() const {
+  SherpaOnnxVoiceActivityDetectorReset(p_);
+}
+
+void VoiceActivityDetector::Flush() const {
+  SherpaOnnxVoiceActivityDetectorFlush(p_);
+}
+
 }  // namespace sherpa_onnx::cxx
diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h
index c3a3b096..94248996 100644
--- a/sherpa-onnx/c-api/cxx-api.h
+++ b/sherpa-onnx/c-api/cxx-api.h
@@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
   explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
 };
 
+// ==============================
+// VAD
+//
==============================
+
+struct SileroVadModelConfig {
+  std::string model;  // path to the model file, e.g., ./silero_vad.onnx
+  float threshold = 0.5;
+  float min_silence_duration = 0.5;
+  float min_speech_duration = 0.25;  // in seconds
+  int32_t window_size = 512;  // in samples
+  float max_speech_duration = 20;
+};
+
+struct VadModelConfig {
+  SileroVadModelConfig silero_vad;
+
+  int32_t sample_rate = 16000;
+  int32_t num_threads = 1;
+  std::string provider = "cpu";
+  bool debug = false;
+};
+
+struct SpeechSegment {
+  int32_t start = 0;  // offset of the first sample, in samples
+  std::vector<float> samples;
+};
+
+class SHERPA_ONNX_API CircularBuffer
+    : public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
+ public:
+  static CircularBuffer Create(int32_t capacity);
+
+  void Destroy(const SherpaOnnxCircularBuffer *p) const;
+
+  void Push(const float *p, int32_t n) const;
+
+  std::vector<float> Get(int32_t start_index, int32_t n) const;  // a copy
+
+  void Pop(int32_t n) const;
+
+  int32_t Size() const;
+
+  int32_t Head() const;
+
+  void Reset() const;
+
+ private:
+  explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
+};
+
+class SHERPA_ONNX_API VoiceActivityDetector
+    : public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
+ public:
+  static VoiceActivityDetector Create(const VadModelConfig &config,
+                                      float buffer_size_in_seconds);
+
+  void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;
+
+  void AcceptWaveform(const float *samples, int32_t n) const;
+
+  bool IsEmpty() const;
+
+  bool IsDetected() const;
+
+  void Pop() const;
+
+  void Clear() const;
+
+  SpeechSegment Front() const;  // returns a copy
+
+  void Reset() const;
+
+  void Flush() const;
+
+ private:
+  explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
+};
+
 }  // namespace sherpa_onnx::cxx
 
 #endif  // SHERPA_ONNX_C_API_CXX_API_H_
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index 073cce14..7de2d324 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
   add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
   add_executable(sherpa-onnx-offline
sherpa-onnx-offline.cc) add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc) + add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc) add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc) add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc) add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc) add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc) - add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc) + add_executable(sherpa-onnx-vad sherpa-onnx-vad.cc) if(SHERPA_ONNX_ENABLE_TTS) add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc) @@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY) sherpa-onnx-keyword-spotter sherpa-onnx-offline sherpa-onnx-offline-audio-tagging + sherpa-onnx-offline-denoiser sherpa-onnx-offline-language-identification sherpa-onnx-offline-parallel sherpa-onnx-offline-punctuation - sherpa-onnx-offline-denoiser sherpa-onnx-online-punctuation + sherpa-onnx-vad ) if(SHERPA_ONNX_ENABLE_TTS) list(APPEND main_exes diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc index af24350b..0139a1a4 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-alsa.cc @@ -7,9 +7,9 @@ #include #include +#include #include "sherpa-onnx/csrc/alsa.h" -#include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/voice-activity-detector.h" #include "sherpa-onnx/csrc/wave-writer.h" @@ -84,8 +84,6 @@ as the device_name. exit(-1); } - int32_t chunk = 0.1 * alsa.GetActualSampleRate(); - auto vad = std::make_unique(config); fprintf(stderr, "Started. Please speak\n"); @@ -95,36 +93,34 @@ as the device_name. 
int32_t k = 0; while (!stop) { - { - const std::vector &samples = alsa.Read(chunk); + const std::vector &samples = alsa.Read(window_size); - vad->AcceptWaveform(samples.data(), samples.size()); + vad->AcceptWaveform(samples.data(), samples.size()); - if (vad->IsSpeechDetected() && !printed) { - printed = true; - fprintf(stderr, "\nDetected speech!\n"); - } - if (!vad->IsSpeechDetected()) { - printed = false; - } + if (vad->IsSpeechDetected() && !printed) { + printed = true; + fprintf(stderr, "\nDetected speech!\n"); + } + if (!vad->IsSpeechDetected()) { + printed = false; + } - while (!vad->Empty()) { - const auto &segment = vad->Front(); - float duration = - segment.samples.size() / static_cast(sample_rate); + while (!vad->Empty()) { + const auto &segment = vad->Front(); + float duration = segment.samples.size() / static_cast(sample_rate); - fprintf(stderr, "Duration: %.3f seconds\n", duration); + fprintf(stderr, "Duration: %.3f seconds\n", duration); - char filename[128]; - snprintf(filename, sizeof(filename), "seg-%d-%.3fs.wav", k, duration); - k += 1; - sherpa_onnx::WriteWave(filename, 16000, segment.samples.data(), - segment.samples.size()); - fprintf(stderr, "Saved to %s\n", filename); - fprintf(stderr, "----------\n"); + std::ostringstream os; + os << "seg-" << k << "-" << std::fixed << std::setprecision(3) << duration + << "s.wav"; + k += 1; + sherpa_onnx::WriteWave(os.str(), 16000, segment.samples.data(), + segment.samples.size()); + fprintf(stderr, "Saved to %s\n", os.str().c_str()); + fprintf(stderr, "----------\n"); - vad->Pop(); - } + vad->Pop(); } } diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad.cc b/sherpa-onnx/csrc/sherpa-onnx-vad.cc new file mode 100644 index 00000000..c24e8076 --- /dev/null +++ b/sherpa-onnx/csrc/sherpa-onnx-vad.cc @@ -0,0 +1,110 @@ +// sherpa-onnx/csrc/sherpa-onnx-vad.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include +#include + +#include +#include + +#include "sherpa-onnx/csrc/voice-activity-detector.h" +#include 
"sherpa-onnx/csrc/wave-reader.h"
+#include "sherpa-onnx/csrc/wave-writer.h"
+
+int32_t main(int32_t argc, char *argv[]) {
+  const char *kUsageMessage = R"usage(
+This program shows how to use VAD in sherpa-onnx
+to remove silences from a file.
+
+  ./bin/sherpa-onnx-vad \
+    --silero-vad-model=/path/to/silero_vad.onnx \
+    /path/to/input.wav \
+    /path/to/output.wav
+
+Please download silero_vad.onnx from
+https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
+
+For instance, use
+wget https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx
+
+input.wav should be 16kHz.
+)usage";
+
+  sherpa_onnx::ParseOptions po(kUsageMessage);
+  sherpa_onnx::VadModelConfig config;
+
+  config.Register(&po);
+  po.Read(argc, argv);
+  if (po.NumArgs() != 2) {
+    fprintf(
+        stderr,
+        "Please provide 2 arguments: the input wav and the output wav\n");
+    po.PrintUsage();
+    exit(EXIT_FAILURE);
+  }
+
+  fprintf(stderr, "%s\n", config.ToString().c_str());
+
+  if (!config.Validate()) {
+    fprintf(stderr, "Errors in config!\n");
+    return -1;
+  }
+
+  std::string wav_filename = po.GetArg(1);
+  int32_t sampling_rate = -1;
+
+  bool is_ok = false;
+  std::vector<float> samples =
+      sherpa_onnx::ReadWave(wav_filename, &sampling_rate, &is_ok);
+
+  if (!is_ok) {
+    fprintf(stderr, "Failed to read '%s'\n", wav_filename.c_str());
+    return -1;
+  }
+
+  if (sampling_rate != 16000) {
+    fprintf(stderr, "Support only 16000Hz. Given: %d\n", sampling_rate);
+    return -1;
+  }
+
+  auto vad = std::make_unique<sherpa_onnx::VoiceActivityDetector>(config);
+
+  int32_t window_size = config.silero_vad.window_size;
+
+  int32_t i = 0;
+  bool is_eof = false;
+
+  std::vector<float> samples_without_silence;
+  // Feed the samples window by window; "<=" keeps the last complete window.
+  while (!is_eof) {
+    if (i + window_size <= static_cast<int32_t>(samples.size())) {
+      vad->AcceptWaveform(samples.data() + i, window_size);
+      i += window_size;
+    } else {
+      vad->Flush();
+      is_eof = true;
+    }
+    // Drain all segments that are currently queued in the detector.
+    while (!vad->Empty()) {
+      const auto &segment = vad->Front();
+      float start_time = segment.start / static_cast<float>(sampling_rate);
+      float end_time = start_time + segment.samples.size() /
+                                        static_cast<float>(sampling_rate);
+
+      fprintf(stderr, "%.3f -- %.3f\n", start_time, end_time);
+      samples_without_silence.insert(samples_without_silence.end(),
+                                     segment.samples.begin(),
+                                     segment.samples.end());
+      vad->Pop();
+    }
+  }
+
+  sherpa_onnx::WriteWave(po.GetArg(2), sampling_rate,
+                         samples_without_silence.data(),
+                         samples_without_silence.size());
+  // NOTE(review): success is reported without checking WriteWave().
+  fprintf(stderr, "Saved to %s\n", po.GetArg(2).c_str());
+
+  return 0;
+}