diff --git a/.github/workflows/c-api.yaml b/.github/workflows/c-api.yaml index ad050373..589bda71 100644 --- a/.github/workflows/c-api.yaml +++ b/.github/workflows/c-api.yaml @@ -99,6 +99,45 @@ jobs: ./run.sh rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + - name: Test vad + sense-voice + shell: bash + run: | + gcc -o vad-sense-voice-c-api ./c-api-examples/vad-sense-voice-c-api.c \ + -I ./build/install/include \ + -L ./build/install/lib/ \ + -l sherpa-onnx-c-api \ + -l onnxruntime + + ls -lh vad-sense-voice-c-api + + if [[ ${{ matrix.os }} == ubuntu-latest ]]; then + ldd ./vad-sense-voice-c-api + echo "----" + readelf -d ./vad-sense-voice-c-api + fi + + # Now download models + # + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 + echo "---" + ls -lh sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs + + export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH + export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH + + ./vad-sense-voice-c-api + + rm -rf sherpa-onnx-sense-voice-* + rm -rf *.onnx + rm *.wav + - name: Test sense-voice shell: bash run: | diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt index 20951b96..49fb8fad 100644 --- a/c-api-examples/CMakeLists.txt +++ b/c-api-examples/CMakeLists.txt @@ -45,6 +45,9 @@ target_link_libraries(streaming-paraformer-c-api sherpa-onnx-c-api) add_executable(telespeech-c-api telespeech-c-api.c) target_link_libraries(telespeech-c-api sherpa-onnx-c-api) 
+add_executable(vad-sense-voice-c-api vad-sense-voice-c-api.c) +target_link_libraries(vad-sense-voice-c-api sherpa-onnx-c-api) + if(SHERPA_ONNX_HAS_ALSA) add_subdirectory(./asr-microphone-example) elseif((UNIX AND NOT APPLE) OR LINUX) diff --git a/c-api-examples/paraformer-c-api.c b/c-api-examples/paraformer-c-api.c index 41b9df4b..345aed55 100644 --- a/c-api-examples/paraformer-c-api.c +++ b/c-api-examples/paraformer-c-api.c @@ -3,7 +3,8 @@ // Copyright (c) 2024 Xiaomi Corporation // -// This file demonstrates how to use non-streaming Paraformer with sherpa-onnx's C API. +// This file demonstrates how to use non-streaming Paraformer with sherpa-onnx's +// C API. // clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-small-2024-03-09.tar.bz2 @@ -19,19 +20,20 @@ #include "sherpa-onnx/c-api/c-api.h" int32_t main() { - - const char *wav_filename = "sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/0.wav"; - const char *model_filename = "sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx"; - const char *tokens_filename = "sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt"; + const char *wav_filename = + "sherpa-onnx-paraformer-zh-small-2024-03-09/test_wavs/0.wav"; + const char *model_filename = + "sherpa-onnx-paraformer-zh-small-2024-03-09/model.int8.onnx"; + const char *tokens_filename = + "sherpa-onnx-paraformer-zh-small-2024-03-09/tokens.txt"; const char *provider = "cpu"; - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); if (wave == NULL) { fprintf(stderr, "Failed to read %s\n", wav_filename); return -1; } - + // Paraformer config SherpaOnnxOfflineParaformerModelConfig paraformer_config; memset(&paraformer_config, 0, sizeof(paraformer_config)); diff --git a/c-api-examples/sense-voice-c-api.c b/c-api-examples/sense-voice-c-api.c index fdb18694..06e89063 100644 --- a/c-api-examples/sense-voice-c-api.c +++ b/c-api-examples/sense-voice-c-api.c @@ -19,8 +19,6 @@ #include
"sherpa-onnx/c-api/c-api.h" int32_t main() { - // You can find more test waves from - // https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/tree/main/test_wavs const char *wav_filename = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav"; const char *model_filename = diff --git a/c-api-examples/streaming-paraformer-c-api.c b/c-api-examples/streaming-paraformer-c-api.c index 88c68cc8..b54116f0 100644 --- a/c-api-examples/streaming-paraformer-c-api.c +++ b/c-api-examples/streaming-paraformer-c-api.c @@ -3,7 +3,8 @@ // Copyright (c) 2024 Xiaomi Corporation // -// This file demonstrates how to use streaming Paraformer with sherpa-onnx's C API. +// This file demonstrates how to use streaming Paraformer with sherpa-onnx's C +// API. // clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 @@ -19,26 +20,27 @@ #include "sherpa-onnx/c-api/c-api.h" int32_t main() { - - const char *wav_filename = "sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav"; - const char *encoder_filename = "sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; - const char *decoder_filename = "sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; - const char *tokens_filename = "sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; + const char *wav_filename = + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav"; + const char *encoder_filename = + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; + const char *decoder_filename = + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; + const char *tokens_filename = + "sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; const char *provider = "cpu"; - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); if (wave == NULL) { fprintf(stderr, "Failed to read %s\n", wav_filename); return -1; } - + // 
Paraformer config SherpaOnnxOnlineParaformerModelConfig paraformer_config; memset(&paraformer_config, 0, sizeof(paraformer_config)); paraformer_config.encoder = encoder_filename; paraformer_config.decoder = decoder_filename; - // Online model config SherpaOnnxOnlineModelConfig online_model_config; diff --git a/c-api-examples/streaming-zipformer-c-api.c b/c-api-examples/streaming-zipformer-c-api.c index 61fb65fc..e1417639 100644 --- a/c-api-examples/streaming-zipformer-c-api.c +++ b/c-api-examples/streaming-zipformer-c-api.c @@ -3,7 +3,8 @@ // Copyright (c) 2024 Xiaomi Corporation // -// This file demonstrates how to use streaming Zipformer with sherpa-onnx's C API. +// This file demonstrates how to use streaming Zipformer with sherpa-onnx's C +// API. // clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2 @@ -19,28 +20,33 @@ #include "sherpa-onnx/c-api/c-api.h" int32_t main() { - - const char *wav_filename = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav"; - const char *encoder_filename = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/encoder-epoch-99-avg-1.onnx"; - const char *decoder_filename = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/decoder-epoch-99-avg-1.onnx"; - const char *joiner_filename = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/joiner-epoch-99-avg-1.onnx"; - const char *tokens_filename = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt"; + const char *wav_filename = + "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/test_wavs/0.wav"; + const char *encoder_filename = + "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/" + "encoder-epoch-99-avg-1.onnx"; + const char *decoder_filename = + "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/" + "decoder-epoch-99-avg-1.onnx"; + const char *joiner_filename = + "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/" + "joiner-epoch-99-avg-1.onnx"; + const char
*tokens_filename = + "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17/tokens.txt"; const char *provider = "cpu"; - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); if (wave == NULL) { fprintf(stderr, "Failed to read %s\n", wav_filename); return -1; } - + // Zipformer config SherpaOnnxOnlineTransducerModelConfig zipformer_config; memset(&zipformer_config, 0, sizeof(zipformer_config)); zipformer_config.encoder = encoder_filename; zipformer_config.decoder = decoder_filename; zipformer_config.joiner = joiner_filename; - // Online model config SherpaOnnxOnlineModelConfig online_model_config; diff --git a/c-api-examples/telespeech-c-api.c b/c-api-examples/telespeech-c-api.c index b3ae74b5..fa7824c3 100644 --- a/c-api-examples/telespeech-c-api.c +++ b/c-api-examples/telespeech-c-api.c @@ -3,7 +3,8 @@ // Copyright (c) 2024 Xiaomi Corporation // -// This file demonstrates how to use TeleSpeech-ASR CTC model with sherpa-onnx's C API. +// This file demonstrates how to use TeleSpeech-ASR CTC model with sherpa-onnx's +// C API. 
// clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 @@ -19,10 +20,12 @@ #include "sherpa-onnx/c-api/c-api.h" int32_t main() { - - const char *wav_filename = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"; - const char *model_filename = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"; - const char *tokens_filename = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"; + const char *wav_filename = + "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"; + const char *model_filename = + "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"; + const char *tokens_filename = + "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"; const char *provider = "cpu"; const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); diff --git a/c-api-examples/vad-sense-voice-c-api.c b/c-api-examples/vad-sense-voice-c-api.c new file mode 100644 index 00000000..172ec0a7 --- /dev/null +++ b/c-api-examples/vad-sense-voice-c-api.c @@ -0,0 +1,168 @@ +// c-api-examples/vad-sense-voice-c-api.c +// +// Copyright (c) 2024 Xiaomi Corporation + +// +// This file demonstrates how to use VAD + SenseVoice with sherpa-onnx's C API. 
+// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +// +// clang-format on + +#include +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +int32_t main() { + const char *wav_filename = "./lei-jun-test.wav"; + const char *vad_filename = "./silero_vad.onnx"; + const char *model_filename = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"; + const char *tokens_filename = + "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"; + const char *language = "auto"; + const char *provider = "cpu"; + int32_t use_inverse_text_normalization = 1; + + const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); + if (wave == NULL) { + fprintf(stderr, "Failed to read %s\n", wav_filename); + return -1; + } + + if (wave->sample_rate != 16000) { + fprintf(stderr, "Expect the sample rate to be 16000. 
Given: %d\n", + wave->sample_rate); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config; + memset(&sense_voice_config, 0, sizeof(sense_voice_config)); + sense_voice_config.model = model_filename; + sense_voice_config.language = language; + sense_voice_config.use_itn = use_inverse_text_normalization; + + // Offline model config + SherpaOnnxOfflineModelConfig offline_model_config; + memset(&offline_model_config, 0, sizeof(offline_model_config)); + offline_model_config.debug = 0; + offline_model_config.num_threads = 1; + offline_model_config.provider = provider; + offline_model_config.tokens = tokens_filename; + offline_model_config.sense_voice = sense_voice_config; + + // Recognizer config + SherpaOnnxOfflineRecognizerConfig recognizer_config; + memset(&recognizer_config, 0, sizeof(recognizer_config)); + recognizer_config.decoding_method = "greedy_search"; + recognizer_config.model_config = offline_model_config; + + SherpaOnnxOfflineRecognizer *recognizer = + SherpaOnnxCreateOfflineRecognizer(&recognizer_config); + + if (recognizer == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + return -1; + } + + SherpaOnnxVadModelConfig vadConfig; + memset(&vadConfig, 0, sizeof(vadConfig)); + vadConfig.silero_vad.model = vad_filename; + vadConfig.silero_vad.threshold = 0.5; + vadConfig.silero_vad.min_silence_duration = 0.5; + vadConfig.silero_vad.min_speech_duration = 0.5; + vadConfig.silero_vad.window_size = 512; + vadConfig.sample_rate = 16000; + vadConfig.num_threads = 1; + vadConfig.debug = 1; + + SherpaOnnxVoiceActivityDetector *vad = + SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30); + + if (vad == NULL) { + fprintf(stderr, "Please check your recognizer config!\n"); + SherpaOnnxFreeWave(wave); + SherpaOnnxDestroyOfflineRecognizer(recognizer); + return -1; + } + + int32_t window_size = vadConfig.silero_vad.window_size; + int32_t i = 0; + + while (i + window_size < 
wave->num_samples) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, wave->samples + i, + window_size); + i += window_size; + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + SherpaOnnxOfflineStream *stream = + SherpaOnnxCreateOfflineStream(recognizer); + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, + segment->samples, segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + } + + SherpaOnnxVoiceActivityDetectorFlush(vad); + + while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) { + const SherpaOnnxSpeechSegment *segment = + SherpaOnnxVoiceActivityDetectorFront(vad); + + SherpaOnnxOfflineStream *stream = SherpaOnnxCreateOfflineStream(recognizer); + SherpaOnnxAcceptWaveformOffline(stream, wave->sample_rate, segment->samples, + segment->n); + + SherpaOnnxDecodeOfflineStream(recognizer, stream); + + const SherpaOnnxOfflineRecognizerResult *result = + SherpaOnnxGetOfflineStreamResult(stream); + + float start = segment->start / 16000.0f; + float duration = segment->n / 16000.0f; + float stop = start + duration; + + fprintf(stderr, "%.3f -- %.3f: %s\n", start, stop, result->text); + + SherpaOnnxDestroyOfflineRecognizerResult(result); + SherpaOnnxDestroyOfflineStream(stream); + + SherpaOnnxDestroySpeechSegment(segment); + SherpaOnnxVoiceActivityDetectorPop(vad); + } + + SherpaOnnxDestroyOfflineRecognizer(recognizer); + SherpaOnnxDestroyVoiceActivityDetector(vad); + 
SherpaOnnxFreeWave(wave); + + return 0; +} diff --git a/c-api-examples/zipformer-c-api.c b/c-api-examples/zipformer-c-api.c index 311d40d3..35393b19 100644 --- a/c-api-examples/zipformer-c-api.c +++ b/c-api-examples/zipformer-c-api.c @@ -3,7 +3,8 @@ // Copyright (c) 2024 Xiaomi Corporation // -// This file demonstrates how to use non-streaming Zipformer with sherpa-onnx's C API. +// This file demonstrates how to use non-streaming Zipformer with sherpa-onnx's +// C API. // clang-format off // // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-small-en-2023-06-26.tar.bz2 @@ -19,28 +20,30 @@ #include "sherpa-onnx/c-api/c-api.h" int32_t main() { - - const char *wav_filename = "sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav"; - const char *encoder_filename = "sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx"; - const char *decoder_filename = "sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx"; - const char *joiner_filename = "sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx"; - const char *tokens_filename = "sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt"; + const char *wav_filename = + "sherpa-onnx-zipformer-small-en-2023-06-26/test_wavs/0.wav"; + const char *encoder_filename = + "sherpa-onnx-zipformer-small-en-2023-06-26/encoder-epoch-99-avg-1.onnx"; + const char *decoder_filename = + "sherpa-onnx-zipformer-small-en-2023-06-26/decoder-epoch-99-avg-1.onnx"; + const char *joiner_filename = + "sherpa-onnx-zipformer-small-en-2023-06-26/joiner-epoch-99-avg-1.onnx"; + const char *tokens_filename = + "sherpa-onnx-zipformer-small-en-2023-06-26/tokens.txt"; const char *provider = "cpu"; - const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename); if (wave == NULL) { fprintf(stderr, "Failed to read %s\n", wav_filename); return -1; } - + // Zipformer config SherpaOnnxOfflineTransducerModelConfig zipformer_config; memset(&zipformer_config, 0, 
sizeof(zipformer_config)); zipformer_config.encoder = encoder_filename; zipformer_config.decoder = decoder_filename; zipformer_config.joiner = joiner_filename; - // Offline model config SherpaOnnxOfflineModelConfig offline_model_config;