# test-c-api.sh: log AT_EXE, then download and unpack the zipformer
# audio-tagging model archive, run $AT_EXE without arguments, and delete the
# unpacked model directory afterwards.
diff --git a/.github/scripts/test-c-api.sh b/.github/scripts/test-c-api.sh
index afc66c10..b29d1a0b 100755
--- a/.github/scripts/test-c-api.sh
+++ b/.github/scripts/test-c-api.sh
@@ -10,8 +10,21 @@ log() {
 
 echo "SLID_EXE is $SLID_EXE"
 echo "SID_EXE is $SID_EXE"
+echo "AT_EXE is $AT_EXE"
 echo "PATH: $PATH"
 
+log "------------------------------------------------------------"
+log "Test audio tagging "
+log "------------------------------------------------------------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+
+$AT_EXE
+
+rm -rf sherpa-onnx-zipformer-audio-tagging-2024-04-09
+
 
 log "------------------------------------------------------------"
 log "Download whisper tiny for spoken language identification "
# linux.yaml: the "Test C API" step moves ahead of "Test Audio tagging" and
# now also exports AT_EXE for test-c-api.sh.
diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml
index ae0aec47..ee58cc40 100644
--- a/.github/workflows/linux.yaml
+++ b/.github/workflows/linux.yaml
@@ -126,6 +126,16 @@ jobs:
           name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
           path: build/bin/*
 
+      - name: Test C API
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export SLID_EXE=spoken-language-identification-c-api
+          export SID_EXE=speaker-identification-c-api
+          export AT_EXE=audio-tagging-c-api
+
+          .github/scripts/test-c-api.sh
+
       - name: Test Audio tagging
         shell: bash
         run: |
@@ -142,14 +152,6 @@ jobs:
 
           .github/scripts/test-online-ctc.sh
 
-      - name: Test C API
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin:$PATH
-          export SLID_EXE=spoken-language-identification-c-api
-          export SID_EXE=speaker-identification-c-api
-
-          .github/scripts/test-c-api.sh
 
       - name: Test spoken language identification (C++ API)
         shell: bash
# macos.yaml: same move of the "Test C API" step, with AT_EXE exported.
diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml
index 9dfcb7c9..99b4f301 100644
--- a/.github/workflows/macos.yaml
+++ b/.github/workflows/macos.yaml
@@ -105,6 +105,16 @@ jobs:
           otool -L build/bin/sherpa-onnx
           otool -l build/bin/sherpa-onnx
 
+      - name: Test C API
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export SLID_EXE=spoken-language-identification-c-api
+          export SID_EXE=speaker-identification-c-api
+          export AT_EXE=audio-tagging-c-api
+
+          .github/scripts/test-c-api.sh
+
       - name: Test Audio tagging
         shell: bash
         run: |
@@ -113,15 +123,6 @@ jobs:
 
           .github/scripts/test-audio-tagging.sh
 
-      - name: Test C API
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin:$PATH
-          export SLID_EXE=spoken-language-identification-c-api
-          export SID_EXE=speaker-identification-c-api
-
-          .github/scripts/test-c-api.sh
-
       - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
# windows-x64.yaml: same move; the executables carry the .exe suffix and are
# looked up in build/bin/Release.
diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml
index 8f171559..cf000be8 100644
--- a/.github/workflows/windows-x64.yaml
+++ b/.github/workflows/windows-x64.yaml
@@ -72,6 +72,17 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
+      - name: Test C API
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export SLID_EXE=spoken-language-identification-c-api.exe
+          export SID_EXE=speaker-identification-c-api.exe
+          export AT_EXE=audio-tagging-c-api.exe
+
+          .github/scripts/test-c-api.sh
+
+
       - name: Test Audio tagging
         shell: bash
         run: |
@@ -80,15 +91,6 @@ jobs:
 
           .github/scripts/test-audio-tagging.sh
 
-      - name: Test C API
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin/Release:$PATH
-          export SLID_EXE=spoken-language-identification-c-api.exe
-          export SID_EXE=speaker-identification-c-api.exe
-
-          .github/scripts/test-c-api.sh
-
       - name: Test spoken language identification (C++ API)
         shell: bash
         run: |
# windows-x86.yaml: the existing C API step gains the SID_EXE and AT_EXE
# exports.
diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml
index 65d1bea6..7a18e0be 100644
--- a/.github/workflows/windows-x86.yaml
+++ b/.github/workflows/windows-x86.yaml
@@ -77,6 +77,8
@@ jobs:
         run: |
           export PATH=$PWD/build/bin/Release:$PATH
           export SLID_EXE=spoken-language-identification-c-api.exe
+          export SID_EXE=speaker-identification-c-api.exe
+          export AT_EXE=audio-tagging-c-api.exe
 
           .github/scripts/test-c-api.sh
 
diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt
index 4c3669d1..8d9bfe98 100644
--- a/c-api-examples/CMakeLists.txt
+++ b/c-api-examples/CMakeLists.txt
@@ -18,6 +18,9 @@ target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
 add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
 target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
 
+add_executable(audio-tagging-c-api audio-tagging-c-api.c)
+target_link_libraries(audio-tagging-c-api sherpa-onnx-c-api)
+
 if(SHERPA_ONNX_HAS_ALSA)
   add_subdirectory(./asr-microphone-example)
 elseif((UNIX AND NOT APPLE) OR LINUX)
diff --git a/c-api-examples/audio-tagging-c-api.c b/c-api-examples/audio-tagging-c-api.c
new file mode 100644
index 00000000..1272717a
--- /dev/null
+++ b/c-api-examples/audio-tagging-c-api.c
@@ -0,0 +1,87 @@
+// c-api-examples/audio-tagging-c-api.c
+//
+// Copyright (c) 2024 Xiaomi Corporation
+
+// We assume you have pre-downloaded the model files for testing
+// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
+//
+// An example is given below:
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+// tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+// rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Tags one test wave with the zipformer audio-tagging model and prints the
+// returned events to stderr. Returns 0 on success and -1 when the tagger
+// cannot be created or the wave file cannot be read.
+int32_t main() {
+  SherpaOnnxAudioTaggingConfig config;
+  memset(&config, 0, sizeof(config));
+
+  config.model.zipformer.model =
+      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx";
+  config.model.num_threads = 1;
+  config.model.debug = 1;
+  config.model.provider = "cpu";
+  // clang-format off
+  config.labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv";
+  // clang-format on
+
+  const SherpaOnnxAudioTagging *tagger = SherpaOnnxCreateAudioTagging(&config);
+  if (!tagger) {
+    fprintf(stderr,
+            "Failed to create audio tagger. Please check your config\n");
+    return -1;
+  }
+
+  // You can find more test waves from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+  const char *wav_filename =
+      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav";
+
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    // Release the tagger created above so this error path does not leak it.
+    SherpaOnnxDestroyAudioTagging(tagger);
+    return -1;
+  }
+
+  const SherpaOnnxOfflineStream *stream =
+      SherpaOnnxAudioTaggingCreateOfflineStream(tagger);
+
+  AcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
+                        wave->num_samples);
+
+  int32_t top_k = 5;
+  const SherpaOnnxAudioEvent *const *results =
+      SherpaOnnxAudioTaggingCompute(tagger, stream, top_k);
+
+  fprintf(stderr, "--------------------------------------------------\n");
+  fprintf(stderr, "Index\t\tProbability\t\tEvent name\n");
+  fprintf(stderr, "--------------------------------------------------\n");
+  // The returned array is NULL-terminated; walk it to the terminator rather
+  // than assuming it holds exactly top_k entries.
+  for (int32_t i = 0; results[i] != NULL; ++i) {
+    fprintf(stderr, "%d\t\t%.3f\t\t\t%s\n", i, results[i]->prob,
+            results[i]->name);
+  }
+  fprintf(stderr, "--------------------------------------------------\n");
+
+  SherpaOnnxAudioTaggingFreeResults(results);
+  DestroyOfflineStream(stream);
+  SherpaOnnxFreeWave(wave);
+  SherpaOnnxDestroyAudioTagging(tagger);
+
+  return 0;
+}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index c349dd3f..995817a0 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "sherpa-onnx/csrc/audio-tagging.h"
 #include
"sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/keyword-spotter.h"
@@ -400,15 +401,18 @@ SherpaOnnxOfflineStream *CreateOfflineStream(
   return stream;
 }
 
-void DestroyOfflineStream(SherpaOnnxOfflineStream *stream) { delete stream; }
+void DestroyOfflineStream(const SherpaOnnxOfflineStream *stream) {
+  delete stream;
+}
 
-void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream, int32_t sample_rate,
-                           const float *samples, int32_t n) {
+void AcceptWaveformOffline(const SherpaOnnxOfflineStream *stream,
+                           int32_t sample_rate, const float *samples,
+                           int32_t n) {
   stream->impl->AcceptWaveform(sample_rate, samples, n);
 }
 
-void DecodeOfflineStream(SherpaOnnxOfflineRecognizer *recognizer,
-                         SherpaOnnxOfflineStream *stream) {
+void DecodeOfflineStream(const SherpaOnnxOfflineRecognizer *recognizer,
+                         const SherpaOnnxOfflineStream *stream) {
   recognizer->impl->DecodeStream(stream->impl.get());
 }
 
@@ -1209,3 +1213,99 @@ void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
 
   delete[] names;
 }
+
+// Handle returned to C callers; wraps the C++ audio tagger it owns.
+struct SherpaOnnxAudioTagging {
+  std::unique_ptr<sherpa_onnx::AudioTagging> impl;
+};
+
+// Builds a sherpa_onnx::AudioTaggingConfig from the C config and returns a
+// newly allocated tagger, or nullptr when ac.Validate() fails.
+const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
+    const SherpaOnnxAudioTaggingConfig *config) {
+  sherpa_onnx::AudioTaggingConfig ac;
+  ac.model.zipformer.model = SHERPA_ONNX_OR(config->model.zipformer.model, "");
+  ac.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
+  ac.model.debug = config->model.debug;
+  ac.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
+  ac.labels = SHERPA_ONNX_OR(config->labels, "");
+  ac.top_k = SHERPA_ONNX_OR(config->top_k, 5);
+
+  if (ac.model.debug) {
+    SHERPA_ONNX_LOGE("%s\n", ac.ToString().c_str());
+  }
+
+  if (!ac.Validate()) {
+    SHERPA_ONNX_LOGE("Errors in config");
+    return nullptr;
+  }
+
+  SherpaOnnxAudioTagging *tagger = new SherpaOnnxAudioTagging;
+  tagger->impl = std::make_unique<sherpa_onnx::AudioTagging>(ac);
+
+  return tagger;
+}
+
+// Deletes a tagger created by SherpaOnnxCreateAudioTagging().
+void SherpaOnnxDestroyAudioTagging(const SherpaOnnxAudioTagging *tagger) {
+  delete tagger;
+}
+
+// Wraps tagger->impl->CreateStream() in a new SherpaOnnxOfflineStream; the
+// caller releases it with DestroyOfflineStream().
+const SherpaOnnxOfflineStream *SherpaOnnxAudioTaggingCreateOfflineStream(
+    const SherpaOnnxAudioTagging *tagger) {
+  const SherpaOnnxOfflineStream *stream =
+      new SherpaOnnxOfflineStream(tagger->impl->CreateStream());
+  return stream;
+}
+
+// Runs the tagger on `s` and converts the events into a newly allocated,
+// nullptr-terminated array; each entry owns a copy of its event name.
+const SherpaOnnxAudioEvent *const *SherpaOnnxAudioTaggingCompute(
+    const SherpaOnnxAudioTagging *tagger, const SherpaOnnxOfflineStream *s,
+    int32_t top_k) {
+  std::vector<sherpa_onnx::AudioEvent> events =
+      tagger->impl->Compute(s->impl.get(), top_k);
+
+  int32_t n = static_cast<int32_t>(events.size());
+  SherpaOnnxAudioEvent **ans = new SherpaOnnxAudioEvent *[n + 1];
+  ans[n] = nullptr;
+
+  int32_t i = 0;
+  for (const auto &e : events) {
+    SherpaOnnxAudioEvent *p = new SherpaOnnxAudioEvent;
+
+    char *name = new char[e.name.size() + 1];
+    std::copy(e.name.begin(), e.name.end(), name);
+    name[e.name.size()] = 0;
+
+    p->name = name;
+
+    p->index = e.index;
+    p->prob = e.prob;
+
+    ans[i] = p;
+    i += 1;
+  }
+
+  return ans;
+}
+
+// Frees every event (and its name) up to the nullptr terminator, then the
+// array itself.
+void SherpaOnnxAudioTaggingFreeResults(
+    const SherpaOnnxAudioEvent *const *events) {
+  auto p = events;
+
+  while (p && *p) {
+    auto e = *p;
+
+    delete[] e->name;
+    delete e;
+
+    ++p;
+  }
+
+  delete[] events;
+}
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 276b3590..3833209a 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -427,7 +427,8 @@ SHERPA_ONNX_API SherpaOnnxOfflineStream *CreateOfflineStream(
 /// Destroy an offline stream.
 ///
 /// @param stream A pointer returned by CreateOfflineStream()
-SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
+SHERPA_ONNX_API void DestroyOfflineStream(
+    const SherpaOnnxOfflineStream *stream);
 
 /// Accept input audio samples and compute the features.
 /// The user has to invoke DecodeOfflineStream() to run the neural network and
@@ -442,9 +443,9 @@ SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
 /// @param n Number of elements in the samples array.
 ///
 /// @caution: For each offline stream, please invoke this function only once!
-SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
-                                           int32_t sample_rate,
-                                           const float *samples, int32_t n);
+SHERPA_ONNX_API void AcceptWaveformOffline(
+    const SherpaOnnxOfflineStream *stream, int32_t sample_rate,
+    const float *samples, int32_t n);
 /// Decode an offline stream.
 ///
 /// We assume you have invoked AcceptWaveformOffline() for the given stream
@@ -453,7 +454,8 @@ SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
 /// @param recognizer A pointer returned by CreateOfflineRecognizer().
 /// @param stream A pointer returned by CreateOfflineStream()
 SHERPA_ONNX_API void DecodeOfflineStream(
-    SherpaOnnxOfflineRecognizer *recognizer, SherpaOnnxOfflineStream *stream);
+    const SherpaOnnxOfflineRecognizer *recognizer,
+    const SherpaOnnxOfflineStream *stream);
 
 /// Decode a list offline streams in parallel.
 ///
@@ -1088,6 +1090,65 @@ SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(
 SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
     const char *const *names);
 
+// ============================================================
+// For audio tagging
+// ============================================================
+SHERPA_ONNX_API typedef struct
+    SherpaOnnxOfflineZipformerAudioTaggingModelConfig {
+  const char *model;
+} SherpaOnnxOfflineZipformerAudioTaggingModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingModelConfig {
+  SherpaOnnxOfflineZipformerAudioTaggingModelConfig zipformer;
+  int32_t num_threads;
+  int32_t debug;  // true to print debug information of the model
+  const char *provider;
+} SherpaOnnxAudioTaggingModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingConfig {
+  SherpaOnnxAudioTaggingModelConfig model;
+  const char *labels;
+  int32_t top_k;
+} SherpaOnnxAudioTaggingConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioEvent {
+  const char *name;
+  int32_t index;
+  float prob;
+} SherpaOnnxAudioEvent;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTagging SherpaOnnxAudioTagging;
+
+// Return NULL if the config is invalid. Otherwise, the user has to invoke
+// SherpaOnnxDestroyAudioTagging()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
+    const SherpaOnnxAudioTaggingConfig *config);
+
+SHERPA_ONNX_API void SherpaOnnxDestroyAudioTagging(
+    const SherpaOnnxAudioTagging *tagger);
+
+// The user has to invoke DestroyOfflineStream()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxOfflineStream *
+SherpaOnnxAudioTaggingCreateOfflineStream(const SherpaOnnxAudioTagging *tagger);
+
+// Return a newly allocated array of pointers to the detected events,
+// terminated by a null pointer; walk it until the null pointer is reached.
+// If top_k is -1, then config.top_k is used, where config is the config
+// used to create the input tagger.
+//
+// The ans[0]->prob has the largest probability among the array elements
+//
+// The user has to use SherpaOnnxAudioTaggingFreeResults()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxAudioEvent *const *
+SherpaOnnxAudioTaggingCompute(const SherpaOnnxAudioTagging *tagger,
+                              const SherpaOnnxOfflineStream *s, int32_t top_k);
+
+SHERPA_ONNX_API void SherpaOnnxAudioTaggingFreeResults(
+    const SherpaOnnxAudioEvent *const *events);
+
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif