Add C API for audio tagging (#754)

2024-04-11 14:18:43 +08:00
parent 34d70a259f
commit f204e62b44
9 changed files with 289 additions and 36 deletions
--- a/.github/scripts/test-c-api.sh
+++ b/.github/scripts/test-c-api.sh
@@ -10,8 +10,21 @@ log() {
 echo "SLID_EXE is $SLID_EXE"
 echo "SID_EXE is $SID_EXE"
 echo "AT_EXE is $AT_EXE"
 echo "PATH: $PATH"
 log "------------------------------------------------------------"
 log "Test audio tagging                                          "
 log "------------------------------------------------------------"
 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 $AT_EXE
 rm -rf sherpa-onnx-zipformer-audio-tagging-2024-04-09
 log "------------------------------------------------------------"
 log "Download whisper tiny for spoken language identification    "
--- a/.github/workflows/linux.yaml
+++ b/.github/workflows/linux.yaml
@@ -126,6 +126,16 @@ jobs:
          name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
          path: build/bin/*
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin:$PATH
          export SLID_EXE=spoken-language-identification-c-api
          export SID_EXE=speaker-identification-c-api
          export AT_EXE=audio-tagging-c-api
          .github/scripts/test-c-api.sh
      - name: Test Audio tagging
        shell: bash
        run: |
@@ -142,14 +152,6 @@ jobs:
          .github/scripts/test-online-ctc.sh
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin:$PATH
          export SLID_EXE=spoken-language-identification-c-api
          export SID_EXE=speaker-identification-c-api
          .github/scripts/test-c-api.sh
      - name: Test spoken language identification (C++ API)
        shell: bash
--- a/.github/workflows/macos.yaml
+++ b/.github/workflows/macos.yaml
@@ -105,6 +105,16 @@ jobs:
          otool -L build/bin/sherpa-onnx
          otool -l build/bin/sherpa-onnx
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin:$PATH
          export SLID_EXE=spoken-language-identification-c-api
          export SID_EXE=speaker-identification-c-api
          export AT_EXE=audio-tagging-c-api
          .github/scripts/test-c-api.sh
      - name: Test Audio tagging
        shell: bash
        run: |
@@ -113,15 +123,6 @@ jobs:
          .github/scripts/test-audio-tagging.sh
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin:$PATH
          export SLID_EXE=spoken-language-identification-c-api
          export SID_EXE=speaker-identification-c-api
          .github/scripts/test-c-api.sh
      - name: Test spoken language identification (C++ API)
        shell: bash
        run: |
--- a/.github/workflows/windows-x64.yaml
+++ b/.github/workflows/windows-x64.yaml
@@ -72,6 +72,17 @@ jobs:
          ls -lh ./bin/Release/sherpa-onnx.exe
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin/Release:$PATH
          export SLID_EXE=spoken-language-identification-c-api.exe
          export SID_EXE=speaker-identification-c-api.exe
          export AT_EXE=audio-tagging-c-api.exe
          .github/scripts/test-c-api.sh
      - name: Test Audio tagging
        shell: bash
        run: |
@@ -80,15 +91,6 @@ jobs:
          .github/scripts/test-audio-tagging.sh
      - name: Test C API
        shell: bash
        run: |
          export PATH=$PWD/build/bin/Release:$PATH
          export SLID_EXE=spoken-language-identification-c-api.exe
          export SID_EXE=speaker-identification-c-api.exe
          .github/scripts/test-c-api.sh
      - name: Test spoken language identification (C++ API)
        shell: bash
        run: |
--- a/.github/workflows/windows-x86.yaml
+++ b/.github/workflows/windows-x86.yaml
@@ -77,6 +77,8 @@ jobs:
        run: |
          export PATH=$PWD/build/bin/Release:$PATH
          export SLID_EXE=spoken-language-identification-c-api.exe
          export SID_EXE=speaker-identification-c-api.exe
          export AT_EXE=audio-tagging-c-api.exe
          .github/scripts/test-c-api.sh
--- a/c-api-examples/CMakeLists.txt
+++ b/c-api-examples/CMakeLists.txt
@@ -18,6 +18,9 @@ target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
 add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
 target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
 add_executable(audio-tagging-c-api audio-tagging-c-api.c)
 target_link_libraries(audio-tagging-c-api sherpa-onnx-c-api)
 if(SHERPA_ONNX_HAS_ALSA)
  add_subdirectory(./asr-microphone-example)
 elseif((UNIX AND NOT APPLE) OR LINUX)
--- a/c-api-examples/audio-tagging-c-api.c
+++ b/c-api-examples/audio-tagging-c-api.c
@@ -0,0 +1,79 @@
 // c-api-examples/audio-tagging-c-api.c
 //
 // Copyright (c)  2024  Xiaomi Corporation
 // We assume you have pre-downloaded the model files for testing
 // from https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
 //
 // An example is given below:
 //
 // clang-format off
 //
 // wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 // tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 // rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
 //
 // clang-format on
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "sherpa-onnx/c-api/c-api.h"
 // Minimal end-to-end demo of the audio tagging C API: create a tagger from the
 // pre-downloaded zipformer model, feed it one test wave, and print the
 // returned events to stderr.
 //
 // Returns 0 on success and -1 if the tagger cannot be created or the wave
 // file cannot be read.
 int32_t main() {
  SherpaOnnxAudioTaggingConfig config;
  // Zero the whole struct so that fields not set below (e.g. top_k) are 0/NULL.
  memset(&config, 0, sizeof(config));
  config.model.zipformer.model =
      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx";
  config.model.num_threads = 1;
  config.model.debug = 1;
  config.model.provider = "cpu";
  // clang-format off
  config.labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv";
  // clang-format on
  const SherpaOnnxAudioTagging *tagger = SherpaOnnxCreateAudioTagging(&config);
  if (!tagger) {
    fprintf(stderr,
            "Failed to create audio tagger. Please check your config\n");
    return -1;
  }
  // You can find more test waves from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
  const char *wav_filename =
      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav";
  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
  if (wave == NULL) {
    fprintf(stderr, "Failed to read %s\n", wav_filename);
    // The tagger was already created; release it before bailing out.
    SherpaOnnxDestroyAudioTagging(tagger);
    return -1;
  }
  const SherpaOnnxOfflineStream *stream =
      SherpaOnnxAudioTaggingCreateOfflineStream(tagger);
  AcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
                        wave->num_samples);
  int32_t top_k = 5;
  const SherpaOnnxAudioEvent *const *results =
      SherpaOnnxAudioTaggingCompute(tagger, stream, top_k);
  fprintf(stderr, "--------------------------------------------------\n");
  fprintf(stderr, "Index\t\tProbability\t\tEvent name\n");
  fprintf(stderr, "--------------------------------------------------\n");
  // The returned array is null-terminated. Stop at the terminator instead of
  // assuming exactly top_k entries, in case fewer events are returned.
  for (int32_t i = 0; results[i] != NULL; ++i) {
    fprintf(stderr, "%d\t\t%.3f\t\t\t%s\n", i, results[i]->prob,
            results[i]->name);
  }
  fprintf(stderr, "--------------------------------------------------\n");
  // Release everything in reverse order of acquisition.
  SherpaOnnxAudioTaggingFreeResults(results);
  DestroyOfflineStream(stream);
  SherpaOnnxFreeWave(wave);
  SherpaOnnxDestroyAudioTagging(tagger);
  return 0;
 }
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>
 #include "sherpa-onnx/csrc/audio-tagging.h"
 #include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/keyword-spotter.h"
@@ -400,15 +401,18 @@ SherpaOnnxOfflineStream *CreateOfflineStream(
  return stream;
 }
-void DestroyOfflineStream(SherpaOnnxOfflineStream *stream) { delete stream; }
+void DestroyOfflineStream(const SherpaOnnxOfflineStream *stream) {
  delete stream;
 }
-void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream, int32_t sample_rate,
+void AcceptWaveformOffline(const SherpaOnnxOfflineStream *stream,
-                           const float *samples, int32_t n) {
+                           int32_t sample_rate, const float *samples,
                           int32_t n) {
  stream->impl->AcceptWaveform(sample_rate, samples, n);
 }
-void DecodeOfflineStream(SherpaOnnxOfflineRecognizer *recognizer,
+void DecodeOfflineStream(const SherpaOnnxOfflineRecognizer *recognizer,
-                         SherpaOnnxOfflineStream *stream) {
+                         const SherpaOnnxOfflineStream *stream) {
  recognizer->impl->DecodeStream(stream->impl.get());
 }
@@ -1209,3 +1213,89 @@ void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
  delete[] names;
 }
 // Definition of the handle type exposed by the C API for audio tagging.
 // It only owns the underlying C++ sherpa_onnx::AudioTagging object.
 struct SherpaOnnxAudioTagging {
  std::unique_ptr<sherpa_onnx::AudioTagging> impl;
 };
 // Translate the C config into a sherpa_onnx::AudioTaggingConfig, validate it,
 // and wrap a newly constructed sherpa_onnx::AudioTagging in a heap-allocated
 // handle.
 //
 // Returns nullptr (after logging) when the translated config fails Validate().
 // The caller releases the handle with SherpaOnnxDestroyAudioTagging().
 const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
    const SherpaOnnxAudioTaggingConfig *config) {
  sherpa_onnx::AudioTaggingConfig cfg;
  // Top-level options.
  cfg.labels = SHERPA_ONNX_OR(config->labels, "");
  cfg.top_k = SHERPA_ONNX_OR(config->top_k, 5);
  // Model options.
  cfg.model.zipformer.model =
      SHERPA_ONNX_OR(config->model.zipformer.model, "");
  cfg.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
  cfg.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
  cfg.model.debug = config->model.debug;
  // In debug mode, dump the effective configuration before validating it.
  if (cfg.model.debug) {
    SHERPA_ONNX_LOGE("%s\n", cfg.ToString().c_str());
  }
  const bool is_valid = cfg.Validate();
  if (!is_valid) {
    SHERPA_ONNX_LOGE("Errors in config");
    return nullptr;
  }
  auto *handle = new SherpaOnnxAudioTagging;
  handle->impl = std::make_unique<sherpa_onnx::AudioTagging>(cfg);
  return handle;
 }
 // Free a handle returned by SherpaOnnxCreateAudioTagging(), together with the
 // sherpa_onnx::AudioTagging object it owns. Deleting a null pointer is a no-op.
 void SherpaOnnxDestroyAudioTagging(const SherpaOnnxAudioTagging *tagger) {
  delete tagger;
 }
 // Ask the tagger for a new stream and wrap it in a heap-allocated
 // SherpaOnnxOfflineStream handle that the caller owns.
 const SherpaOnnxOfflineStream *SherpaOnnxAudioTaggingCreateOfflineStream(
    const SherpaOnnxAudioTagging *tagger) {
  return new SherpaOnnxOfflineStream(tagger->impl->CreateStream());
 }
 // Run the tagger on stream `s` and convert the resulting C++ events into a
 // heap-allocated, null-terminated array of C structs.
 //
 // Each element and each element's name string is allocated separately; the
 // whole result is released with SherpaOnnxAudioTaggingFreeResults().
 const SherpaOnnxAudioEvent *const *SherpaOnnxAudioTaggingCompute(
    const SherpaOnnxAudioTagging *tagger, const SherpaOnnxOfflineStream *s,
    int32_t top_k) {
  std::vector<sherpa_onnx::AudioEvent> events =
      tagger->impl->Compute(s->impl.get(), top_k);
  const int32_t num_events = static_cast<int32_t>(events.size());
  // One extra slot for the terminating null pointer.
  SherpaOnnxAudioEvent **out = new SherpaOnnxAudioEvent *[num_events + 1];
  out[num_events] = nullptr;
  for (int32_t k = 0; k != num_events; ++k) {
    const auto &src = events[k];
    SherpaOnnxAudioEvent *dst = new SherpaOnnxAudioEvent;
    // Copy the event name into a NUL-terminated buffer owned by the result.
    const auto len = src.name.size();
    char *buf = new char[len + 1];
    std::copy(src.name.begin(), src.name.end(), buf);
    buf[len] = 0;
    dst->name = buf;
    dst->index = src.index;
    dst->prob = src.prob;
    out[k] = dst;
  }
  return out;
 }
 void SherpaOnnxAudioTaggingFreeResults(
    const SherpaOnnxAudioEvent *const *events) {
  auto p = events;
  while (p && *p) {
    auto e = *p;
    delete[] e->name;
    delete e;
    ++p;
  }
  delete[] events;
 }
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -427,7 +427,8 @@ SHERPA_ONNX_API SherpaOnnxOfflineStream *CreateOfflineStream(
 /// Destroy an offline stream.
 ///
 /// @param stream A pointer returned by CreateOfflineStream()
-SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
+SHERPA_ONNX_API void DestroyOfflineStream(
    const SherpaOnnxOfflineStream *stream);
 /// Accept input audio samples and compute the features.
 /// The user has to invoke DecodeOfflineStream() to run the neural network and
@@ -442,9 +443,9 @@ SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
 /// @param n  Number of elements in the samples array.
 ///
 /// @caution: For each offline stream, please invoke this function only once!
-SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
+SHERPA_ONNX_API void AcceptWaveformOffline(
-                                           int32_t sample_rate,
+    const SherpaOnnxOfflineStream *stream, int32_t sample_rate,
-                                           const float *samples, int32_t n);
+    const float *samples, int32_t n);
 /// Decode an offline stream.
 ///
 /// We assume you have invoked AcceptWaveformOffline() for the given stream
@@ -453,7 +454,8 @@ SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
 /// @param recognizer A pointer returned by CreateOfflineRecognizer().
 /// @param stream A pointer returned by CreateOfflineStream()
 SHERPA_ONNX_API void DecodeOfflineStream(
-    SherpaOnnxOfflineRecognizer *recognizer, SherpaOnnxOfflineStream *stream);
+    const SherpaOnnxOfflineRecognizer *recognizer,
    const SherpaOnnxOfflineStream *stream);
 /// Decode a list of offline streams in parallel.
 ///
@@ -1088,6 +1090,65 @@ SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(
 SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
    const char *const *names);
 // ============================================================
 // For audio tagging
 // ============================================================
 // Options specific to the zipformer audio tagging model.
 SHERPA_ONNX_API typedef struct
    SherpaOnnxOfflineZipformerAudioTaggingModelConfig {
  // Path to the model file, e.g. ".../model.int8.onnx" as used in
  // c-api-examples/audio-tagging-c-api.c
  const char *model;
 } SherpaOnnxOfflineZipformerAudioTaggingModelConfig;
 // Model-level options for audio tagging.
 //
 // NOTE(review): SherpaOnnxCreateAudioTagging() reads num_threads and provider
 // through SHERPA_ONNX_OR(..., 1) and SHERPA_ONNX_OR(..., "cpu"); presumably
 // these defaults apply when the field is left as 0/NULL -- confirm against the
 // macro definition.
 SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingModelConfig {
  SherpaOnnxOfflineZipformerAudioTaggingModelConfig zipformer;
  int32_t num_threads;
  // Non-zero to print debug information of the model; when set,
  // SherpaOnnxCreateAudioTagging() also logs the effective config.
  int32_t debug;  // true to print debug information of the model
  // Execution provider name, e.g. "cpu"
  const char *provider;
 } SherpaOnnxAudioTaggingModelConfig;
 // Top-level configuration passed to SherpaOnnxCreateAudioTagging().
 SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingConfig {
  SherpaOnnxAudioTaggingModelConfig model;
  // Path to the labels file, e.g. ".../class_labels_indices.csv" as used in
  // c-api-examples/audio-tagging-c-api.c
  const char *labels;
  // Number of events used when SherpaOnnxAudioTaggingCompute() is called with
  // top_k == -1.
  // NOTE(review): SherpaOnnxCreateAudioTagging() reads this through
  // SHERPA_ONNX_OR(..., 5); presumably 5 is used when it is left as 0 --
  // confirm against the macro definition.
  int32_t top_k;
 } SherpaOnnxAudioTaggingConfig;
 // One detected audio event, as returned by SherpaOnnxAudioTaggingCompute().
 SHERPA_ONNX_API typedef struct SherpaOnnxAudioEvent {
  // NUL-terminated event name. The memory is owned by the result array and
  // is released by SherpaOnnxAudioTaggingFreeResults().
  const char *name;
  // Index of the event, copied from the underlying C++ result.
  int32_t index;
  // Probability of the event, copied from the underlying C++ result.
  float prob;
 } SherpaOnnxAudioEvent;
 // Opaque handle to an audio tagger; the struct is defined in c-api.cc.
 SHERPA_ONNX_API typedef struct SherpaOnnxAudioTagging SherpaOnnxAudioTagging;
 // Create an audio tagger from the given config.
 //
 // Returns NULL if the config fails validation.
 //
 // The user has to invoke
 // SherpaOnnxDestroyAudioTagging()
 // to free the returned pointer to avoid memory leak
 SHERPA_ONNX_API const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
    const SherpaOnnxAudioTaggingConfig *config);
 // Free a tagger returned by SherpaOnnxCreateAudioTagging().
 SHERPA_ONNX_API void SherpaOnnxDestroyAudioTagging(
    const SherpaOnnxAudioTagging *tagger);
 // Create an offline stream for the given tagger. Feed samples to it with
 // AcceptWaveformOffline() before calling SherpaOnnxAudioTaggingCompute().
 //
 // The user has to invoke DestroyOfflineStream()
 // to free the returned pointer to avoid memory leak
 SHERPA_ONNX_API const SherpaOnnxOfflineStream *
 SherpaOnnxAudioTaggingCreateOfflineStream(const SherpaOnnxAudioTagging *tagger);
 // Run audio tagging on the given stream and return the detected events.
 //
 // If top_k is -1, then config.top_k is used, where config is the config
 // used to create the input tagger.
 //
 // Return an array of pointers: one entry per returned event, followed by a
 // null pointer that terminates the array. Iterate until the null pointer
 // rather than assuming that exactly top_k entries are present.
 //
 // The ans[0]->prob has the largest probability among the array elements
 // The last element of the array is a null pointer
 //
 // The user has to use SherpaOnnxAudioTaggingFreeResults()
 // to free the returned pointer to avoid memory leak
 SHERPA_ONNX_API const SherpaOnnxAudioEvent *const *
 SherpaOnnxAudioTaggingCompute(const SherpaOnnxAudioTagging *tagger,
                               const SherpaOnnxOfflineStream *s, int32_t top_k);
 // Free an array returned by SherpaOnnxAudioTaggingCompute(), including every
 // event in it and every event name. Passing NULL is allowed.
 SHERPA_ONNX_API void SherpaOnnxAudioTaggingFreeResults(
    const SherpaOnnxAudioEvent *const *p);
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif