Add C API for audio tagging (#754)

2024-04-11 14:18:43 +08:00
parent 34d70a259f
commit f204e62b44
9 changed files with 289 additions and 36 deletions
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>

+#include "sherpa-onnx/csrc/audio-tagging.h"
 #include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/keyword-spotter.h"
@@ -400,15 +401,18 @@ SherpaOnnxOfflineStream *CreateOfflineStream(
  return stream;
 }

-void DestroyOfflineStream(SherpaOnnxOfflineStream *stream) { delete stream; }
+void DestroyOfflineStream(const SherpaOnnxOfflineStream *stream) {
+  delete stream;  // deleting through a pointer-to-const is valid C++
+}

-void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream, int32_t sample_rate,
-                           const float *samples, int32_t n) {
+void AcceptWaveformOffline(const SherpaOnnxOfflineStream *stream,
+                           int32_t sample_rate, const float *samples,
+                           int32_t n) {  // n: number of floats in samples
   stream->impl->AcceptWaveform(sample_rate, samples, n);
 }

-void DecodeOfflineStream(SherpaOnnxOfflineRecognizer *recognizer,
-                         SherpaOnnxOfflineStream *stream) {
+void DecodeOfflineStream(const SherpaOnnxOfflineRecognizer *recognizer,
+                         const SherpaOnnxOfflineStream *stream) {  // one stream
   recognizer->impl->DecodeStream(stream->impl.get());
 }

@@ -1209,3 +1213,89 @@ void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(

  delete[] names;
 }
+
+struct SherpaOnnxAudioTagging {  // handle type forward-declared in c-api.h
+  std::unique_ptr<sherpa_onnx::AudioTagging> impl;  // owned C++ tagger
+};
+
+const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
+    const SherpaOnnxAudioTaggingConfig *config) {  // nullptr if invalid
+  sherpa_onnx::AudioTaggingConfig ac;  // C++ config filled from *config
+  ac.model.zipformer.model = SHERPA_ONNX_OR(config->model.zipformer.model, "");
+  ac.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
+  ac.model.debug = config->model.debug;
+  ac.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
+  ac.labels = SHERPA_ONNX_OR(config->labels, "");
+  ac.top_k = SHERPA_ONNX_OR(config->top_k, 5);
+  // SHERPA_ONNX_OR(x, d) presumably yields d when x is unset -- confirm.
+  if (ac.model.debug) {
+    SHERPA_ONNX_LOGE("%s\n", ac.ToString().c_str());  // dumps the config
+  }
+  // Bail out before the tagger is created when validation fails.
+  if (!ac.Validate()) {
+    SHERPA_ONNX_LOGE("Errors in config");
+    return nullptr;
+  }
+  // The caller owns the handle; free it with SherpaOnnxDestroyAudioTagging().
+  SherpaOnnxAudioTagging *tagger = new SherpaOnnxAudioTagging;
+  tagger->impl = std::make_unique<sherpa_onnx::AudioTagging>(ac);
+
+  return tagger;
+}
+
+void SherpaOnnxDestroyAudioTagging(const SherpaOnnxAudioTagging *tagger) {
+  delete tagger;  // the unique_ptr member releases the C++ tagger as well
+}
+
+const SherpaOnnxOfflineStream *SherpaOnnxAudioTaggingCreateOfflineStream(
+    const SherpaOnnxAudioTagging *tagger) {  // free via DestroyOfflineStream()
+  const SherpaOnnxOfflineStream *stream =
+      new SherpaOnnxOfflineStream(tagger->impl->CreateStream());
+  return stream;
+}
+
+const SherpaOnnxAudioEvent *const *SherpaOnnxAudioTaggingCompute(
+    const SherpaOnnxAudioTagging *tagger, const SherpaOnnxOfflineStream *s,
+    int32_t top_k) {  // result: null-terminated array, caller frees
+  std::vector<sherpa_onnx::AudioEvent> events =
+      tagger->impl->Compute(s->impl.get(), top_k);
+  // Copy the C++ results into plain C structs the caller can read.
+  int32_t n = static_cast<int32_t>(events.size());
+  SherpaOnnxAudioEvent **ans = new SherpaOnnxAudioEvent *[n + 1];
+  ans[n] = nullptr;  // sentinel: lets the caller find the end of the array
+
+  int32_t i = 0;
+  for (const auto &e : events) {
+    SherpaOnnxAudioEvent *p = new SherpaOnnxAudioEvent;
+    // Duplicate the name as a NUL-terminated C string owned by the event.
+    char *name = new char[e.name.size() + 1];
+    std::copy(e.name.begin(), e.name.end(), name);
+    name[e.name.size()] = 0;
+
+    p->name = name;
+
+    p->index = e.index;
+    p->prob = e.prob;
+
+    ans[i] = p;
+    i += 1;
+  }
+
+  return ans;  // release with SherpaOnnxAudioTaggingFreeResults()
+}
+
+void SherpaOnnxAudioTaggingFreeResults(
+    const SherpaOnnxAudioEvent *const *events) {  // a null pointer is accepted
+  auto p = events;
+  // Walk up to the null sentinel, freeing each event and its name string.
+  while (p && *p) {
+    auto e = *p;
+
+    delete[] e->name;
+    delete e;
+
+    ++p;
+  }
+
+  delete[] events;  // finally release the pointer array itself
+}
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -427,7 +427,8 @@ SHERPA_ONNX_API SherpaOnnxOfflineStream *CreateOfflineStream(
 /// Destroy an offline stream.
 ///
 /// @param stream A pointer returned by CreateOfflineStream()
-SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
+SHERPA_ONNX_API void DestroyOfflineStream(
+    const SherpaOnnxOfflineStream *stream);

 /// Accept input audio samples and compute the features.
 /// The user has to invoke DecodeOfflineStream() to run the neural network and
@@ -442,9 +443,9 @@ SHERPA_ONNX_API void DestroyOfflineStream(SherpaOnnxOfflineStream *stream);
 /// @param n  Number of elements in the samples array.
 ///
 /// @caution: For each offline stream, please invoke this function only once!
-SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
-                                           int32_t sample_rate,
-                                           const float *samples, int32_t n);
+SHERPA_ONNX_API void AcceptWaveformOffline(
+    const SherpaOnnxOfflineStream *stream, int32_t sample_rate,
+    const float *samples, int32_t n);
 /// Decode an offline stream.
 ///
 /// We assume you have invoked AcceptWaveformOffline() for the given stream
@@ -453,7 +454,8 @@ SHERPA_ONNX_API void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream,
 /// @param recognizer A pointer returned by CreateOfflineRecognizer().
 /// @param stream A pointer returned by CreateOfflineStream()
 SHERPA_ONNX_API void DecodeOfflineStream(
-    SherpaOnnxOfflineRecognizer *recognizer, SherpaOnnxOfflineStream *stream);
+    const SherpaOnnxOfflineRecognizer *recognizer,
+    const SherpaOnnxOfflineStream *stream);

 /// Decode a list offline streams in parallel.
 ///
@@ -1088,6 +1090,65 @@ SherpaOnnxSpeakerEmbeddingManagerGetAllSpeakers(
 SHERPA_ONNX_API void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
    const char *const *names);

+// ============================================================
+// For audio tagging
+// ============================================================
+SHERPA_ONNX_API typedef struct
+    SherpaOnnxOfflineZipformerAudioTaggingModelConfig {
+  const char *model;  // zipformer model, presumably a file path -- confirm
+} SherpaOnnxOfflineZipformerAudioTaggingModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingModelConfig {
+  SherpaOnnxOfflineZipformerAudioTaggingModelConfig zipformer;
+  int32_t num_threads;  // fallback supplied in c-api.cc: 1
+  int32_t debug;  // non-zero to print debug information of the model
+  const char *provider;  // fallback supplied in c-api.cc: "cpu"
+} SherpaOnnxAudioTaggingModelConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTaggingConfig {
+  SherpaOnnxAudioTaggingModelConfig model;
+  const char *labels;  // presumably a labels file path -- confirm
+  int32_t top_k;  // used when Compute() gets top_k == -1; fallback here: 5
+} SherpaOnnxAudioTaggingConfig;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioEvent {
+  const char *name;  // NUL-terminated, heap-allocated by the library
+  int32_t index;  // presumably the index of the label -- confirm
+  float prob;  // results are ordered so that element 0 has the largest prob
+} SherpaOnnxAudioEvent;
+
+SHERPA_ONNX_API typedef struct SherpaOnnxAudioTagging SherpaOnnxAudioTagging;
+
+// The user has to invoke
+// SherpaOnnxDestroyAudioTagging()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
+    const SherpaOnnxAudioTaggingConfig *config);
+
+SHERPA_ONNX_API void SherpaOnnxDestroyAudioTagging(
+    const SherpaOnnxAudioTagging *tagger);
+
+// The user has to invoke DestroyOfflineStream()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxOfflineStream *
+SherpaOnnxAudioTaggingCreateOfflineStream(const SherpaOnnxAudioTagging *tagger);
+
+// Return an array of pointers: one per event found, plus a trailing entry
+// (nominally top_k + 1 in total). If top_k is -1, then config.top_k is
+// used, where config is the config used to create the input tagger.
+//
+// The ans[0]->prob has the largest probability among the array elements
+// The last element of the array is a null pointer
+//
+// The user has to use SherpaOnnxAudioTaggingFreeResults()
+// to free the returned pointer to avoid memory leak
+SHERPA_ONNX_API const SherpaOnnxAudioEvent *const *
+SherpaOnnxAudioTaggingCompute(const SherpaOnnxAudioTagging *tagger,
+                              const SherpaOnnxOfflineStream *s, int32_t top_k);
+
+SHERPA_ONNX_API void SherpaOnnxAudioTaggingFreeResults(
+    const SherpaOnnxAudioEvent *const *p);
+
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif