Add C API for streaming HLG decoding (#734)
This commit is contained in:
@@ -15,6 +15,9 @@ target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
|
||||
# Speaker identification example
add_executable(speaker-identification-c-api speaker-identification-c-api.c)
target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)

# Streaming HLG decoding example (zipformer2 CTC + HLG graph)
add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
if(SHERPA_ONNX_HAS_ALSA)
|
||||
add_subdirectory(./asr-microphone-example)
|
||||
elseif((UNIX AND NOT APPLE) OR LINUX)
|
||||
|
||||
130  c-api-examples/streaming-hlg-decode-file-c-api.c  (new file)
@@ -0,0 +1,130 @@
|
||||
// c-api-examples/streaming-hlg-decode-file-c-api.c
|
||||
//
|
||||
// Copyright (c) 2024 Xiaomi Corporation
|
||||
/*
|
||||
We use the following model as an example
|
||||
|
||||
// clang-format off
|
||||
|
||||
Download the model from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
|
||||
build/bin/streaming-hlg-decode-file-c-api
|
||||
|
||||
(The above model is from https://github.com/k2-fsa/icefall/pull/1557)
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sherpa-onnx/c-api/c-api.h"
|
||||
|
||||
int32_t main() {
|
||||
// clang-format off
|
||||
//
|
||||
// Please download the model from
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
|
||||
const char *model = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx";
|
||||
const char *tokens = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt";
|
||||
const char *graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
|
||||
const char *wav_filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
|
||||
// clang-format on
|
||||
|
||||
SherpaOnnxOnlineRecognizerConfig config;
|
||||
|
||||
memset(&config, 0, sizeof(config));
|
||||
config.feat_config.sample_rate = 16000;
|
||||
config.feat_config.feature_dim = 80;
|
||||
config.model_config.zipformer2_ctc.model = model;
|
||||
config.model_config.tokens = tokens;
|
||||
config.model_config.num_threads = 1;
|
||||
config.model_config.provider = "cpu";
|
||||
config.model_config.debug = 0;
|
||||
config.ctc_fst_decoder_config.graph = graph;
|
||||
const SherpaOnnxOnlineRecognizer *recognizer =
|
||||
CreateOnlineRecognizer(&config);
|
||||
if (!recognizer) {
|
||||
fprintf(stderr, "Failed to create recognizer");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineStream *stream = CreateOnlineStream(recognizer);
|
||||
|
||||
const SherpaOnnxDisplay *display = CreateDisplay(50);
|
||||
int32_t segment_id = 0;
|
||||
|
||||
const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
|
||||
if (wave == NULL) {
|
||||
fprintf(stderr, "Failed to read %s\n", wav_filename);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// simulate streaming. You can choose an arbitrary N
|
||||
#define N 3200
|
||||
|
||||
int16_t buffer[N];
|
||||
float samples[N];
|
||||
fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
|
||||
wave->sample_rate, wave->num_samples,
|
||||
(float)wave->num_samples / wave->sample_rate);
|
||||
|
||||
int32_t k = 0;
|
||||
while (k < wave->num_samples) {
|
||||
int32_t start = k;
|
||||
int32_t end =
|
||||
(start + N > wave->num_samples) ? wave->num_samples : (start + N);
|
||||
k += N;
|
||||
|
||||
AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
|
||||
end - start);
|
||||
while (IsOnlineStreamReady(recognizer, stream)) {
|
||||
DecodeOnlineStream(recognizer, stream);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineRecognizerResult *r =
|
||||
GetOnlineStreamResult(recognizer, stream);
|
||||
|
||||
if (strlen(r->text)) {
|
||||
SherpaOnnxPrint(display, segment_id, r->text);
|
||||
}
|
||||
|
||||
if (IsEndpoint(recognizer, stream)) {
|
||||
if (strlen(r->text)) {
|
||||
++segment_id;
|
||||
}
|
||||
Reset(recognizer, stream);
|
||||
}
|
||||
|
||||
DestroyOnlineRecognizerResult(r);
|
||||
}
|
||||
|
||||
// add some tail padding
|
||||
float tail_paddings[4800] = {0}; // 0.3 seconds at 16 kHz sample rate
|
||||
AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);
|
||||
|
||||
SherpaOnnxFreeWave(wave);
|
||||
|
||||
InputFinished(stream);
|
||||
while (IsOnlineStreamReady(recognizer, stream)) {
|
||||
DecodeOnlineStream(recognizer, stream);
|
||||
}
|
||||
|
||||
const SherpaOnnxOnlineRecognizerResult *r =
|
||||
GetOnlineStreamResult(recognizer, stream);
|
||||
|
||||
if (strlen(r->text)) {
|
||||
SherpaOnnxPrint(display, segment_id, r->text);
|
||||
}
|
||||
|
||||
DestroyOnlineRecognizerResult(r);
|
||||
|
||||
DestroyDisplay(display);
|
||||
DestroyOnlineStream(stream);
|
||||
DestroyOnlineRecognizer(recognizer);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user