Add C API for spoken language identification. (#695)

2024-03-25 15:16:47 +08:00
parent 0d258dd150
commit ab7cff2513
18 changed files with 366 additions and 70 deletions
--- a/c-api-examples/CMakeLists.txt
+++ b/c-api-examples/CMakeLists.txt
@@ -7,8 +7,11 @@ target_link_libraries(decode-file-c-api sherpa-onnx-c-api cargs)
 add_executable(offline-tts-c-api offline-tts-c-api.c)
 target_link_libraries(offline-tts-c-api sherpa-onnx-c-api cargs)

+add_executable(spoken-language-identification-c-api spoken-language-identification-c-api.c)
+target_link_libraries(spoken-language-identification-c-api sherpa-onnx-c-api)
+
 if(SHERPA_ONNX_HAS_ALSA)
  add_subdirectory(./asr-microphone-example)
-else()
+elseif((UNIX AND NOT APPLE) OR LINUX)
  message(WARNING "Not include ./asr-microphone-example since alsa is not available")
 endif()
--- a/c-api-examples/Makefile
+++ b/c-api-examples/Makefile
@@ -4,7 +4,7 @@ CUR_DIR :=$(shell pwd)
 CFLAGS := -I ../ -I ../build/_deps/cargs-src/include/
 LDFLAGS := -L ../build/lib
 LDFLAGS += -L ../build/_deps/onnxruntime-src/lib
-LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lcargs
+LDFLAGS += -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -lonnxruntime
 LDFLAGS += -framework Foundation
 LDFLAGS += -lc++
 LDFLAGS += -Wl,-rpath,${CUR_DIR}/../build/lib
--- a/c-api-examples/decode-file-c-api.c
+++ b/c-api-examples/decode-file-c-api.c
@@ -169,55 +169,56 @@ int32_t main(int32_t argc, char *argv[]) {
  int32_t segment_id = 0;

  const char *wav_filename = argv[context.index];
-  FILE *fp = fopen(wav_filename, "rb");
-  if (!fp) {
-    fprintf(stderr, "Failed to open %s\n", wav_filename);
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
    return -1;
  }
-
-  // Assume the wave header occupies 44 bytes.
-  fseek(fp, 44, SEEK_SET);
-
  // simulate streaming

 #define N 3200  // 0.2 s. Sample rate is fixed to 16 kHz

  int16_t buffer[N];
  float samples[N];
+  fprintf(stderr, "sample rate: %d, num samples: %d, duration: %.2f s\n",
+          wave->sample_rate, wave->num_samples,
+          (float)wave->num_samples / wave->sample_rate);

-  while (!feof(fp)) {
-    size_t n = fread((void *)buffer, sizeof(int16_t), N, fp);
-    if (n > 0) {
-      for (size_t i = 0; i != n; ++i) {
-        samples[i] = buffer[i] / 32768.;
-      }
-      AcceptWaveform(stream, 16000, samples, n);
-      while (IsOnlineStreamReady(recognizer, stream)) {
-        DecodeOnlineStream(recognizer, stream);
-      }
+  int32_t k = 0;
+  while (k < wave->num_samples) {
+    int32_t start = k;
+    int32_t end =
+        (start + N > wave->num_samples) ? wave->num_samples : (start + N);
+    k += N;

-      const SherpaOnnxOnlineRecognizerResult *r =
-          GetOnlineStreamResult(recognizer, stream);
-
-      if (strlen(r->text)) {
-        SherpaOnnxPrint(display, segment_id, r->text);
-      }
-
-      if (IsEndpoint(recognizer, stream)) {
-        if (strlen(r->text)) {
-          ++segment_id;
-        }
-        Reset(recognizer, stream);
-      }
-
-      DestroyOnlineRecognizerResult(r);
+    AcceptWaveform(stream, wave->sample_rate, wave->samples + start,
+                   end - start);
+    while (IsOnlineStreamReady(recognizer, stream)) {
+      DecodeOnlineStream(recognizer, stream);
    }
+
+    const SherpaOnnxOnlineRecognizerResult *r =
+        GetOnlineStreamResult(recognizer, stream);
+
+    if (strlen(r->text)) {
+      SherpaOnnxPrint(display, segment_id, r->text);
+    }
+
+    if (IsEndpoint(recognizer, stream)) {
+      if (strlen(r->text)) {
+        ++segment_id;
+      }
+      Reset(recognizer, stream);
+    }
+
+    DestroyOnlineRecognizerResult(r);
  }
-  fclose(fp);

  // add some tail padding
  float tail_paddings[4800] = {0};  // 0.3 seconds at 16 kHz sample rate
-  AcceptWaveform(stream, 16000, tail_paddings, 4800);
+  AcceptWaveform(stream, wave->sample_rate, tail_paddings, 4800);
+
+  SherpaOnnxFreeWave(wave);

  InputFinished(stream);
  while (IsOnlineStreamReady(recognizer, stream)) {
--- a/c-api-examples/spoken-language-identification-c-api.c
+++ b/c-api-examples/spoken-language-identification-c-api.c
@@ -0,0 +1,65 @@
+
+// We assume you have pre-downloaded the whisper multi-lingual models
+// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+// An example command to download the "tiny" whisper model is given below:
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+// tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+// rm sherpa-onnx-whisper-tiny.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Detects the language spoken in one wave file and prints it to stderr.
+// Returns 0 on success, -1 if the identifier or the wave cannot be loaded.
+int32_t main() {
+  SherpaOnnxSpokenLanguageIdentificationConfig config;
+  // Zero all bytes first so that fields not assigned below are 0/NULL.
+  memset(&config, 0, sizeof(config));
+
+  config.whisper.encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx";
+  config.whisper.decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx";
+  config.num_threads = 1;
+  config.debug = 1;
+  config.provider = "cpu";
+
+  const SherpaOnnxSpokenLanguageIdentification *slid =
+      SherpaOnnxCreateSpokenLanguageIdentification(&config);
+  if (!slid) {
+    fprintf(stderr, "Failed to create spoken language identifier\n");
+    return -1;
+  }
+
+  // You can find more test waves from
+  // https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/tree/main/test_wavs
+  const char *wav_filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    SherpaOnnxDestroySpokenLanguageIdentification(slid);  // do not leak slid
+    return -1;
+  }
+
+  SherpaOnnxOfflineStream *stream =
+      SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid);
+  // Hand all samples of the wave to the stream in a single call.
+  AcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
+                        wave->num_samples);
+
+  const SherpaOnnxSpokenLanguageIdentificationResult *result =
+      SherpaOnnxSpokenLanguageIdentificationCompute(slid, stream);
+  fprintf(stderr, "wav_filename: %s\n", wav_filename);
+  fprintf(stderr, "Detected language: %s\n", result->lang);
+  // Release everything acquired above: result, stream, wave, identifier.
+  SherpaOnnxDestroySpokenLanguageIdentificationResult(result);
+  DestroyOfflineStream(stream);
+  SherpaOnnxFreeWave(wave);
+  SherpaOnnxDestroySpokenLanguageIdentification(slid);
+  return 0;
+}