Fix various language binding APIs for tdnn and whisper models (#278)

2023-08-16 22:15:10 +08:00
parent 3ab135c1eb
commit e31f9e48c2
16 changed files with 249 additions and 14 deletions
--- a/.github/workflows/go.yaml
+++ b/.github/workflows/go.yaml
@@ -67,7 +67,7 @@ jobs:
          ls -lh
          go mod tidy
          cat go.mod
-          go build -x
+          go build
          ls -lh

          git lfs install
@@ -87,6 +87,19 @@ jobs:
          ./run-nemo-ctc.sh
          rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium

+          echo "Test Whisper tiny.en"
+          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en
+          cd sherpa-onnx-whisper-tiny.en
+          git lfs pull --include "*.onnx"
+          cd ..
+          ./run-whisper.sh
+          rm -rf sherpa-onnx-whisper-tiny.en
+
+          echo "Test Tdnn yesno"
+          git clone https://huggingface.co/csukuangfj/sherpa-onnx-tdnn-yesno
+          ./run-tdnn-yesno.sh
+          rm -rf sherpa-onnx-tdnn-yesno
+
      - name: Test non-streaming decoding files (Win64)
        if: matrix.os == 'windows-latest' && matrix.arch == 'x64'
        shell: bash
@@ -121,6 +134,19 @@ jobs:
          ./run-nemo-ctc.sh
          rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium

+          echo "Test Whisper tiny.en"
+          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en
+          cd sherpa-onnx-whisper-tiny.en
+          git lfs pull --include "*.onnx"
+          cd ..
+          ./run-whisper.sh
+          rm -rf sherpa-onnx-whisper-tiny.en
+
+          echo "Test Tdnn yesno"
+          git clone https://huggingface.co/csukuangfj/sherpa-onnx-tdnn-yesno
+          ./run-tdnn-yesno.sh
+          rm -rf sherpa-onnx-tdnn-yesno
+
      - name: Test non-streaming decoding files (Win32)
        if: matrix.os == 'windows-latest' && matrix.arch == 'x86'
        shell: bash
@@ -139,7 +165,7 @@ jobs:
          go env

          go clean
-          go build -x
+          go build

          echo $PWD
          ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
@@ -163,6 +189,19 @@ jobs:
          ./run-nemo-ctc.sh
          rm -rf sherpa-onnx-nemo-ctc-en-conformer-medium

+          echo "Test Whisper tiny.en"
+          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en
+          cd sherpa-onnx-whisper-tiny.en
+          git lfs pull --include "*.onnx"
+          cd ..
+          ./run-whisper.sh
+          rm -rf sherpa-onnx-whisper-tiny.en
+
+          echo "Test Tdnn yesno"
+          git clone https://huggingface.co/csukuangfj/sherpa-onnx-tdnn-yesno
+          ./run-tdnn-yesno.sh
+          rm -rf sherpa-onnx-tdnn-yesno
+
      - name: Test streaming decoding files (Linux/macOS)
        if: matrix.os != 'windows-latest'
        shell: bash
@@ -171,7 +210,7 @@ jobs:
          ls -lh
          go mod tidy
          cat go.mod
-          go build -x
+          go build
          ls -lh

          git lfs install
@@ -233,7 +272,7 @@ jobs:
          go env

          go clean
-          go build -x
+          go build

          echo $PWD
          ls -lh /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/
--- a/.github/workflows/test-dot-net.yaml
+++ b/.github/workflows/test-dot-net.yaml
@@ -72,3 +72,5 @@ jobs:
          ./run-nemo-ctc.sh
          ./run-paraformer.sh
          ./run-zipformer.sh
+          ./run-whisper.sh
+          ./run-tdnn-yesno.sh
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)

-set(SHERPA_ONNX_VERSION "1.7.7")
+set(SHERPA_ONNX_VERSION "1.7.8")

 # Disable warning about
 #
--- a/dotnet-examples/offline-decode-files/Program.cs
+++ b/dotnet-examples/offline-decode-files/Program.cs
@@ -15,18 +15,35 @@ class OfflineDecodeFiles
 {
  class Options
  {
+
+    [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
+    public int SampleRate { get; set; }
+
+    [Option("feat-dim", Required = false, Default = 80, HelpText = "Dimension of the features used to train the model")]
+    public int FeatureDim { get; set; }
+
    [Option(Required = false, HelpText = "Path to tokens.txt")]
    public string Tokens { get; set; }

-    [Option(Required = false, HelpText = "Path to encoder.onnx. Used only for transducer models")]
+    [Option(Required = false, Default = "", HelpText = "Path to transducer encoder.onnx. Used only for transducer models")]
    public string Encoder { get; set; }

-    [Option(Required = false, HelpText = "Path to decoder.onnx. Used only for transducer models")]
+    [Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")]
    public string Decoder { get; set; }

-    [Option(Required = false, HelpText = "Path to joiner.onnx. Used only for transducer models")]
+    [Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
     public string Joiner { get; set; }

+    [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")]
+    public string WhisperEncoder { get; set; }
+
+    [Option("whisper-decoder", Required = false, Default = "", HelpText = "Path to whisper decoder.onnx. Used only for whisper models")]
+    public string WhisperDecoder { get; set; }
+
+    [Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")]
+    public string TdnnModel { get; set; }
+
+
    [Option(Required = false, HelpText = "Path to model.onnx. Used only for paraformer models")]
    public string Paraformer { get; set; }

@@ -105,6 +122,38 @@ dotnet run \
 Please refer to
 https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/index.html
 to download pre-trained paraformer models
+
+# Whisper
+
+dotnet run \
+  --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \
+  --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \
+  --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \
+  --files ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \
+  ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \
+  ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav
+
+Please refer to
+https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html
+to download pre-trained whisper models.
+
+# Tdnn yesno
+
+dotnet run \
+  --sample-rate=8000 \
+  --feat-dim=23 \
+  --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt \
+  --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \
+  --files ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_0_1_0.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_1_1_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_1_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_1_1_0.wav
+
+Please refer to
+https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/yesno/index.html
+to download pre-trained Tdnn models.
 ";

    var helpText = HelpText.AutoBuild(result, h =>
@@ -120,6 +169,9 @@ to download pre-trained paraformer models
  private static void Run(Options options)
  {
    OfflineRecognizerConfig config = new OfflineRecognizerConfig();
+    config.FeatConfig.SampleRate = options.SampleRate;
+    config.FeatConfig.FeatureDim = options.FeatureDim;
+
    config.ModelConfig.Tokens = options.Tokens;

    if (!String.IsNullOrEmpty(options.Encoder))
@@ -137,6 +189,15 @@ to download pre-trained paraformer models
    {
      config.ModelConfig.NeMoCtc.Model = options.NeMoCtc;
    }
+    else if (!String.IsNullOrEmpty(options.WhisperEncoder))
+    {
+      config.ModelConfig.Whisper.Encoder = options.WhisperEncoder;
+      config.ModelConfig.Whisper.Decoder = options.WhisperDecoder;
+    }
+    else if (!String.IsNullOrEmpty(options.TdnnModel))
+    {
+      config.ModelConfig.Tdnn.Model = options.TdnnModel;
+    }
    else
    {
      Console.WriteLine("Please provide a model");
--- a/dotnet-examples/offline-decode-files/run-tdnn-yesno.sh
+++ b/dotnet-examples/offline-decode-files/run-tdnn-yesno.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+if [ ! -d ./sherpa-onnx-tdnn-yesno ]; then
+  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-tdnn-yesno
+  cd sherpa-onnx-tdnn-yesno
+  git lfs pull --include "*.onnx"
+  cd ..
+fi
+
+dotnet run \
+  --sample-rate=8000 \
+  --feat-dim=23 \
+  --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt \
+  --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \
+  --files ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_0_1_0.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_0_1_1_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_0_1_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_0_0_1.wav \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_1_1_0_1_1_0.wav
--- a/dotnet-examples/offline-decode-files/run-whisper.sh
+++ b/dotnet-examples/offline-decode-files/run-whisper.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+if [ ! -d ./sherpa-onnx-whisper-tiny.en ]; then
+  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-tiny.en
+  cd sherpa-onnx-whisper-tiny.en
+  git lfs pull --include "*.onnx"
+  cd ..
+fi
+
+dotnet run \
+  --num-threads=2 \
+  --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \
+  --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \
+  --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \
+  --files ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav \
+  ./sherpa-onnx-whisper-tiny.en/test_wavs/1.wav \
+  ./sherpa-onnx-whisper-tiny.en/test_wavs/8k.wav
--- a/go-api-examples/non-streaming-decode-files/main.go
+++ b/go-api-examples/non-streaming-decode-files/main.go
@@ -15,13 +15,23 @@ func main() {
 	log.SetFlags(log.LstdFlags | log.Lmicroseconds)

 	config := sherpa.OfflineRecognizerConfig{}
-	config.FeatConfig = sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80}

-	flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the encoder model")
-	flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the decoder model")
+	flag.IntVar(&config.FeatConfig.SampleRate, "sample-rate", 16000, "Sample rate of the data used to train the model")
+	flag.IntVar(&config.FeatConfig.FeatureDim, "feat-dim", 80, "Dimension of the features used to train the model")
+
+	flag.StringVar(&config.ModelConfig.Transducer.Encoder, "encoder", "", "Path to the transducer encoder model")
+	flag.StringVar(&config.ModelConfig.Transducer.Decoder, "decoder", "", "Path to the transducer decoder model")
 	flag.StringVar(&config.ModelConfig.Transducer.Joiner, "joiner", "", "Path to the joiner model")
+
 	flag.StringVar(&config.ModelConfig.Paraformer.Model, "paraformer", "", "Path to the paraformer model")
+
 	flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model")
+
+	flag.StringVar(&config.ModelConfig.Whisper.Encoder, "whisper-encoder", "", "Path to the whisper encoder model")
+	flag.StringVar(&config.ModelConfig.Whisper.Decoder, "whisper-decoder", "", "Path to the whisper decoder model")
+
+	flag.StringVar(&config.ModelConfig.Tdnn.Model, "tdnn-model", "", "Path to the tdnn model")
+
 	flag.StringVar(&config.ModelConfig.Tokens, "tokens", "", "Path to the tokens file")
 	flag.IntVar(&config.ModelConfig.NumThreads, "num-threads", 1, "Number of threads for computing")
 	flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message")
--- a/go-api-examples/non-streaming-decode-files/run-nemo-ctc.sh
+++ b/go-api-examples/non-streaming-decode-files/run-nemo-ctc.sh
@@ -5,7 +5,7 @@
 # to download the model
 # before you run this script.
 #
-# You can switch to a different online model if you need
+# You can switch to a different offline model if you need

 ./non-streaming-decode-files \
  --nemo-ctc ./sherpa-onnx-nemo-ctc-en-conformer-medium/model.onnx \
--- a/go-api-examples/non-streaming-decode-files/run-paraformer.sh
+++ b/go-api-examples/non-streaming-decode-files/run-paraformer.sh
@@ -5,7 +5,6 @@
 # to download the model
 # before you run this script.
 #
-# You can switch to a different online model if you need

 ./non-streaming-decode-files \
  --paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \
--- a/go-api-examples/non-streaming-decode-files/run-tdnn-yesno.sh
+++ b/go-api-examples/non-streaming-decode-files/run-tdnn-yesno.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+# Please refer to
+# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/yesno/index.html
+# to download the model
+# before you run this script.
+#
+
+./non-streaming-decode-files \
+  --sample-rate=8000 \
+  --feat-dim=23 \
+  --tokens=./sherpa-onnx-tdnn-yesno/tokens.txt \
+  --tdnn-model=./sherpa-onnx-tdnn-yesno/model-epoch-14-avg-2.onnx \
+  ./sherpa-onnx-tdnn-yesno/test_wavs/0_0_0_1_0_0_0_1.wav
--- a/go-api-examples/non-streaming-decode-files/run-transducer.sh
+++ b/go-api-examples/non-streaming-decode-files/run-transducer.sh
@@ -5,7 +5,7 @@
 # to download the model
 # before you run this script.
 #
-# You can switch to a different online model if you need
+# You can switch to a different offline model if you need

 ./non-streaming-decode-files \
  --encoder ./sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx \
--- a/go-api-examples/non-streaming-decode-files/run-whisper.sh
+++ b/go-api-examples/non-streaming-decode-files/run-whisper.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+# Please refer to
+# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html
+# to download the model
+# before you run this script.
+#
+# You can switch to a different offline model if you need
+
+./non-streaming-decode-files \
+  --whisper-encoder=./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.onnx \
+  --whisper-decoder=./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.onnx \
+  --tokens=./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \
+  ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav
+
--- a/scripts/dotnet/offline.cs
+++ b/scripts/dotnet/offline.cs
@@ -51,6 +51,32 @@ namespace SherpaOnnx
    public string Model;
  }

+  [StructLayout(LayoutKind.Sequential)]
+  public struct OfflineWhisperModelConfig
+  {
+    public OfflineWhisperModelConfig()
+    {
+      Encoder = "";  // empty path by default
+      Decoder = "";  // empty path by default
+    }
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Encoder;  // Path to the whisper encoder model, e.g., tiny.en-encoder.onnx
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Decoder;  // Path to the whisper decoder model, e.g., tiny.en-decoder.onnx
+  }
+
+  [StructLayout(LayoutKind.Sequential)]
+  public struct OfflineTdnnModelConfig  // configuration for the offline tdnn (yesno) model
+  {
+    public OfflineTdnnModelConfig()  // a constructor must be named after its enclosing struct
+    {
+      Model = "";  // empty path by default
+    }
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Model;  // Path to the tdnn model, e.g., model-epoch-14-avg-2.onnx
+  }
+
  [StructLayout(LayoutKind.Sequential)]
  public struct OfflineLMConfig
  {
@@ -73,6 +99,8 @@ namespace SherpaOnnx
      Transducer = new OfflineTransducerModelConfig();
      Paraformer = new OfflineParaformerModelConfig();
      NeMoCtc = new OfflineNemoEncDecCtcModelConfig();
+      Whisper = new OfflineWhisperModelConfig();
+      Tdnn = new OfflineTdnnModelConfig();
      Tokens = "";
      NumThreads = 1;
      Debug = 0;
@@ -82,6 +110,8 @@ namespace SherpaOnnx
    public OfflineTransducerModelConfig Transducer;
    public OfflineParaformerModelConfig Paraformer;
    public OfflineNemoEncDecCtcModelConfig NeMoCtc;
+    public OfflineWhisperModelConfig Whisper;
+    public OfflineTdnnModelConfig Tdnn;

    [MarshalAs(UnmanagedType.LPStr)]
    public string Tokens;
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -309,6 +309,15 @@ type OfflineNemoEncDecCtcModelConfig struct {
 	Model string // Path to the model, e.g., model.onnx or model.int8.onnx
 }

+type OfflineWhisperModelConfig struct {
+	Encoder string // Path to the whisper encoder model, e.g., tiny.en-encoder.onnx
+	Decoder string // Path to the whisper decoder model, e.g., tiny.en-decoder.onnx
+}
+
+type OfflineTdnnModelConfig struct {
+	Model string // Path to the tdnn model, e.g., model-epoch-14-avg-2.onnx
+}
+
 // Configuration for offline LM.
 type OfflineLMConfig struct {
 	Model string  // Path to the model
@@ -319,6 +328,8 @@ type OfflineModelConfig struct {
 	Transducer OfflineTransducerModelConfig
 	Paraformer OfflineParaformerModelConfig
 	NemoCTC    OfflineNemoEncDecCtcModelConfig
+	Whisper    OfflineWhisperModelConfig
+	Tdnn       OfflineTdnnModelConfig
 	Tokens     string // Path to tokens.txt

 	// Number of threads to use for neural network computation
@@ -390,6 +401,15 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer {
 	c.model_config.nemo_ctc.model = C.CString(config.ModelConfig.NemoCTC.Model)
 	defer C.free(unsafe.Pointer(c.model_config.nemo_ctc.model))

+	c.model_config.whisper.encoder = C.CString(config.ModelConfig.Whisper.Encoder)
+	defer C.free(unsafe.Pointer(c.model_config.whisper.encoder))
+
+	c.model_config.whisper.decoder = C.CString(config.ModelConfig.Whisper.Decoder)
+	defer C.free(unsafe.Pointer(c.model_config.whisper.decoder))
+
+	c.model_config.tdnn.model = C.CString(config.ModelConfig.Tdnn.Model) // the C tdnn config has a single field: model
+	defer C.free(unsafe.Pointer(c.model_config.tdnn.model))
+
 	c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
 	defer C.free(unsafe.Pointer(c.model_config.tokens))

--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -271,6 +271,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
  recognizer_config.model_config.whisper.decoder =
      SHERPA_ONNX_OR(config->model_config.whisper.decoder, "");

+  recognizer_config.model_config.tdnn.model =
+      SHERPA_ONNX_OR(config->model_config.tdnn.model, "");
+
  recognizer_config.model_config.tokens =
      SHERPA_ONNX_OR(config->model_config.tokens, "");
  recognizer_config.model_config.num_threads =
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -305,6 +305,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineWhisperModelConfig {
  const char *decoder;
 } SherpaOnnxOfflineWhisperModelConfig;

+SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTdnnModelConfig {
+  const char *model;  // Path to the tdnn model, e.g., model-epoch-14-avg-2.onnx
+} SherpaOnnxOfflineTdnnModelConfig;
+
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig {
  const char *model;
  float scale;
@@ -315,6 +319,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
  SherpaOnnxOfflineParaformerModelConfig paraformer;
  SherpaOnnxOfflineNemoEncDecCtcModelConfig nemo_ctc;
  SherpaOnnxOfflineWhisperModelConfig whisper;
+  SherpaOnnxOfflineTdnnModelConfig tdnn;

  const char *tokens;
  int32_t num_threads;