Add C# API for spoken language identification (#697)

2024-03-25 18:45:09 +08:00
parent 83a10a55a5
commit 305c373107
10 changed files with 265 additions and 55 deletions
--- a/.github/scripts/test-dot-net.sh
+++ b/.github/scripts/test-dot-net.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+#
+# Runs every example under dotnet-examples/ and then collects the wave files
+# produced by the offline-tts examples into ./tts.
+#
+# Paths are relative, so this script must be started from the repository root.
+
+# Stop at the first failing example. Without -e the exit status of this script
+# would be that of the final cp, so a broken example would go unnoticed.
+set -ex
+
+cd dotnet-examples/
+
+cd spoken-language-identification
+./run.sh
+
+cd ../online-decode-files
+./run-zipformer2-ctc.sh
+./run-transducer.sh
+./run-paraformer.sh
+
+cd ../offline-decode-files
+./run-nemo-ctc.sh
+./run-paraformer.sh
+./run-zipformer.sh
+./run-hotwords.sh
+./run-whisper.sh
+./run-tdnn-yesno.sh
+
+cd ../offline-tts
+./run-aishell3.sh
+./run-piper.sh
+ls -lh
+
+# Back to the repository root.
+cd ../..
+
+# -p keeps a re-run from failing when ./tts already exists.
+mkdir -p tts
+
+cp dotnet-examples/offline-tts/*.wav ./tts
--- a/.github/workflows/test-dot-net-nuget.yaml
+++ b/.github/workflows/test-dot-net-nuget.yaml
@@ -40,33 +40,10 @@ jobs:
      - name: Check dotnet
        run: dotnet --info

-      - name: Decode a file
+      - name: Run tests
        shell: bash
        run: |
-          cd dotnet-examples/
-
-          cd online-decode-files
-          ./run-transducer.sh
-          ./run-paraformer.sh
-
-          cd ../offline-decode-files
-          ./run-nemo-ctc.sh
-          ./run-paraformer.sh
-          ./run-zipformer.sh
-          ./run-hotwords.sh
-          ./run-whisper.sh
-          ./run-tdnn-yesno.sh
-
-          cd ../offline-tts
-          ./run-aishell3.sh
-          ./run-piper.sh
-          ls -lh
-
-          cd ../..
-
-          mkdir tts
-
-          cp dotnet-examples/offline-tts/*.wav ./tts
+          .github/scripts/test-dot-net.sh

      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/test-dot-net.yaml
+++ b/.github/workflows/test-dot-net.yaml
@@ -177,39 +177,16 @@ jobs:
          cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/
          cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/
          cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
+          cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/

          ls -lh /tmp

-      - name: Decode a file
+      - name: Run tests
        shell: bash
        run: |
-          cd dotnet-examples/
+          .github/scripts/test-dot-net.sh

-          cd online-decode-files
-          ./run-zipformer2-ctc.sh
-          ./run-transducer.sh
-          ./run-paraformer.sh
-
-          cd ../offline-decode-files
-          ./run-nemo-ctc.sh
-          ./run-paraformer.sh
-          ./run-zipformer.sh
-          ./run-hotwords.sh
-          ./run-whisper.sh
-          ./run-tdnn-yesno.sh
-
-          cd ../offline-tts
-          ./run-aishell3.sh
-          ./run-piper.sh
-          ls -lh
-
-          cd ../..
-
-          mkdir tts
-
-          cp dotnet-examples/offline-tts/*.wav ./tts
-
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: dot-net-tts-generated-test-files-${{ matrix.os }}
          path: tts
--- a/dotnet-examples/sherpa-onnx.sln
+++ b/dotnet-examples/sherpa-onnx.sln
@@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -42,5 +44,9 @@ Global
 		{40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU
+		{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 EndGlobal
--- a/dotnet-examples/spoken-language-identification/Program.cs
+++ b/dotnet-examples/spoken-language-identification/Program.cs
@@ -0,0 +1,42 @@
+// Copyright (c)  2024  Xiaomi Corporation
+//
+// This file shows how to do spoken language identification with whisper.
+//
+// 1. Download a whisper multilingual model. We use a tiny model below.
+// Please refer to https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+// to download more models.
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+// tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+// rm sherpa-onnx-whisper-tiny.tar.bz2
+//
+// 2. Now run it
+//
+// dotnet run
+
+using SherpaOnnx;
+using System.Collections.Generic;
+using System;
+
+class SpokenLanguageIdentificationDemo
+{
+  // Runs whisper-based spoken language identification on one of the test
+  // waves shipped with the sherpa-onnx-whisper-tiny model and prints the
+  // file name and the detected language to stdout.
+  //
+  // The ./sherpa-onnx-whisper-tiny directory must exist in the current
+  // directory; see the download instructions at the top of this file.
+  static void Main(string[] args)
+  {
+    var config = new SpokenLanguageIdentificationConfig();
+    config.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx";
+    config.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx";
+
+    // SpokenLanguageIdentification implements IDisposable, so release it
+    // deterministically instead of leaving it to the finalizer.
+    using (var slid = new SpokenLanguageIdentification(config))
+    {
+      var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";
+
+      WaveReader waveReader = new WaveReader(filename);
+
+      // Feed the whole file into one stream, then run identification on it.
+      var s = slid.CreateStream();
+      s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
+      var result = slid.Compute(s);
+      Console.WriteLine($"Filename: {filename}");
+      Console.WriteLine($"Detected language: {result.Lang}");
+    }
+  }
+}
+
--- a/dotnet-examples/spoken-language-identification/WaveReader.cs
+++ b/dotnet-examples/spoken-language-identification/WaveReader.cs
@@ -0,0 +1 @@
+../offline-decode-files/WaveReader.cs
--- a/dotnet-examples/spoken-language-identification/run.sh
+++ b/dotnet-examples/spoken-language-identification/run.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Downloads the whisper tiny model on first use, then runs the example.
+
+set -ex
+
+model=sherpa-onnx-whisper-tiny
+archive=$model.tar.bz2
+
+# The extracted directory doubles as the "already downloaded" marker.
+if [ ! -d "./$model" ]; then
+  curl -SL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/$archive"
+  tar xvf "$archive"
+  rm "$archive"
+fi
+
+dotnet run
+
--- a/dotnet-examples/spoken-language-identification/spoken-language-identification.csproj
+++ b/dotnet-examples/spoken-language-identification/spoken-language-identification.csproj
@@ -0,0 +1,15 @@
+<!-- Console project for the spoken language identification example. -->
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <RootNamespace>spoken_language_identification</RootNamespace>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!-- Floating version: no specific package release is pinned here. -->
+    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
+  </ItemGroup>
+
+</Project>
--- a/scripts/dotnet/examples/spoken-language-identification.csproj
+++ b/scripts/dotnet/examples/spoken-language-identification.csproj
@@ -0,0 +1,19 @@
+<!--
+  Variant of the spoken language identification example project that also
+  restores packages from /tmp/packages. The test-dot-net workflow copies this
+  file over dotnet-examples/spoken-language-identification/.
+-->
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <RootNamespace>spoken_language_identification</RootNamespace>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <!--
+      Adds /tmp/packages as a package source next to any inherited sources and
+      nuget.org. NOTE(review): presumably where CI places the freshly built
+      org.k2fsa.sherpa.onnx package; confirm against the workflow.
+    -->
+    <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!-- Floating version: no specific package release is pinned here. -->
+    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
+  </ItemGroup>
+
+</Project>
--- a/scripts/dotnet/offline.cs
+++ b/scripts/dotnet/offline.cs
@@ -403,8 +403,8 @@ namespace SherpaOnnx
        while (*buffer != 0)
        {
          ++buffer;
+          length += 1;
        }
-        length = (int)(buffer - (byte*)impl.Text);
      }

      byte[] stringBuffer = new byte[length];
@@ -496,8 +496,6 @@ namespace SherpaOnnx
      return new OfflineStream(p);
    }

-    /// You have to ensure that IsReady(stream) returns true before
-    /// you call this method
    public void Decode(OfflineStream stream)
    {
      Decode(_handle.Handle, stream.Handle);
@@ -549,4 +547,137 @@ namespace SherpaOnnx
    private static extern void Decode(IntPtr handle, IntPtr[] streams, int n);
  }

+  // Whisper model settings used for spoken language identification.
+  //
+  // This struct is embedded by value in SpokenLanguageIdentificationConfig,
+  // which is handed to native code by ref. NOTE(review): the field order
+  // presumably mirrors the native struct, so do not reorder the fields
+  // without checking the native definition.
+  [StructLayout(LayoutKind.Sequential)]
+  public struct SpokenLanguageIdentificationWhisperConfig
+  {
+    // Initializes both model paths to the empty string and TailPaddings to -1.
+    public SpokenLanguageIdentificationWhisperConfig()
+    {
+      Encoder = "";
+      Decoder = "";
+      TailPaddings = -1;
+    }
+
+    // Path to the whisper encoder model file; marshaled as an ANSI C string.
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Encoder;
+
+    // Path to the whisper decoder model file; marshaled as an ANSI C string.
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Decoder;
+
+    // Defaults to -1. NOTE(review): presumably -1 lets the native side pick
+    // its own padding; confirm against the native implementation.
+    public int TailPaddings;
+  }
+
+  // Top-level configuration passed by ref to the native
+  // SherpaOnnxCreateSpokenLanguageIdentification() function.
+  //
+  // The layout is declared explicitly, like the nested whisper config, because
+  // native code reads this struct directly. NOTE(review): the field order
+  // presumably mirrors the native struct, so do not reorder the fields
+  // without checking the native definition.
+  [StructLayout(LayoutKind.Sequential)]
+  public struct SpokenLanguageIdentificationConfig
+  {
+    // Defaults: default whisper settings, 1 thread, Debug = 0, "cpu" provider.
+    public SpokenLanguageIdentificationConfig()
+    {
+      Whisper = new SpokenLanguageIdentificationWhisperConfig();
+      NumThreads = 1;
+      Debug = 0;
+      Provider = "cpu";
+    }
+
+    // Whisper model settings, embedded by value.
+    public SpokenLanguageIdentificationWhisperConfig Whisper;
+
+    // Defaults to 1.
+    public int NumThreads;
+
+    // Defaults to 0. NOTE(review): presumably a non-zero value turns on
+    // native debug output; confirm against the native implementation.
+    public int Debug;
+
+    // Defaults to "cpu"; marshaled as an ANSI C string.
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Provider;
+  }
+
+  // Managed copy of a native spoken-language-identification result.
+  public class SpokenLanguageIdentificationResult
+  {
+    // Copies the language string out of the native result that handle points
+    // to. The native memory is only read here and never freed; releasing it
+    // stays the caller's job.
+    //
+    // Throws ArgumentNullException when handle is IntPtr.Zero.
+    public SpokenLanguageIdentificationResult(IntPtr handle)
+    {
+      if (handle == IntPtr.Zero)
+      {
+        throw new ArgumentNullException(nameof(handle));
+      }
+
+      Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl));
+
+      // A null string pointer carries no text. Report it as an empty language
+      // instead of reading through it.
+      if (impl.Lang == IntPtr.Zero)
+      {
+        _lang = "";
+        return;
+      }
+
+      // Marshal.PtrToStringUTF8(impl.Lang) would do all of the following in
+      // one call, but it requires .NET Standard 2.1. So find the terminating
+      // NUL by hand, copy the bytes and decode them as UTF-8.
+      int length = 0;
+      while (Marshal.ReadByte(impl.Lang, length) != 0)
+      {
+        length += 1;
+      }
+
+      byte[] stringBuffer = new byte[length];
+      Marshal.Copy(impl.Lang, stringBuffer, 0, length);
+      _lang = Encoding.UTF8.GetString(stringBuffer);
+    }
+
+    // Managed view of the native result struct: a single pointer to a
+    // NUL-terminated string.
+    [StructLayout(LayoutKind.Sequential)]
+    struct Impl
+    {
+      public IntPtr Lang;
+    }
+
+    private readonly String _lang;
+
+    // The language string reported by the native result.
+    public String Lang => _lang;
+  }
+
+  // Spoken language identifier backed by a native handle.
+  //
+  // Typical use: create an instance, obtain a stream with CreateStream(),
+  // feed audio into the stream, then call Compute() on it. Dispose the
+  // instance when done to free the native object.
+  public class SpokenLanguageIdentification : IDisposable
+  {
+    // Creates the native identifier from config.
+    //
+    // Throws InvalidOperationException when the native side returns a null
+    // handle, so that a null handle is never passed on to later native calls.
+    public SpokenLanguageIdentification(SpokenLanguageIdentificationConfig config)
+    {
+      IntPtr h = SherpaOnnxCreateSpokenLanguageIdentification(ref config);
+      if (h == IntPtr.Zero)
+      {
+        throw new InvalidOperationException(
+            "Failed to create SpokenLanguageIdentification. Please check your config.");
+      }
+
+      _handle = new HandleRef(this, h);
+    }
+
+    // Creates a new stream that audio can be fed into before calling Compute().
+    public OfflineStream CreateStream()
+    {
+      IntPtr p = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(_handle.Handle);
+
+      // Only the raw IntPtr is passed to native code, which does not keep this
+      // object reachable. Make sure the finalizer cannot run during the call.
+      System.GC.KeepAlive(this);
+
+      return new OfflineStream(p);
+    }
+
+    // Runs language identification on the audio held by stream and returns a
+    // managed copy of the result. The native result is released before this
+    // method returns.
+    public SpokenLanguageIdentificationResult Compute(OfflineStream stream)
+    {
+      IntPtr h = SherpaOnnxSpokenLanguageIdentificationCompute(_handle.Handle, stream.Handle);
+
+      // Keep both wrappers reachable until the native call has returned, so
+      // that neither handle can be destroyed by a finalizer while in use.
+      System.GC.KeepAlive(stream);
+      System.GC.KeepAlive(this);
+
+      try
+      {
+        return new SpokenLanguageIdentificationResult(h);
+      }
+      finally
+      {
+        // Release the native result even if copying it out failed.
+        if (h != IntPtr.Zero)
+        {
+          SherpaOnnxDestroySpokenLanguageIdentificationResult(h);
+        }
+      }
+    }
+
+    // Frees the native object. Safe to call more than once.
+    public void Dispose()
+    {
+      Cleanup();
+      // Prevent the object from being placed on the
+      // finalization queue
+      System.GC.SuppressFinalize(this);
+    }
+
+    ~SpokenLanguageIdentification()
+    {
+      Cleanup();
+    }
+
+    // Destroys the native object at most once. Also reached from the
+    // finalizer when the constructor threw, in which case the handle is
+    // still zero and there is nothing to free.
+    private void Cleanup()
+    {
+      if (_handle.Handle == IntPtr.Zero)
+      {
+        return;
+      }
+
+      SherpaOnnxDestroySpokenLanguageIdentification(_handle.Handle);
+
+      // Don't permit the handle to be used again.
+      _handle = new HandleRef(this, IntPtr.Zero);
+    }
+
+    private HandleRef _handle;
+
+    [DllImport(Dll.Filename)]
+    private static extern IntPtr SherpaOnnxCreateSpokenLanguageIdentification(ref SpokenLanguageIdentificationConfig config);
+
+    [DllImport(Dll.Filename)]
+    private static extern void SherpaOnnxDestroySpokenLanguageIdentification(IntPtr handle);
+
+    [DllImport(Dll.Filename)]
+    private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(IntPtr handle);
+
+    [DllImport(Dll.Filename)]
+    private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCompute(IntPtr handle, IntPtr stream);
+
+    [DllImport(Dll.Filename)]
+    private static extern void SherpaOnnxDestroySpokenLanguageIdentificationResult(IntPtr handle);
+  }
 }