diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh new file mode 100755 index 00000000..c5c6d5a4 --- /dev/null +++ b/.github/scripts/test-dot-net.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Runs the .NET examples one after another and copies the wave files +# generated by the offline-tts examples into ./tts. Run it from the repository root. + +# Abort on the first failing command and echo every command to the log. +set -ex + +cd dotnet-examples/ + +cd spoken-language-identification +./run.sh + +cd ../online-decode-files +./run-zipformer2-ctc.sh +./run-transducer.sh +./run-paraformer.sh + +cd ../offline-decode-files +./run-nemo-ctc.sh +./run-paraformer.sh +./run-zipformer.sh +./run-hotwords.sh +./run-whisper.sh +./run-tdnn-yesno.sh + +cd ../offline-tts +./run-aishell3.sh +./run-piper.sh +ls -lh + +cd ../.. + +mkdir -p tts + +cp dotnet-examples/offline-tts/*.wav ./tts diff --git a/.github/workflows/test-dot-net-nuget.yaml b/.github/workflows/test-dot-net-nuget.yaml index 4af976b2..0e7f21b1 100644 --- a/.github/workflows/test-dot-net-nuget.yaml +++ b/.github/workflows/test-dot-net-nuget.yaml @@ -40,33 +40,10 @@ jobs: - name: Check dotnet run: dotnet --info - - name: Decode a file + - name: Run tests shell: bash run: | - cd dotnet-examples/ - - cd online-decode-files - ./run-transducer.sh - ./run-paraformer.sh - - cd ../offline-decode-files - ./run-nemo-ctc.sh - ./run-paraformer.sh - ./run-zipformer.sh - ./run-hotwords.sh - ./run-whisper.sh - ./run-tdnn-yesno.sh - - cd ../offline-tts - ./run-aishell3.sh - ./run-piper.sh - ls -lh - - cd ../.. 
- - mkdir tts - - cp dotnet-examples/offline-tts/*.wav ./tts + .github/scripts/test-dot-net.sh - uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index f47c838f..aa8e7b1e 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -177,39 +177,16 @@ jobs: cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/ cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ + cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/ ls -lh /tmp - - name: Decode a file + - name: Run tests shell: bash run: | - cd dotnet-examples/ + .github/scripts/test-dot-net.sh - cd online-decode-files - ./run-zipformer2-ctc.sh - ./run-transducer.sh - ./run-paraformer.sh - - cd ../offline-decode-files - ./run-nemo-ctc.sh - ./run-paraformer.sh - ./run-zipformer.sh - ./run-hotwords.sh - ./run-whisper.sh - ./run-tdnn-yesno.sh - - cd ../offline-tts - ./run-aishell3.sh - ./run-piper.sh - ls -lh - - cd ../.. 
- - mkdir tts - - cp dotnet-examples/offline-tts/*.wav ./tts - - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: dot-net-tts-generated-test-files-${{ matrix.os }} path: tts diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index a70405e4..6c469ba3 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\ EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -42,5 +44,9 @@ Global {40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/dotnet-examples/spoken-language-identification/Program.cs b/dotnet-examples/spoken-language-identification/Program.cs new file mode 100644 index 00000000..05a785d7 --- /dev/null +++ b/dotnet-examples/spoken-language-identification/Program.cs @@ -0,0 +1,42 @@ +// Copyright (c) 2024 Xiaomi Corporation +// +// This file shows how to do spoken language identification with whisper. 
+// +// 1. Download a whisper multilingual model. We use a tiny model below. +// Please refer to https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +// to download more models. +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 +// tar xvf sherpa-onnx-whisper-tiny.tar.bz2 +// rm sherpa-onnx-whisper-tiny.tar.bz2 +// +// 2. Now run it +// +// dotnet run + +using SherpaOnnx; +using System.Collections.Generic; +using System; + +class SpokenLanguageIdentificationDemo +{ + + static void Main(string[] args) + { + var config = new SpokenLanguageIdentificationConfig(); + config.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"; + config.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"; + + using var slid = new SpokenLanguageIdentification(config); + var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav"; + + WaveReader waveReader = new WaveReader(filename); + + var s = slid.CreateStream(); + s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples); + var result = slid.Compute(s); + Console.WriteLine($"Filename: {filename}"); + Console.WriteLine($"Detected language: {result.Lang}"); + } +} + diff --git a/dotnet-examples/spoken-language-identification/WaveReader.cs b/dotnet-examples/spoken-language-identification/WaveReader.cs new file mode 120000 index 00000000..2c5d1679 --- /dev/null +++ b/dotnet-examples/spoken-language-identification/WaveReader.cs @@ -0,0 +1 @@ +../offline-decode-files/WaveReader.cs \ No newline at end of file diff --git a/dotnet-examples/spoken-language-identification/run.sh b/dotnet-examples/spoken-language-identification/run.sh new file mode 100755 index 00000000..3b393d5f --- /dev/null +++ b/dotnet-examples/spoken-language-identification/run.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-d ./sherpa-onnx-whisper-tiny ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2 + tar xvf sherpa-onnx-whisper-tiny.tar.bz2 + rm sherpa-onnx-whisper-tiny.tar.bz2 +fi + +dotnet run + diff --git a/dotnet-examples/spoken-language-identification/spoken-language-identification.csproj b/dotnet-examples/spoken-language-identification/spoken-language-identification.csproj new file mode 100644 index 00000000..eb8b943e --- /dev/null +++ b/dotnet-examples/spoken-language-identification/spoken-language-identification.csproj @@ -0,0 +1,15 @@ + + + + Exe + net6.0 + spoken_language_identification + enable + enable + + + + + + + diff --git a/scripts/dotnet/examples/spoken-language-identification.csproj b/scripts/dotnet/examples/spoken-language-identification.csproj new file mode 100644 index 00000000..ab38ac7e --- /dev/null +++ b/scripts/dotnet/examples/spoken-language-identification.csproj @@ -0,0 +1,19 @@ + + + + Exe + net6.0 + spoken_language_identification + enable + enable + + + + /tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json + + + + + + + diff --git a/scripts/dotnet/offline.cs b/scripts/dotnet/offline.cs index 4ef2a4a1..1a8612f3 100644 --- a/scripts/dotnet/offline.cs +++ b/scripts/dotnet/offline.cs @@ -403,8 +403,8 @@ namespace SherpaOnnx while (*buffer != 0) { ++buffer; + length += 1; } - length = (int)(buffer - (byte*)impl.Text); } byte[] stringBuffer = new byte[length]; @@ -496,8 +496,6 @@ namespace SherpaOnnx return new OfflineStream(p); } - /// You have to ensure that IsReady(stream) returns true before - /// you call this method public void Decode(OfflineStream stream) { Decode(_handle.Handle, stream.Handle); @@ -549,4 +547,155 @@ namespace SherpaOnnx private static extern void Decode(IntPtr handle, IntPtr[] streams, int n); } + // Paths of the whisper models used for spoken language identification. + // Embedded in SpokenLanguageIdentificationConfig, which is passed by ref to native code. + [StructLayout(LayoutKind.Sequential)] + public struct SpokenLanguageIdentificationWhisperConfig + { + public 
SpokenLanguageIdentificationWhisperConfig() + { + Encoder = ""; + Decoder = ""; + TailPaddings = -1; + } + + [MarshalAs(UnmanagedType.LPStr)] + public string Encoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string Decoder; + + public int TailPaddings; + } + + // Top-level configuration; passed by ref to + // SherpaOnnxCreateSpokenLanguageIdentification(). + [StructLayout(LayoutKind.Sequential)] + public struct SpokenLanguageIdentificationConfig + { + public SpokenLanguageIdentificationConfig() + { + Whisper = new SpokenLanguageIdentificationWhisperConfig(); + NumThreads = 1; + Debug = 0; + Provider = "cpu"; + } + public SpokenLanguageIdentificationWhisperConfig Whisper; + + public int NumThreads; + public int Debug; + + [MarshalAs(UnmanagedType.LPStr)] + public string Provider; + } + + // Managed copy of a native language identification result. + // The constructor copies the NUL-terminated string behind Impl.Lang into + // managed memory and decodes it as UTF-8. + public class SpokenLanguageIdentificationResult + { + public SpokenLanguageIdentificationResult(IntPtr handle) + { + Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl)); + + // PtrToStringUTF8() requires .net standard 2.1 + // _lang = Marshal.PtrToStringUTF8(impl.Lang); + + int length = 0; + + unsafe + { + byte* buffer = (byte*)impl.Lang; + while (*buffer != 0) + { + ++buffer; + length += 1; + } + } + + byte[] stringBuffer = new byte[length]; + Marshal.Copy(impl.Lang, stringBuffer, 0, length); + _lang = Encoding.UTF8.GetString(stringBuffer); + } + + [StructLayout(LayoutKind.Sequential)] + struct Impl + { + public IntPtr Lang; + } + + private String _lang; + public String Lang => _lang; + } + + // Owns a native spoken language identification object. + // Dispose() destroys it; the finalizer is only a fallback. + public class SpokenLanguageIdentification : IDisposable + { + public SpokenLanguageIdentification(SpokenLanguageIdentificationConfig config) + { + IntPtr h = SherpaOnnxCreateSpokenLanguageIdentification(ref config); + _handle = new HandleRef(this, h); + } + + public OfflineStream CreateStream() + { + IntPtr p = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(_handle.Handle); + return new OfflineStream(p); + } + + public SpokenLanguageIdentificationResult Compute(OfflineStream stream) + { + // The native result is copied into a managed object, then destroyed. + IntPtr h = SherpaOnnxSpokenLanguageIdentificationCompute(_handle.Handle, stream.Handle); + 
SpokenLanguageIdentificationResult result = new SpokenLanguageIdentificationResult(h); + SherpaOnnxDestroySpokenLanguageIdentificationResult(h); + return result; + } + + public void Dispose() + { + Cleanup(); + // Prevent the object from being placed on the + // finalization queue + System.GC.SuppressFinalize(this); + } + + ~SpokenLanguageIdentification() + { + Cleanup(); + } + + private void Cleanup() + { + // Nothing to release if the native object was never created (e.g. the + // constructor threw) or has already been destroyed. + if (_handle.Handle == IntPtr.Zero) + { + return; + } + + SherpaOnnxDestroySpokenLanguageIdentification(_handle.Handle); + + // Don't permit the handle to be used again. + _handle = new HandleRef(this, IntPtr.Zero); + } + + private HandleRef _handle; + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCreateSpokenLanguageIdentification(ref SpokenLanguageIdentificationConfig config); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroySpokenLanguageIdentification(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCompute(IntPtr handle, IntPtr stream); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroySpokenLanguageIdentificationResult(IntPtr handle); + } }