diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 70dc4fb7..1843cdf4 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,10 @@ cd dotnet-examples/ -cd offline-punctuation +cd vad-non-streaming-asr-paraformer +./run.sh + +cd ../offline-punctuation ./run.sh cd ../speaker-identification diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index 500d9e02..8f7c9973 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -67,7 +67,7 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ -DBUILD_ESPEAK_NG_EXE=OFF \ - -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_ENABLE_BINARY=ON \ .. cmake --build . --target install --config Release @@ -197,6 +197,7 @@ jobs: cp -v scripts/dotnet/examples/streaming-hlg-decoding.csproj dotnet-examples/streaming-hlg-decoding cp -v scripts/dotnet/examples/speaker-identification.csproj dotnet-examples/speaker-identification cp -v scripts/dotnet/examples/offline-punctuation.csproj dotnet-examples/offline-punctuation + cp -v scripts/dotnet/examples/vad-non-streaming-asr-paraformer.csproj dotnet-examples/vad-non-streaming-asr-paraformer ls -lh /tmp diff --git a/dotnet-examples/offline-punctuation/Program.cs b/dotnet-examples/offline-punctuation/Program.cs index 83a54fea..d20ff105 100644 --- a/dotnet-examples/offline-punctuation/Program.cs +++ b/dotnet-examples/offline-punctuation/Program.cs @@ -17,7 +17,6 @@ using System; class OfflinePunctuationDemo { - static void Main(string[] args) { var config = new OfflinePunctuationConfig(); @@ -42,4 +41,3 @@ class OfflinePunctuationDemo } } } - diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index fae0af92..c2685180 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -21,6 +21,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speaker-identification", "s EndProject 
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-punctuation", "offline-punctuation\offline-punctuation.csproj", "{42D85582-BB63-4259-A4EA-837D66AC078B}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "vad-non-streaming-asr-paraformer", "vad-non-streaming-asr-paraformer\vad-non-streaming-asr-paraformer.csproj", "{8CD6B7E5-F59F-47B3-BB87-2B2E3678924D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -66,5 +68,9 @@ Global {42D85582-BB63-4259-A4EA-837D66AC078B}.Debug|Any CPU.Build.0 = Debug|Any CPU {42D85582-BB63-4259-A4EA-837D66AC078B}.Release|Any CPU.ActiveCfg = Release|Any CPU {42D85582-BB63-4259-A4EA-837D66AC078B}.Release|Any CPU.Build.0 = Release|Any CPU + {8CD6B7E5-F59F-47B3-BB87-2B2E3678924D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {8CD6B7E5-F59F-47B3-BB87-2B2E3678924D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {8CD6B7E5-F59F-47B3-BB87-2B2E3678924D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {8CD6B7E5-F59F-47B3-BB87-2B2E3678924D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection EndGlobal diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs new file mode 100644 index 00000000..8471c024 --- /dev/null +++ b/dotnet-examples/vad-non-streaming-asr-paraformer/Program.cs @@ -0,0 +1,62 @@ +// Copyright (c) 2024 Xiaomi Corporation +// +// This file shows how to use a silero_vad model with a non-streaming Paraformer +// for speech recognition. 
+using SherpaOnnx; +using System.Collections.Generic; +using System; + +class VadNonStreamingAsrParaformer +{ + static void Main(string[] args) + { + // Please download the model files (see run.sh) from + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + OfflineRecognizerConfig config = new OfflineRecognizerConfig(); + config.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx"; + config.ModelConfig.Tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt"; + config.ModelConfig.Debug = 0; + OfflineRecognizer recognizer = new OfflineRecognizer(config); + + VadModelConfig vadModelConfig = new VadModelConfig(); + vadModelConfig.SileroVad.Model = "./silero_vad.onnx"; + vadModelConfig.Debug = 0; + + VoiceActivityDetector vad = new VoiceActivityDetector(vadModelConfig, 60); + + string testWaveFilename = "./lei-jun-test.wav"; + WaveReader reader = new WaveReader(testWaveFilename); + + int numSamples = reader.Samples.Length; + int windowSize = vadModelConfig.SileroVad.WindowSize; + int sampleRate = vadModelConfig.SampleRate; + int numIter = numSamples / windowSize; + + for (int i = 0; i != numIter; ++i) { + int start = i * windowSize; + float[] samples = new float[windowSize]; + Array.Copy(reader.Samples, start, samples, 0, windowSize); + vad.AcceptWaveform(samples); + // Drain every queued segment right away. Waiting for IsSpeechDetected() + // could leave a finished segment unread if no further speech follows. + while (!vad.IsEmpty()) { + SpeechSegment segment = vad.Front(); + float startTime = segment.Start / (float)sampleRate; + float duration = segment.Samples.Length / (float)sampleRate; + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(sampleRate, segment.Samples); + recognizer.Decode(stream); + String text = stream.Result.Text; + + if (!String.IsNullOrEmpty(text)) { + Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime), + String.Format("{0:0.00}", startTime+duration), text); + } + + vad.Pop(); + } + } + } +} + diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/WaveReader.cs
b/dotnet-examples/vad-non-streaming-asr-paraformer/WaveReader.cs new file mode 120000 index 00000000..bedfc634 --- /dev/null +++ b/dotnet-examples/vad-non-streaming-asr-paraformer/WaveReader.cs @@ -0,0 +1 @@ +../online-decode-files/WaveReader.cs \ No newline at end of file diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/run.sh b/dotnet-examples/vad-non-streaming-asr-paraformer/run.sh new file mode 100755 index 00000000..cb8ca87f --- /dev/null +++ b/dotnet-examples/vad-non-streaming-asr-paraformer/run.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./silero_vad.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +dotnet run diff --git a/dotnet-examples/vad-non-streaming-asr-paraformer/vad-non-streaming-asr-paraformer.csproj b/dotnet-examples/vad-non-streaming-asr-paraformer/vad-non-streaming-asr-paraformer.csproj new file mode 100644 index 00000000..3a957bcf --- /dev/null +++ b/dotnet-examples/vad-non-streaming-asr-paraformer/vad-non-streaming-asr-paraformer.csproj @@ -0,0 +1,15 @@ + + + + Exe + net6.0 + vad_non_streaming_asr_paraformer + enable + enable + + + + + + + diff --git a/java-api-examples/VadNonStreamingParaformer.java b/java-api-examples/VadNonStreamingParaformer.java index 48e446ae..61c2b53d 100644 --- a/java-api-examples/VadNonStreamingParaformer.java +++ b/java-api-examples/VadNonStreamingParaformer.java @@ -39,10 +39,6 @@ public class VadNonStreamingParaformer { String model = 
"./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx"; String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt"; - String waveFilename = "./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav"; - - WaveReader reader = new WaveReader(waveFilename); - OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().setModel(model).build(); diff --git a/scripts/dotnet/CircularBuffer.cs b/scripts/dotnet/CircularBuffer.cs new file mode 100644 index 00000000..9a507123 --- /dev/null +++ b/scripts/dotnet/CircularBuffer.cs @@ -0,0 +1,112 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Linq; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using System; + +namespace SherpaOnnx +{ + public class CircularBuffer : IDisposable + { + public CircularBuffer(int capacity) + { + IntPtr h = SherpaOnnxCreateCircularBuffer(capacity); + _handle = new HandleRef(this, h); + } + + public void Push(float[] data) + { + SherpaOnnxCircularBufferPush(_handle.Handle, data, data.Length); + } + + public float[] Get(int startIndex, int n) + { + IntPtr p = SherpaOnnxCircularBufferGet(_handle.Handle, startIndex, n); + + float[] ans = new float[n]; + Marshal.Copy(p, ans, 0, n); + + SherpaOnnxCircularBufferFree(p); + + return ans; + } + + public void Pop(int n) + { + SherpaOnnxCircularBufferPop(_handle.Handle, n); + } + + public int Size + { + get + { + return SherpaOnnxCircularBufferSize(_handle.Handle); + } + } + + public int Head + { + get + { + return SherpaOnnxCircularBufferHead(_handle.Handle); + } + } + + public void Reset() + { + SherpaOnnxCircularBufferReset(_handle.Handle); + } + + public void Dispose() + { + Cleanup(); + // Cleanup() has already destroyed the native buffer, so tell the GC + // to skip the finalizer, which would only call Cleanup() again. + System.GC.SuppressFinalize(this); + } + + ~CircularBuffer() + { + Cleanup(); + } + + private void Cleanup() + { + SherpaOnnxDestroyCircularBuffer(_handle.Handle); + + // Zero the
handle so the destroyed native buffer is never passed to the library again. + _handle = new HandleRef(this, IntPtr.Zero); + } + + private HandleRef _handle; + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCreateCircularBuffer(int capacity); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroyCircularBuffer(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxCircularBufferPush(IntPtr handle, float[] p, int n); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCircularBufferGet(IntPtr handle, int startIndex, int n); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxCircularBufferFree(IntPtr p); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxCircularBufferPop(IntPtr handle, int n); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxCircularBufferSize(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxCircularBufferHead(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxCircularBufferReset(IntPtr handle); + } +} diff --git a/scripts/dotnet/SileroVadModelConfig.cs b/scripts/dotnet/SileroVadModelConfig.cs new file mode 100644 index 00000000..2b02672f --- /dev/null +++ b/scripts/dotnet/SileroVadModelConfig.cs @@ -0,0 +1,34 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Linq; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using System; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct SileroVadModelConfig + { + public SileroVadModelConfig() + { + Model = ""; + Threshold = 0.5F; + MinSilenceDuration = 0.5F; + MinSpeechDuration = 0.25F; + WindowSize = 512; + } + + [MarshalAs(UnmanagedType.LPStr)] + public string Model; + + public float Threshold; + + public float MinSilenceDuration; + + public float MinSpeechDuration; + + public int WindowSize; + } +} diff --git
a/scripts/dotnet/SpeechSegment.cs b/scripts/dotnet/SpeechSegment.cs new file mode 100644 index 00000000..1128e705 --- /dev/null +++ b/scripts/dotnet/SpeechSegment.cs @@ -0,0 +1,43 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Linq; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using System; + +namespace SherpaOnnx +{ + // Managed copy of one native speech segment: its start position and samples. + public class SpeechSegment + { + // Reads the native segment at `handle` and copies it; this constructor does not free `handle`. + public SpeechSegment(IntPtr handle) + { + Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl)); + + _start = impl.Start; + + // Marshal.Copy throws on a null source pointer, so only copy when there are samples. + _samples = new float[impl.Count]; + if (impl.Count > 0) + { + Marshal.Copy(impl.Samples, _samples, 0, impl.Count); + } + } + + private int _start; + public int Start => _start; + + private float[] _samples; + public float[] Samples => _samples; + + [StructLayout(LayoutKind.Sequential)] + struct Impl + { + public int Start; + public IntPtr Samples; + public int Count; + } + } +} diff --git a/scripts/dotnet/VadModelConfig.cs b/scripts/dotnet/VadModelConfig.cs new file mode 100644 index 00000000..87fca71d --- /dev/null +++ b/scripts/dotnet/VadModelConfig.cs @@ -0,0 +1,35 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Linq; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using System; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct VadModelConfig + { + public VadModelConfig() + { + SileroVad = new SileroVadModelConfig(); + SampleRate = 16000; + NumThreads = 1; + Provider = "cpu"; + Debug = 0; + } + + public SileroVadModelConfig SileroVad; + + public int SampleRate; + + public int NumThreads; + + [MarshalAs(UnmanagedType.LPStr)] + public string Provider; + + public int Debug; + } +} + diff --git a/scripts/dotnet/VoiceActivityDetector.cs b/scripts/dotnet/VoiceActivityDetector.cs new file mode 100644 index
00000000..44ecc2aa --- /dev/null +++ b/scripts/dotnet/VoiceActivityDetector.cs @@ -0,0 +1,115 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Linq; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +using System; + +namespace SherpaOnnx +{ + public class VoiceActivityDetector : IDisposable + { + public VoiceActivityDetector(VadModelConfig config, float bufferSizeInSeconds) + { + IntPtr h = SherpaOnnxCreateVoiceActivityDetector(ref config, bufferSizeInSeconds); + _handle = new HandleRef(this, h); + } + + public void AcceptWaveform(float[] samples) + { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(_handle.Handle, samples, samples.Length); + } + + public bool IsEmpty() + { + return SherpaOnnxVoiceActivityDetectorEmpty(_handle.Handle) == 1; + } + + public bool IsSpeechDetected() + { + return SherpaOnnxVoiceActivityDetectorDetected(_handle.Handle) == 1; + } + + public void Pop() + { + SherpaOnnxVoiceActivityDetectorPop(_handle.Handle); + } + + public SpeechSegment Front() + { + IntPtr p = SherpaOnnxVoiceActivityDetectorFront(_handle.Handle); + + SpeechSegment segment = new SpeechSegment(p); + + SherpaOnnxDestroySpeechSegment(p); + + return segment; + } + + public void Clear() + { + SherpaOnnxVoiceActivityDetectorClear(_handle.Handle); + } + + public void Reset() + { + SherpaOnnxVoiceActivityDetectorReset(_handle.Handle); + } + + public void Dispose() + { + Cleanup(); + // Cleanup() has already destroyed the native detector, so tell the GC + // to skip the finalizer, which would only call Cleanup() again. + System.GC.SuppressFinalize(this); + } + + ~VoiceActivityDetector() + { + Cleanup(); + } + + private void Cleanup() + { + SherpaOnnxDestroyVoiceActivityDetector(_handle.Handle); + + // Zero the handle so the destroyed native detector is never passed to the library again.
+ _handle = new HandleRef(this, IntPtr.Zero); + } + + private HandleRef _handle; + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCreateVoiceActivityDetector(ref VadModelConfig config, float bufferSizeInSeconds); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroyVoiceActivityDetector(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxVoiceActivityDetectorAcceptWaveform(IntPtr handle, float[] samples, int n); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxVoiceActivityDetectorEmpty(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxVoiceActivityDetectorDetected(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxVoiceActivityDetectorPop(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxVoiceActivityDetectorClear(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxVoiceActivityDetectorFront(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroySpeechSegment(IntPtr segment); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxVoiceActivityDetectorReset(IntPtr handle); + + } +} + diff --git a/scripts/dotnet/examples/vad-non-streaming-asr-paraformer.csproj b/scripts/dotnet/examples/vad-non-streaming-asr-paraformer.csproj new file mode 100644 index 00000000..4870735f --- /dev/null +++ b/scripts/dotnet/examples/vad-non-streaming-asr-paraformer.csproj @@ -0,0 +1,19 @@ + + + + Exe + net6.0 + vad_non_streaming_asr_paraformer + enable + enable + + + + /tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json + + + + + + + diff --git a/scripts/dotnet/sherpa-onnx.csproj.in b/scripts/dotnet/sherpa-onnx.csproj.in index a6f83a64..60ae4187 100644 --- a/scripts/dotnet/sherpa-onnx.csproj.in +++ b/scripts/dotnet/sherpa-onnx.csproj.in @@ -4,7 +4,7 @@ README.md Library 10.0 -
netstandard2.0;netcoreapp3.1;net6.0;net7.0 + netstandard2.0 linux-x64;osx-x64;win-x64 true sherpa-onnx diff --git a/scripts/dotnet/sherpa-onnx.csproj.runtime.in b/scripts/dotnet/sherpa-onnx.csproj.runtime.in index 335254e1..f90ae2c3 100644 --- a/scripts/dotnet/sherpa-onnx.csproj.runtime.in +++ b/scripts/dotnet/sherpa-onnx.csproj.runtime.in @@ -3,7 +3,7 @@ Apache-2.0 README.md Library - netstandard2.0;netcoreapp3.1;net6.0;net7.0 + netstandard2.0 {{ dotnet_rid }} sherpa-onnx {{ version }}