From d3e27d5e21a5f9c862a8c09aaf784da0b6feb727 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 11 Mar 2025 18:58:17 +0800 Subject: [PATCH] Add C# API for speech enhancement GTCRN models (#1990) --- .github/scripts/test-dot-net.sh | 6 +- dotnet-examples/sherpa-onnx.sln | 6 ++ .../speech-enhancement-gtcrn/Program.cs | 45 +++++++++ .../speech-enhancement-gtcrn/run.sh | 12 +++ .../speech-enhancement-gtcrn.csproj | 15 +++ scripts/dotnet/DenoisedAudio.cs | 94 +++++++++++++++++++ scripts/dotnet/OfflineSpeechDenoiser.cs | 64 +++++++++++++ scripts/dotnet/OfflineSpeechDenoiserConfig.cs | 16 ++++ .../OfflineSpeechDenoiserGtcrnModelConfig.cs | 17 ++++ .../OfflineSpeechDenoiserModelConfig.cs | 27 ++++++ 10 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 dotnet-examples/speech-enhancement-gtcrn/Program.cs create mode 100755 dotnet-examples/speech-enhancement-gtcrn/run.sh create mode 100644 dotnet-examples/speech-enhancement-gtcrn/speech-enhancement-gtcrn.csproj create mode 100644 scripts/dotnet/DenoisedAudio.cs create mode 100644 scripts/dotnet/OfflineSpeechDenoiser.cs create mode 100644 scripts/dotnet/OfflineSpeechDenoiserConfig.cs create mode 100644 scripts/dotnet/OfflineSpeechDenoiserGtcrnModelConfig.cs create mode 100644 scripts/dotnet/OfflineSpeechDenoiserModelConfig.cs diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 9ee8b9cc..c67db9d1 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,11 @@ cd dotnet-examples/ -cd ./kokoro-tts +cd ./speech-enhancement-gtcrn +./run.sh +ls -lh + +cd ../kokoro-tts ./run-kokoro.sh ls -lh diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 404c4976..0c8e24ab 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -35,6 +35,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts", "kokoro-tts\ko EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "kokoro-tts-play", "kokoro-tts-play\kokoro-tts-play.csproj", "{EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "speech-enhancement-gtcrn", "speech-enhancement-gtcrn\speech-enhancement-gtcrn.csproj", "{DF2569C6-6011-4716-9538-F9E9069E00EB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -105,6 +107,10 @@ Global {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Debug|Any CPU.Build.0 = Debug|Any CPU {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.ActiveCfg = Release|Any CPU {EC0BCEAB-1B4E-4129-82CE-9880426AFA0B}.Release|Any CPU.Build.0 = Release|Any CPU + {DF2569C6-6011-4716-9538-F9E9069E00EB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {DF2569C6-6011-4716-9538-F9E9069E00EB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {DF2569C6-6011-4716-9538-F9E9069E00EB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {DF2569C6-6011-4716-9538-F9E9069E00EB}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/dotnet-examples/speech-enhancement-gtcrn/Program.cs b/dotnet-examples/speech-enhancement-gtcrn/Program.cs new file mode 100644 index 00000000..4553a6f1 --- /dev/null +++ b/dotnet-examples/speech-enhancement-gtcrn/Program.cs @@ -0,0 +1,45 @@ +// Copyright (c) 2025 Xiaomi Corporation +// +// This file shows how to use speech enhancement API with GTCRN models. +// +// 1. Download a model from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/speech-enhancement-models +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx +// +// 2. Download a test file +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav +// +// 3. Now run it +// +// dotnet run + +using SherpaOnnx; + +class OfflineSpeechEnhancementDemo +{ + static void Main(string[] args) + { + var config = new OfflineSpeechDenoiserConfig(); + config.Model.Gtcrn.Model = "./gtcrn_simple.onnx"; + config.Model.Debug = 1; + config.Model.NumThreads = 1; + var sd = new OfflineSpeechDenoiser(config); + + WaveReader waveReader = new WaveReader("./inp_16k.wav"); + var denoisedAudio = sd.Run(waveReader.Samples, waveReader.SampleRate); + + var outputFilename = "./enhanced-16k.wav"; + var ok = denoisedAudio.SaveToWaveFile(outputFilename); + + if (ok) + { + Console.WriteLine($"Wrote to {outputFilename} succeeded!"); + } + else + { + Console.WriteLine($"Failed to write {outputFilename}"); + } + } +} diff --git a/dotnet-examples/speech-enhancement-gtcrn/run.sh b/dotnet-examples/speech-enhancement-gtcrn/run.sh new file mode 100755 index 00000000..788525cd --- /dev/null +++ b/dotnet-examples/speech-enhancement-gtcrn/run.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -ex + +if [ ! -f ./gtcrn_simple.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx +fi + +if [ ! -f ./inp_16k.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav +fi + +dotnet run diff --git a/dotnet-examples/speech-enhancement-gtcrn/speech-enhancement-gtcrn.csproj b/dotnet-examples/speech-enhancement-gtcrn/speech-enhancement-gtcrn.csproj new file mode 100644 index 00000000..a7adcc5e --- /dev/null +++ b/dotnet-examples/speech-enhancement-gtcrn/speech-enhancement-gtcrn.csproj @@ -0,0 +1,15 @@ + + + + Exe + net8.0 + speech_enhancement_gtcrn + enable + enable + + + + + + + diff --git a/scripts/dotnet/DenoisedAudio.cs b/scripts/dotnet/DenoisedAudio.cs new file mode 100644 index 00000000..4ec4ecdd --- /dev/null +++ b/scripts/dotnet/DenoisedAudio.cs @@ -0,0 +1,94 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) +using System; +using System.Runtime.InteropServices; +using System.Text; + +namespace SherpaOnnx +{ + public class DenoisedAudio + { + public DenoisedAudio(IntPtr p) + { + _handle = new HandleRef(this, p); + } + + public bool SaveToWaveFile(String filename) + { + Impl impl = (Impl)Marshal.PtrToStructure(Handle, typeof(Impl)); + byte[] utf8Filename = Encoding.UTF8.GetBytes(filename); + byte[] utf8FilenameWithNull = new byte[utf8Filename.Length + 1]; // +1 for null terminator + Array.Copy(utf8Filename, utf8FilenameWithNull, utf8Filename.Length); + utf8FilenameWithNull[utf8Filename.Length] = 0; // Null terminator + int status = SherpaOnnxWriteWave(impl.Samples, impl.NumSamples, impl.SampleRate, utf8FilenameWithNull); + return status == 1; + } + + ~DenoisedAudio() + { + Cleanup(); + } + + public void Dispose() + { + Cleanup(); + // Prevent the object from being placed on the + // finalization queue + System.GC.SuppressFinalize(this); + } + + private void Cleanup() + { + SherpaOnnxDestroyDenoisedAudio(Handle); + + // Don't permit the handle to be used again. + _handle = new HandleRef(this, IntPtr.Zero); + } + + [StructLayout(LayoutKind.Sequential)] + struct Impl + { + public IntPtr Samples; + public int NumSamples; + public int SampleRate; + } + + private HandleRef _handle; + public IntPtr Handle => _handle.Handle; + + public int NumSamples + { + get + { + Impl impl = (Impl)Marshal.PtrToStructure(Handle, typeof(Impl)); + return impl.NumSamples; + } + } + + public int SampleRate + { + get + { + Impl impl = (Impl)Marshal.PtrToStructure(Handle, typeof(Impl)); + return impl.SampleRate; + } + } + + public float[] Samples + { + get + { + Impl impl = (Impl)Marshal.PtrToStructure(Handle, typeof(Impl)); + + float[] samples = new float[impl.NumSamples]; + Marshal.Copy(impl.Samples, samples, 0, impl.NumSamples); + return samples; + } + } + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroyDenoisedAudio(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxWriteWave(IntPtr samples, int n, int sample_rate, [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I1)] byte[] utf8Filename); + } +} diff --git a/scripts/dotnet/OfflineSpeechDenoiser.cs b/scripts/dotnet/OfflineSpeechDenoiser.cs new file mode 100644 index 00000000..429e2924 --- /dev/null +++ b/scripts/dotnet/OfflineSpeechDenoiser.cs @@ -0,0 +1,64 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + public class OfflineSpeechDenoiser: IDisposable + { + public OfflineSpeechDenoiser(OfflineSpeechDenoiserConfig config) + { + IntPtr h = SherpaOnnxCreateOfflineSpeechDenoiser(ref config); + _handle = new HandleRef(this, h); + } + + public DenoisedAudio Run(float[] samples, int sampleRate) + { + IntPtr p = SherpaOnnxOfflineSpeechDenoiserRun(_handle.Handle, samples, samples.Length, sampleRate); + return new DenoisedAudio(p); + } + + public void Dispose() + { + Cleanup(); + // Prevent the object from being placed on the + // finalization queue + System.GC.SuppressFinalize(this); + } + + ~OfflineSpeechDenoiser() + { + Cleanup(); + } + + private void Cleanup() + { + SherpaOnnxDestroyOfflineSpeechDenoiser(_handle.Handle); + + // Don't permit the handle to be used again. + _handle = new HandleRef(this, IntPtr.Zero); + } + + private HandleRef _handle; + + public int SampleRate + { + get + { + return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(_handle.Handle); + } + } + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCreateOfflineSpeechDenoiser(ref OfflineSpeechDenoiserConfig config); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroyOfflineSpeechDenoiser(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxOfflineSpeechDenoiserGetSampleRate(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxOfflineSpeechDenoiserRun(IntPtr handle, float[] samples, int n, int sampleRate); + } +} diff --git a/scripts/dotnet/OfflineSpeechDenoiserConfig.cs b/scripts/dotnet/OfflineSpeechDenoiserConfig.cs new file mode 100644 index 00000000..546fe9c8 --- /dev/null +++ b/scripts/dotnet/OfflineSpeechDenoiserConfig.cs @@ -0,0 +1,16 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeechDenoiserConfig + { + public OfflineSpeechDenoiserConfig() + { + Model = new OfflineSpeechDenoiserModelConfig(); + } + public OfflineSpeechDenoiserModelConfig Model; + } +} diff --git a/scripts/dotnet/OfflineSpeechDenoiserGtcrnModelConfig.cs b/scripts/dotnet/OfflineSpeechDenoiserGtcrnModelConfig.cs new file mode 100644 index 00000000..8a815d0b --- /dev/null +++ b/scripts/dotnet/OfflineSpeechDenoiserGtcrnModelConfig.cs @@ -0,0 +1,17 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeechDenoiserGtcrnModelConfig + { + public OfflineSpeechDenoiserGtcrnModelConfig() + { + Model = ""; + } + [MarshalAs(UnmanagedType.LPStr)] + public string Model; + } +} diff --git a/scripts/dotnet/OfflineSpeechDenoiserModelConfig.cs b/scripts/dotnet/OfflineSpeechDenoiserModelConfig.cs new file mode 100644 index 00000000..40d4d101 --- /dev/null +++ b/scripts/dotnet/OfflineSpeechDenoiserModelConfig.cs @@ -0,0 +1,27 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeechDenoiserModelConfig + { + public OfflineSpeechDenoiserModelConfig() + { + Gtcrn = new OfflineSpeechDenoiserGtcrnModelConfig(); + NumThreads = 1; + Debug = 0; + Provider = "cpu"; + } + + public OfflineSpeechDenoiserGtcrnModelConfig Gtcrn; + + public int NumThreads; + + public int Debug; + + [MarshalAs(UnmanagedType.LPStr)] + public string Provider; + } +}