From a45e5dba9986f07f13d4f64b13ff77589a9909e3 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 10 Oct 2024 14:29:05 +0800 Subject: [PATCH] C# API for speaker diarization (#1407) --- .github/scripts/test-dot-net.sh | 8 +- .github/workflows/test-dot-net.yaml | 71 +++------- .../offline-speaker-diarization/Program.cs | 83 ++++++++++++ .../offline-speaker-diarization.csproj | 15 +++ .../offline-speaker-diarization/run.sh | 18 +++ dotnet-examples/sherpa-onnx.sln | 6 + scripts/dotnet/FastClusteringConfig.cs | 20 +++ scripts/dotnet/OfflineSpeakerDiarization.cs | 122 ++++++++++++++++++ .../dotnet/OfflineSpeakerDiarizationConfig.cs | 31 +++++ .../OfflineSpeakerDiarizationSegment.cs | 33 +++++ .../OfflineSpeakerSegmentationModelConfig.cs | 32 +++++ ...eSpeakerSegmentationPyannoteModelConfig.cs | 20 +++ 12 files changed, 408 insertions(+), 51 deletions(-) create mode 100644 dotnet-examples/offline-speaker-diarization/Program.cs create mode 100644 dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj create mode 100755 dotnet-examples/offline-speaker-diarization/run.sh create mode 100644 scripts/dotnet/FastClusteringConfig.cs create mode 100644 scripts/dotnet/OfflineSpeakerDiarization.cs create mode 100644 scripts/dotnet/OfflineSpeakerDiarizationConfig.cs create mode 100644 scripts/dotnet/OfflineSpeakerDiarizationSegment.cs create mode 100644 scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs create mode 100644 scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index c397fc0c..eec3b6bb 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,13 @@ cd dotnet-examples/ -cd ./offline-decode-files +cd ./offline-speaker-diarization +./run.sh +rm -rfv *.onnx +rm -fv *.wav +rm -rfv sherpa-onnx-pyannote-* + +cd ../offline-decode-files ./run-sense-voice-ctc.sh rm -rf sherpa-onnx-* diff --git 
a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index 6e32b155..d046542b 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -47,53 +47,10 @@ jobs: with: fetch-depth: 0 - - name: Free space - if: matrix.os == 'ubuntu-latest' - shell: bash - run: | - df -h - rm -rf /opt/hostedtoolcache - df -h - - - name: Free more space - if: matrix.os == 'ubuntu-latest' - shell: bash - run: | - # https://github.com/orgs/community/discussions/25678 - cd /opt - find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';' - - sudo rm -rf /usr/share/dotnet - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - - name: Free Disk Space (Ubuntu) - if: matrix.os == 'ubuntu-latest' - uses: jlumbroso/free-disk-space@main - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: false - haskell: true - large-packages: true - docker-images: false - swap-storage: true - - - name: Check space - if: matrix.os == 'ubuntu-latest' - shell: bash - run: | - df -h - - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: - key: ${{ matrix.os }}-release-shared + key: ${{ matrix.os }}-dotnet-release-shared - name: Build sherpa-onnx shell: bash @@ -110,11 +67,16 @@ jobs: -DCMAKE_BUILD_TYPE=Release \ -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ -DBUILD_ESPEAK_NG_EXE=OFF \ - -DSHERPA_ONNX_ENABLE_BINARY=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ .. cmake --build . 
--target install --config Release + rm -rf install/share + rm -rf install/lib/pkg* + + ls -lh ./install/lib + - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }} @@ -148,7 +110,7 @@ jobs: uses: actions/download-artifact@v4 with: name: ubuntu-latest - path: /tmp/linux + path: /tmp/linux-x64 - name: Setup .NET uses: actions/setup-dotnet@v4 @@ -162,17 +124,21 @@ jobs: - name: Display files shell: bash run: | - echo "----------/tmp/----------" - ls -lh /tmp/ + echo "----------/tmp----------" + ls -lh /tmp - echo "----------/tmp/linux----------" - ls -lh /tmp/linux + echo "----------/tmp/linux-x64----------" + ls -lh /tmp/linux-x64 + df -h - name: Build shell: bash run: | cd scripts/dotnet ./run.sh + df -h + + ls -lh /tmp/packages - name: Copy files shell: bash @@ -181,9 +147,14 @@ jobs: ls -lh /tmp + df -h + - name: Run tests shell: bash run: | + dotnet nuget locals all --clear + df -h + .github/scripts/test-dot-net.sh - uses: actions/upload-artifact@v4 diff --git a/dotnet-examples/offline-speaker-diarization/Program.cs b/dotnet-examples/offline-speaker-diarization/Program.cs new file mode 100644 index 00000000..45316fe7 --- /dev/null +++ b/dotnet-examples/offline-speaker-diarization/Program.cs @@ -0,0 +1,83 @@ +// Copyright (c) 2024 Xiaomi Corporation +// + +// This file shows how to use sherpa-onnx C# API for speaker diarization +/* +Usage: + +Step 1: Download a speaker segmentation model + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models +for a list of available models. 
The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +Step 2: Download a speaker embedding extractor model + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +Step 3. Download test wave files + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models +for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +Step 4. Run it + + dotnet run +*/ + +using SherpaOnnx; +using System; + +class OfflineSpeakerDiarizationDemo +{ + static void Main(string[] args) + { + var config = new OfflineSpeakerDiarizationConfig(); + config.Segmentation.Pyannote.Model = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + config.Embedding.Model = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + + // the test wave ./0-four-speakers-zh.wav has 4 speakers, so + // we set num_clusters to 4 + // + config.Clustering.NumClusters = 4; + // If you don't know the number of speakers in the test wave file, please + // use + // config.Clustering.Threshold = 0.5; // You need to tune this threshold + var sd = new OfflineSpeakerDiarization(config); + + var testWaveFile = "./0-four-speakers-zh.wav"; + WaveReader waveReader = new WaveReader(testWaveFile); + if (sd.SampleRate != waveReader.SampleRate) + { + Console.WriteLine($"Expected sample rate: {sd.SampleRate}. 
Given: {waveReader.SampleRate}"); + return; + } + + Console.WriteLine("Started"); + + // var segments = sd.Process(waveReader.Samples); // this one is also ok + + var MyProgressCallback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) => + { + float progress = 100.0F * numProcessedChunks / numTotalChunks; + Console.WriteLine("Progress {0}%", String.Format("{0:0.00}", progress)); + return 0; + }; + + var callback = new OfflineSpeakerDiarizationProgressCallback(MyProgressCallback); + var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero); + + foreach (var s in segments) + { + Console.WriteLine("{0} -- {1} speaker_{2}", String.Format("{0:0.00}", s.Start), String.Format("{0:0.00}", s.End), s.Speaker); + } + } +} diff --git a/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj b/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj new file mode 100644 index 00000000..3374dbca --- /dev/null +++ b/dotnet-examples/offline-speaker-diarization/offline-speaker-diarization.csproj @@ -0,0 +1,15 @@ + + + + Exe + net6.0 + offline_speaker_diarization + enable + enable + + + + + + + diff --git a/dotnet-examples/offline-speaker-diarization/run.sh b/dotnet-examples/offline-speaker-diarization/run.sh new file mode 100755 index 00000000..fe64412f --- /dev/null +++ b/dotnet-examples/offline-speaker-diarization/run.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! 
-f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! -f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +dotnet run diff --git a/dotnet-examples/sherpa-onnx.sln b/dotnet-examples/sherpa-onnx.sln index 397fe99e..0bff03f5 100644 --- a/dotnet-examples/sherpa-onnx.sln +++ b/dotnet-examples/sherpa-onnx.sln @@ -31,6 +31,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-micro EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -93,6 +95,10 @@ Global {DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU {DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU {DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU + {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/scripts/dotnet/FastClusteringConfig.cs b/scripts/dotnet/FastClusteringConfig.cs new file mode 100644 index 00000000..276ef9d8 --- /dev/null +++ 
b/scripts/dotnet/FastClusteringConfig.cs @@ -0,0 +1,20 @@ +/// Copyright (c) 2024 Xiaomi Corporation + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + + [StructLayout(LayoutKind.Sequential)] + public struct FastClusteringConfig + { + public FastClusteringConfig() + { + NumClusters = -1; + Threshold = 0.5F; + } + + public int NumClusters; + public float Threshold; + } +} diff --git a/scripts/dotnet/OfflineSpeakerDiarization.cs b/scripts/dotnet/OfflineSpeakerDiarization.cs new file mode 100644 index 00000000..b56cab9b --- /dev/null +++ b/scripts/dotnet/OfflineSpeakerDiarization.cs @@ -0,0 +1,122 @@ +/// Copyright (c) 2024 Xiaomi Corporation +using System; +using System.Runtime.InteropServices; +using System.Text; + +namespace SherpaOnnx +{ + // arg is the caller-supplied opaque pointer given to ProcessWithCallback (user data, not audio samples) + public delegate int OfflineSpeakerDiarizationProgressCallback(int numProcessedChunks, int numTotalChunks, IntPtr arg); + + public class OfflineSpeakerDiarization : IDisposable + { + public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) + { + IntPtr h = SherpaOnnxCreateOfflineSpeakerDiarization(ref config); + _handle = new HandleRef(this, h); + } + + public OfflineSpeakerDiarizationSegment[] Process(float[] samples) + { + IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcess(_handle.Handle, samples, samples.Length); + return ProcessImpl(result); + } + + public OfflineSpeakerDiarizationSegment[] ProcessWithCallback(float[] samples, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg) + { + IntPtr result = SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(_handle.Handle, samples, samples.Length, callback, arg); + return ProcessImpl(result); + } + + private OfflineSpeakerDiarizationSegment[] ProcessImpl(IntPtr result) + { + if (result == IntPtr.Zero) + { + return new OfflineSpeakerDiarizationSegment[] {}; + } + + int numSegments = SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result); + IntPtr p = 
SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result); + + OfflineSpeakerDiarizationSegment[] ans = new OfflineSpeakerDiarizationSegment[numSegments]; + unsafe + { + int size = sizeof(float) * 2 + sizeof(int); + for (int i = 0; i != numSegments; ++i) + { + IntPtr t = new IntPtr((byte*)p + i * size); + ans[i] = new OfflineSpeakerDiarizationSegment(t); + + // The following IntPtr.Add() does not support net20 + // ans[i] = new OfflineSpeakerDiarizationSegment(IntPtr.Add(p, i)); + } + } + + + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(p); + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result); + + return ans; + + } + + public void Dispose() + { + Cleanup(); + // Prevent the object from being placed on the + // finalization queue + System.GC.SuppressFinalize(this); + } + + ~OfflineSpeakerDiarization() + { + Cleanup(); + } + + private void Cleanup() + { + SherpaOnnxDestroyOfflineSpeakerDiarization(_handle.Handle); + + // Don't permit the handle to be used again. + _handle = new HandleRef(this, IntPtr.Zero); + } + + private HandleRef _handle; + + public int SampleRate + { + get + { + return SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(_handle.Handle); + } + } + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxCreateOfflineSpeakerDiarization(ref OfflineSpeakerDiarizationConfig config); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxDestroyOfflineSpeakerDiarization(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern int SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationProcess(IntPtr handle, float[] samples, int n); + + [DllImport(Dll.Filename, CallingConvention = CallingConvention.Cdecl)] + private static extern IntPtr 
SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback(IntPtr handle, float[] samples, int n, OfflineSpeakerDiarizationProgressCallback callback, IntPtr arg); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroyResult(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern IntPtr SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(IntPtr handle); + + [DllImport(Dll.Filename)] + private static extern void SherpaOnnxOfflineSpeakerDiarizationDestroySegment(IntPtr handle); + } +} + diff --git a/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs b/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs new file mode 100644 index 00000000..94f57039 --- /dev/null +++ b/scripts/dotnet/OfflineSpeakerDiarizationConfig.cs @@ -0,0 +1,31 @@ +/// Copyright (c) 2024 Xiaomi Corporation + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeakerDiarizationConfig + { + public OfflineSpeakerDiarizationConfig() + { + Segmentation = new OfflineSpeakerSegmentationModelConfig(); + Embedding = new SpeakerEmbeddingExtractorConfig(); + Clustering = new FastClusteringConfig(); + + MinDurationOn = 0.3F; + MinDurationOff = 0.5F; + } + + public OfflineSpeakerSegmentationModelConfig Segmentation; + public SpeakerEmbeddingExtractorConfig Embedding; + public FastClusteringConfig Clustering; + + public float MinDurationOn; + public float MinDurationOff; + } +} + + + diff --git a/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs b/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs new file mode 100644 index 00000000..8985c977 --- /dev/null +++ b/scripts/dotnet/OfflineSpeakerDiarizationSegment.cs @@ -0,0 +1,33 @@ +/// Copyright (c) 2024 Xiaomi Corporation +using System; +using System.Runtime.InteropServices; +using System.Text; + +namespace SherpaOnnx +{ + + public class OfflineSpeakerDiarizationSegment + { + public OfflineSpeakerDiarizationSegment(IntPtr 
handle) + { + Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl)); + + Start = impl.Start; + End = impl.End; + Speaker = impl.Speaker; + } + + [StructLayout(LayoutKind.Sequential)] + struct Impl + { + public float Start; + public float End; + public int Speaker; + } + + public float Start; + public float End; + public int Speaker; + } +} + diff --git a/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs b/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs new file mode 100644 index 00000000..1bd1f384 --- /dev/null +++ b/scripts/dotnet/OfflineSpeakerSegmentationModelConfig.cs @@ -0,0 +1,32 @@ +/// Copyright (c) 2024 Xiaomi Corporation + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeakerSegmentationModelConfig + { + public OfflineSpeakerSegmentationModelConfig() + { + Pyannote = new OfflineSpeakerSegmentationPyannoteModelConfig(); + NumThreads = 1; + Debug = 0; + Provider = "cpu"; + } + + public OfflineSpeakerSegmentationPyannoteModelConfig Pyannote; + + /// Number of threads used to run the neural network model + public int NumThreads; + + /// Non-zero to print debug information of the model; 0 (the default) disables it + public int Debug; + + [MarshalAs(UnmanagedType.LPStr)] + public string Provider; + } +} + + diff --git a/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs b/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs new file mode 100644 index 00000000..31976212 --- /dev/null +++ b/scripts/dotnet/OfflineSpeakerSegmentationPyannoteModelConfig.cs @@ -0,0 +1,20 @@ +/// Copyright (c) 2024 Xiaomi Corporation + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + + [StructLayout(LayoutKind.Sequential)] + public struct OfflineSpeakerSegmentationPyannoteModelConfig + { + public OfflineSpeakerSegmentationPyannoteModelConfig() + { + Model = ""; + } + + [MarshalAs(UnmanagedType.LPStr)] + public string Model; + } +} +