Pascal API for speaker diarization (#1420)

2024-10-12 12:28:38 +08:00
parent 1ed803adc1
commit 5e273c5be4
5 changed files with 506 additions and 2 deletions
--- a/pascal-api-examples/README.md
+++ b/pascal-api-examples/README.md
@@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
 |Directory| Description|
 |---------|------------|
 |[read-wav](./read-wav)|It shows how to read a wave file.|
+|[speaker-diarization](./speaker-diarization)|It shows how to use the Pascal API for speaker diarization.|
 |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
 |[vad](./vad)| It shows how to use the voice activity detection API.|
--- a/pascal-api-examples/speaker-diarization/main.pas
+++ b/pascal-api-examples/speaker-diarization/main.pas
@@ -0,0 +1,104 @@
+{ Copyright (c)  2024  Xiaomi Corporation }
+{
+This file shows how to use the Pascal API from sherpa-onnx
+for speaker diarization.
+
+Usage:
+
+Step 1: Download a speaker segmentation model
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+for a list of available models. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+
+Step 2: Download a speaker embedding extractor model
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
+for a list of available models. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+
+Step 3: Download test wave files
+
+Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+for a list of available test wave files. The following is an example
+
+  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+
+Step 4: Run it (./run.sh compiles this program, downloads any missing files listed above, and runs it)
+}
+
+program main;
+
+{$mode delphi}
+
+uses
+  sherpa_onnx,
+  ctypes,
+  SysUtils;
+
+{ Progress callback handed to Sd.Process: prints the completed percentage, returns 0. }
+function ProgressCallback(NumProcessedChunks, NumTotalChunks: cint32): cint32; cdecl;
+var
+  Progress: Single;
+begin
+  Progress := 0.0; { reported as-is when the total is not positive }
+  if NumTotalChunks > 0 then { a zero divisor may raise inside this cdecl callback }
+    Progress := 100.0 * NumProcessedChunks / NumTotalChunks;
+  WriteLn(Format('Progress: %.3f%%', [Progress]));
+  Result := 0;
+end;
+
+{ Main program: diarizes ./0-four-speakers-zh.wav and prints one line per segment. }
+var
+  Wave: TSherpaOnnxWave;
+  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
+  Sd: TSherpaOnnxOfflineSpeakerDiarization;
+  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
+  I: Integer;
+begin
+  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');
+
+  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
+  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';
+
+  {
+    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
+    set NumClusters to 4 here.
+    If you don't have such information, please set NumClusters to -1.
+    In that case, you have to set Config.Clustering.Threshold.
+    A larger threshold leads to fewer clusters, i.e., fewer speakers.
+  }
+  Config.Clustering.NumClusters := 4;
+  Config.Segmentation.Debug := True;
+  Config.Embedding.Debug := True;
+
+  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
+  { NOTE(review): Sd is left unfreed on this path; confirm Free is safe with a nil handle. }
+  if Sd.GetHandle = nil then
+    begin
+      WriteLn('Please check your config');
+      Exit;
+    end;
+
+  { try..finally releases Sd on every path below, including the early Exit. }
+  try
+    if Sd.GetSampleRate <> Wave.SampleRate then
+      begin
+        WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
+        Exit;
+      end;
+
+    { To run without a progress callback: Segments := Sd.Process(Wave.Samples); }
+    Segments := Sd.Process(Wave.Samples, @ProgressCallback);
+
+    for I := Low(Segments) to High(Segments) do
+      WriteLn(Format('%.3f -- %.3f speaker_%d',
+        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
+  finally
+    FreeAndNil(Sd);
+  end;
+end.
--- a/pascal-api-examples/speaker-diarization/run.sh
+++ b/pascal-api-examples/speaker-diarization/run.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./main.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+fi
+
+if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
+fi
+
+if [ ! -f ./0-four-speakers-zh.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
+fi
+
+./main