enginex_bi_series-sherpa-onnx/pascal-api-examples/speaker-diarization/main.pas

{ Copyright (c)  2024  Xiaomi Corporation }
{
This file shows how to use the Pascal API from sherpa-onnx
for speaker diarization.

Usage:

Step 1: Download a speaker segmentation model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

Step 2: Download a speaker embedding extractor model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

Step 3: Download test wave files

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

Step 4: Run it
}

program main;

{$mode delphi}

uses
  sherpa_onnx,
  ctypes,
  SysUtils;

{
  Progress callback handed to Sd.Process in the main program.

  Prints the share of processed chunks as a percentage with three decimals.

  Parameters:
    NumProcessedChunks - number of chunks processed so far.
    NumTotalChunks     - total number of chunks to process.

  Returns: always 0.
}
function ProgressCallback(
      NumProcessedChunks: cint32;
      NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { Guard the division: with a total of 0 the quotient is a floating-point
    division by zero (or 0/0), which can raise a runtime exception. This
    routine is a cdecl callback, so an exception must not escape from it.
    Report 0% in that case instead. }
  if NumTotalChunks <> 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));

  Result := 0;
end;

var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
begin
  { Read the test recording. Its sample rate is compared with the rate
    reported by the diarization object further below. }
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  { Paths of the segmentation model and the speaker embedding model. }
  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
  { Everything after construction runs inside try..finally so that Sd is
    released on every path: the two early exits below, normal completion,
    and an exception raised while processing. }
  try
    if Sd.GetHandle = nil then
      begin
        WriteLn('Please check you config');
        Exit;
      end;

    if Sd.GetSampleRate <> Wave.SampleRate then
      begin
        WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
        Exit;
      end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    { Print one line per segment: start, stop and speaker index. }
    for I := Low(Segments) to High(Segments) do
      begin
        WriteLn(Format('%.3f -- %.3f speaker_%d',
          [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
      end;
  finally
    FreeAndNil(Sd);
  end;
end.