105 lines
3.1 KiB
ObjectPascal
105 lines
3.1 KiB
ObjectPascal
{ Copyright (c) 2024 Xiaomi Corporation }
|
|
{
This file shows how to use the Pascal API from sherpa-onnx
for speaker diarization.

Usage:

Step 1: Download a speaker segmentation model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example:

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2

Step 2: Download a speaker embedding extractor model

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example:

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx

Step 3: Download test wave files

Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example:

  wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav

Step 4: Run it
}
|
|
|
|
program main;
|
|
|
|
{$mode delphi}
|
|
|
|
uses
|
|
sherpa_onnx,
|
|
ctypes,
|
|
SysUtils;
|
|
|
|
{
  Progress callback passed to Sd.Process by the main program.

  Prints the completion percentage (processed chunks relative to total
  chunks) to standard output and always returns 0.

  NumProcessedChunks: number of chunks processed so far.
  NumTotalChunks:     total number of chunks to process.

  NOTE(review): the meaning the library attaches to the return value is not
  visible in this file; 0 is what the original example returned. Confirm
  against the sherpa-onnx API before changing it.
}
function ProgressCallback(
  NumProcessedChunks: cint32;
  NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { Guard the division: this routine is invoked through a cdecl function
    pointer, so a floating-point division fault must not be raised here.
    A non-positive total is reported as 0% instead. }
  if NumTotalChunks > 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));

  Result := 0;
end;
|
|
|
|
var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
{
  Main program: reads the test wave file, configures the segmentation and
  embedding models, runs speaker diarization with a progress callback and
  prints one line per resulting segment.

  On a configuration error or a sample-rate mismatch a message is printed
  and the program terminates with exit code 1.
}
begin
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
  try
    { A nil handle means the diarization object could not be set up from
      the given configuration. }
    if Sd.GetHandle = nil then
    begin
      WriteLn('Please check your config');
      ExitCode := 1;
      Exit;
    end;

    { The wave file must match the sample rate reported by the
      diarization object. }
    if Sd.GetSampleRate <> Wave.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
      ExitCode := 1;
      Exit;
    end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    for I := Low(Segments) to High(Segments) do
    begin
      WriteLn(Format('%.3f -- %.3f speaker_%d',
        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
    end;
  finally
    { Runs on normal completion and on the early Exit paths above, so the
      diarization object is always released. }
    FreeAndNil(Sd);
  end;
end.
|