Pascal API for speaker diarization (#1420)
This commit is contained in:
15
.github/workflows/pascal.yaml
vendored
15
.github/workflows/pascal.yaml
vendored
@@ -127,6 +127,21 @@ jobs:
|
|||||||
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Run Pascal test (Speaker diarization)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
|
||||||
|
|
||||||
|
cd ./pascal-api-examples
|
||||||
|
pushd speaker-diarization
|
||||||
|
|
||||||
|
./run.sh
|
||||||
|
rm -rfv *.onnx *.wav sherpa-onnx-*
|
||||||
|
ls -lh
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
popd
|
||||||
|
|
||||||
- name: Run Pascal test (TTS)
|
- name: Run Pascal test (TTS)
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|
|||||||
|Directory| Description|
|
|Directory| Description|
|
||||||
|---------|------------|
|
|---------|------------|
|
||||||
|[read-wav](./read-wav)|It shows how to read a wave file.|
|
|[read-wav](./read-wav)|It shows how to read a wave file.|
|
||||||
|
|[speaker-diarization](./speaker-diarization)|It shows how to use Pascal API for speaker diarization.|
|
||||||
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|
||||||
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
||||||
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
||||||
|
|||||||
104
pascal-api-examples/speaker-diarization/main.pas
Normal file
104
pascal-api-examples/speaker-diarization/main.pas
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
{
|
||||||
|
This file shows how to use the Pascal API from sherpa-onnx
|
||||||
|
for speaker diarization.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
Step 1: Download a speaker segmentation model
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||||
|
for a list of available models. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
|
||||||
|
|
||||||
|
Step 2: Download a speaker embedding extractor model
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
|
||||||
|
for a list of available models. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
|
||||||
|
|
||||||
|
Step 3: Download test wave files
|
||||||
|
|
||||||
|
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
|
||||||
|
for a list of available test wave files. The following is an example
|
||||||
|
|
||||||
|
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
|
||||||
|
|
||||||
|
Step 4: Run it (see run.sh in this directory for a complete build-and-run script)
|
||||||
|
}
|
||||||
|
|
||||||
|
program main;
|
||||||
|
|
||||||
|
{$mode delphi}
|
||||||
|
|
||||||
|
uses
|
||||||
|
sherpa_onnx,
|
||||||
|
ctypes,
|
||||||
|
SysUtils;
|
||||||
|
|
||||||
|
{ Progress callback passed to Sd.Process in the main program below.

  NumProcessedChunks: number of chunks processed so far.
  NumTotalChunks: total number of chunks.

  Prints the completion percentage to standard output and always returns 0. }
function ProgressCallback(
  NumProcessedChunks: cint32;
  NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { This routine is called from native code, so it must not raise.
    Guard the division in case the total is reported as 0. }
  if NumTotalChunks > 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));

  Result := 0;
end;
|
||||||
|
|
||||||
|
var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
begin
  { Load the test wave, configure the segmentation and embedding models,
    run diarization with a progress callback and print one line per segment. }
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);

  if Sd.GetHandle = nil then
  begin
    { NOTE(review): the wrapper is deliberately not freed on this path because
      its destructor would hand the nil handle to the native destroy function;
      confirm that is safe before freeing it here. }
    WriteLn('Please check your config');
    Exit;
  end;

  { From here on the native object exists, so make sure it is released on
    every exit path, including the early Exit below. }
  try
    if Sd.GetSampleRate <> Wave.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
      Exit;
    end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    for I := Low(Segments) to High(Segments) do
    begin
      WriteLn(Format('%.3f -- %.3f speaker_%d',
        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
    end;
  finally
    FreeAndNil(Sd);
  end;
end.
|
||||||
49
pascal-api-examples/speaker-diarization/run.sh
Executable file
49
pascal-api-examples/speaker-diarization/run.sh
Executable file
@@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library if it is not installed yet,
# compiles main.pas against it, downloads the models and the test wave file
# if they are missing, and finally runs the speaker diarization example.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Every relative path below (main.pas, models, wave file) is relative to this
# script's directory, so the script works no matter where it is invoked from.
cd "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./main.pas

export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

./main
|
||||||
@@ -102,7 +102,7 @@ type
|
|||||||
|
|
||||||
function Generate(Text: AnsiString; SpeakerId: Integer;
|
function Generate(Text: AnsiString; SpeakerId: Integer;
|
||||||
Speed: Single;
|
Speed: Single;
|
||||||
Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
|
Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
|
||||||
Arg: Pointer
|
Arg: Pointer
|
||||||
): TSherpaOnnxGeneratedAudio; overload;
|
): TSherpaOnnxGeneratedAudio; overload;
|
||||||
|
|
||||||
@@ -398,6 +398,78 @@ type
|
|||||||
property GetHandle: Pointer Read Handle;
|
property GetHandle: Pointer Read Handle;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
|
||||||
|
Model: AnsiString;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerSegmentationModelConfig = record
|
||||||
|
Pyannote: TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
|
||||||
|
NumThreads: Integer;
|
||||||
|
Debug: Boolean;
|
||||||
|
Provider: AnsiString;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxFastClusteringConfig = record
|
||||||
|
NumClusters: Integer;
|
||||||
|
Threshold: Single;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxSpeakerEmbeddingExtractorConfig = record
|
||||||
|
Model: AnsiString;
|
||||||
|
NumThreads: Integer;
|
||||||
|
Debug: Boolean;
|
||||||
|
Provider: AnsiString;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerDiarizationConfig = record
|
||||||
|
Segmentation: TSherpaOnnxOfflineSpeakerSegmentationModelConfig;
|
||||||
|
Embedding: TSherpaOnnxSpeakerEmbeddingExtractorConfig;
|
||||||
|
Clustering: TSherpaOnnxFastClusteringConfig;
|
||||||
|
MinDurationOn: Single;
|
||||||
|
MinDurationOff: Single;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerDiarizationSegment = record
|
||||||
|
Start: Single;
|
||||||
|
Stop: Single;
|
||||||
|
Speaker: Integer;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
end;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerDiarizationSegmentArray = array of TSherpaOnnxOfflineSpeakerDiarizationSegment;
|
||||||
|
|
||||||
|
PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = ^TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = function(
|
||||||
|
NumProcessChunks: cint32;
|
||||||
|
NumTotalChunks: cint32): cint32; cdecl;
|
||||||
|
|
||||||
|
TSherpaOnnxOfflineSpeakerDiarization = class
|
||||||
|
private
|
||||||
|
Handle: Pointer;
|
||||||
|
SampleRate: Integer;
|
||||||
|
_Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
|
||||||
|
public
|
||||||
|
constructor Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
|
||||||
|
destructor Destroy; override;
|
||||||
|
procedure SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
|
||||||
|
function Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
|
||||||
|
function Process(Samples: array of Single; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload;
|
||||||
|
property GetHandle: Pointer Read Handle;
|
||||||
|
property GetSampleRate: Integer Read SampleRate;
|
||||||
|
end;
|
||||||
|
|
||||||
|
|
||||||
{ It supports reading a single channel wave with 16-bit encoded samples.
|
{ It supports reading a single channel wave with 16-bit encoded samples.
|
||||||
Samples are normalized to the range [-1, 1].
|
Samples are normalized to the range [-1, 1].
|
||||||
}
|
}
|
||||||
@@ -656,6 +728,47 @@ type
|
|||||||
|
|
||||||
PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
|
PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record
|
||||||
|
Model: PAnsiChar;
|
||||||
|
end;
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerSegmentationModelConfig = record
|
||||||
|
Pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig;
|
||||||
|
NumThreads: cint32;
|
||||||
|
Debug: cint32;
|
||||||
|
Provider: PAnsiChar;
|
||||||
|
end;
|
||||||
|
|
||||||
|
SherpaOnnxFastClusteringConfig = record
|
||||||
|
NumClusters: cint32;
|
||||||
|
Threshold: cfloat;
|
||||||
|
end;
|
||||||
|
|
||||||
|
SherpaOnnxSpeakerEmbeddingExtractorConfig = record
|
||||||
|
Model: PAnsiChar;
|
||||||
|
NumThreads: cint32;
|
||||||
|
Debug: cint32;
|
||||||
|
Provider: PAnsiChar;
|
||||||
|
end;
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationConfig = record
|
||||||
|
Segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig;
|
||||||
|
Embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig;
|
||||||
|
Clustering: SherpaOnnxFastClusteringConfig;
|
||||||
|
MinDurationOn: cfloat;
|
||||||
|
MinDurationOff: cfloat;
|
||||||
|
end;
|
||||||
|
|
||||||
|
SherpaOnnxOfflineSpeakerDiarizationSegment = record
|
||||||
|
Start: cfloat;
|
||||||
|
Stop: cfloat;
|
||||||
|
Speaker: cint32;
|
||||||
|
end;
|
||||||
|
|
||||||
|
PSherpaOnnxOfflineSpeakerDiarizationSegment = ^SherpaOnnxOfflineSpeakerDiarizationSegment;
|
||||||
|
|
||||||
|
PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig;
|
||||||
|
|
||||||
function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
|
function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
|
||||||
SampleRateOutHz: cint32;
|
SampleRateOutHz: cint32;
|
||||||
FilterCutoffHz: cfloat;
|
FilterCutoffHz: cfloat;
|
||||||
@@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec
|
|||||||
procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
|
procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
|
||||||
external SherpaOnnxLibName;
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
procedure SherpaOnnxDestroyOfflineSpeakerDiarization(P: Pointer); cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(P: Pointer): cint32; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
procedure SherpaOnnxOfflineSpeakerDiarizationSetConfig(P: Pointer; Config: PSherpaOnnxOfflineSpeakerDiarizationConfig); cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(P: Pointer): cint32; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(P: Pointer): PSherpaOnnxOfflineSpeakerDiarizationSegment; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
procedure SherpaOnnxOfflineSpeakerDiarizationDestroySegment(P: Pointer); cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxOfflineSpeakerDiarizationProcess(P: Pointer; Samples: pcfloat; N: cint32): Pointer; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
function SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(P: Pointer;
|
||||||
|
Samples: pcfloat; N: cint32; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): Pointer; cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
procedure SherpaOnnxOfflineSpeakerDiarizationDestroyResult(P: Pointer); cdecl;
|
||||||
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
|
function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
|
||||||
external SherpaOnnxLibName;
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
@@ -1773,7 +1917,7 @@ end;
|
|||||||
|
|
||||||
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
|
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
|
||||||
Speed: Single;
|
Speed: Single;
|
||||||
Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
|
Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
|
||||||
Arg: Pointer
|
Arg: Pointer
|
||||||
): TSherpaOnnxGeneratedAudio;
|
): TSherpaOnnxGeneratedAudio;
|
||||||
var
|
var
|
||||||
@@ -1847,4 +1991,195 @@ begin
|
|||||||
SherpaOnnxLinearResamplerReset(Self.Handle);
|
SherpaOnnxLinearResamplerReset(Self.Handle);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this record, naming the record
  type and its Model field. }
function TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(Model := %s)';
begin
  Result := Format(Template, [Model]);
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this record, listing the nested
  Pyannote config, NumThreads, Debug and Provider.

  The text is labelled with this record's own type name; it was previously
  labelled as the nested Pyannote record, which made the output misleading. }
function TSherpaOnnxOfflineSpeakerSegmentationModelConfig.ToString: AnsiString;
begin
  Result := Format('TSherpaOnnxOfflineSpeakerSegmentationModelConfig(' +
    'Pyannote := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s)',
    [Self.Pyannote.ToString, Self.NumThreads,
     Self.Debug.ToString, Self.Provider]);
end;
|
||||||
|
|
||||||
|
{ Fills in the defaults of a new segmentation model config:
  'cpu' provider, a single thread, debug output disabled. }
class operator TSherpaOnnxOfflineSpeakerSegmentationModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.Debug := False;
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this record, with the threshold
  formatted to three decimal places. }
function TSherpaOnnxFastClusteringConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxFastClusteringConfig(NumClusters := %d, Threshold := %.3f)';
begin
  Result := Format(Template, [NumClusters, Threshold]);
end;
|
||||||
|
|
||||||
|
{ Fills in the defaults of a new clustering config:
  Threshold 0.5 and NumClusters -1. }
class operator TSherpaOnnxFastClusteringConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig);
begin
  Dest.Threshold := 0.5;
  Dest.NumClusters := -1;
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this record, listing Model,
  NumThreads, Debug and Provider. }
function TSherpaOnnxSpeakerEmbeddingExtractorConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxSpeakerEmbeddingExtractorConfig(' +
    'Model := %s, NumThreads := %d, Debug := %s, Provider := %s)';
var
  DebugText: AnsiString;
begin
  DebugText := Self.Debug.ToString;
  Result := Format(Template, [Model, NumThreads, DebugText, Provider]);
end;
|
||||||
|
|
||||||
|
{ Fills in the defaults of a new embedding extractor config:
  'cpu' provider, a single thread, debug output disabled. }
class operator TSherpaOnnxSpeakerEmbeddingExtractorConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig);
begin
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.Debug := False;
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this record, embedding the
  descriptions of the nested Segmentation, Embedding and Clustering configs
  followed by the two duration settings (three decimal places each). }
function TSherpaOnnxOfflineSpeakerDiarizationConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxOfflineSpeakerDiarizationConfig(' +
    'Segmentation := %s, Embedding := %s, Clustering := %s, ' +
    'MinDurationOn := %.3f, MinDurationOff := %.3f)';
var
  SegmentationText: AnsiString;
  EmbeddingText: AnsiString;
  ClusteringText: AnsiString;
begin
  SegmentationText := Self.Segmentation.ToString;
  EmbeddingText := Self.Embedding.ToString;
  ClusteringText := Self.Clustering.ToString;

  Result := Format(Template,
    [SegmentationText, EmbeddingText, ClusteringText,
     MinDurationOn, MinDurationOff]);
end;
|
||||||
|
|
||||||
|
{ Fills in the defaults of a new diarization config:
  MinDurationOff 0.5 and MinDurationOn 0.2. }
class operator TSherpaOnnxOfflineSpeakerDiarizationConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig);
begin
  Dest.MinDurationOff := 0.5;
  Dest.MinDurationOn := 0.2;
end;
|
||||||
|
|
||||||
|
{ Returns a one-line textual description of this segment: Start and Stop
  with three decimal places, then the Speaker index. }
function TSherpaOnnxOfflineSpeakerDiarizationSegment.ToString: AnsiString;
const
  Template = 'TSherpaOnnxOfflineSpeakerDiarizationSegment(' +
    'Start := %.3f, Stop := %.3f, Speaker := %d)';
begin
  Result := Format(Template, [Self.Start, Self.Stop, Self.Speaker]);
end;
|
||||||
|
|
||||||
|
{ Creates the native diarization object from Config.

  Config is converted field by field into the C-layout record expected by the
  native library. Handle is left nil when the native call returns nil; in that
  case SampleRate is 0, otherwise it is queried from the native object. }
constructor TSherpaOnnxOfflineSpeakerDiarization.Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
var
  Native: SherpaOnnxOfflineSpeakerDiarizationConfig;
begin
  Self._Config := Config;

  Native := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);

  { Top-level durations and clustering settings }
  Native.MinDurationOn := Config.MinDurationOn;
  Native.MinDurationOff := Config.MinDurationOff;
  Native.Clustering.NumClusters := Config.Clustering.NumClusters;
  Native.Clustering.Threshold := Config.Clustering.Threshold;

  { Segmentation model; the PAnsiChar values point into Config's strings }
  Native.Segmentation.Pyannote.Model := PAnsiChar(Config.Segmentation.Pyannote.Model);
  Native.Segmentation.Provider := PAnsiChar(Config.Segmentation.Provider);
  Native.Segmentation.NumThreads := Config.Segmentation.NumThreads;
  Native.Segmentation.Debug := Ord(Config.Segmentation.Debug);

  { Speaker embedding extractor }
  Native.Embedding.Model := PAnsiChar(Config.Embedding.Model);
  Native.Embedding.Provider := PAnsiChar(Config.Embedding.Provider);
  Native.Embedding.NumThreads := Config.Embedding.NumThreads;
  Native.Embedding.Debug := Ord(Config.Embedding.Debug);

  Self.Handle := SherpaOnnxCreateOfflineSpeakerDiarization(@Native);

  if Self.Handle = nil then
    Self.SampleRate := 0
  else
    Self.SampleRate := SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(Self.Handle);
end;
|
||||||
|
|
||||||
|
{ Releases the native diarization object, if one was created, and then runs
  the inherited destructor. }
destructor TSherpaOnnxOfflineSpeakerDiarization.Destroy;
begin
  { Handle stays nil when creation failed; do not hand nil to native code. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineSpeakerDiarization(Self.Handle);
    Self.Handle := nil;
  end;

  inherited Destroy;
end;
|
||||||
|
|
||||||
|
{ Pushes new clustering settings to the native diarization object.

  Only Config.Clustering.NumClusters and Config.Clustering.Threshold are
  forwarded; every other field of the native record is left at its default
  (zero) value. Does nothing when the native object was never created. }
procedure TSherpaOnnxOfflineSpeakerDiarization.SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig);
var
  C: SherpaOnnxOfflineSpeakerDiarizationConfig;
begin
  { Handle stays nil when creation failed; do not hand nil to native code. }
  if Self.Handle = nil then
    Exit;

  C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig);

  C.Clustering.NumClusters := Config.Clustering.NumClusters;
  C.Clustering.Threshold := Config.Clustering.Threshold;

  SherpaOnnxOfflineSpeakerDiarizationSetConfig(Self.Handle, @C);
end;
|
||||||
|
|
||||||
|
{ Private helper shared by both Process overloads.

  R is the native result pointer returned by one of the native process
  functions. The segments are obtained sorted by start time, copied into a
  Pascal dynamic array, and both the native segment array and the native
  result are released before returning. Returns nil when R is nil or when no
  segment array is available. }
function CollectSpeakerDiarizationSegments(R: Pointer): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
var
  NumSegments: Integer;
  I: Integer;
  Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment;
begin
  Result := nil;

  if R = nil then
    Exit;

  try
    NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R);
    Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R);

    { Nothing to copy (and nothing to free) without a segment array. }
    if Segments = nil then
      Exit;

    try
      if NumSegments > 0 then
      begin
        SetLength(Result, NumSegments);
        for I := Low(Result) to High(Result) do
        begin
          Result[I].Start := Segments[I].Start;
          Result[I].Stop := Segments[I].Stop;
          Result[I].Speaker := Segments[I].Speaker;
        end;
      end;
    finally
      SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments);
    end;
  finally
    SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R);
  end;
end;

{ Runs speaker diarization on Samples and returns the detected segments
  sorted by start time.

  Returns nil when the native object was never created or when the native
  call yields no result. }
function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
begin
  Result := nil;

  { Handle stays nil when creation failed; do not hand nil to native code. }
  if Self.Handle = nil then
    Exit;

  Result := CollectSpeakerDiarizationSegments(
    SherpaOnnxOfflineSpeakerDiarizationProcess(Self.Handle, pcfloat(Samples), Length(Samples)));
end;

{ Same as the overload above, but Callback is forwarded to the native
  library together with the samples (the example program uses it to print
  progress).

  Returns nil when the native object was never created or when the native
  call yields no result. }
function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single;
  Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
begin
  Result := nil;

  { Handle stays nil when creation failed; do not hand nil to native code. }
  if Self.Handle = nil then
    Exit;

  Result := CollectSpeakerDiarizationSegments(
    SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(Self.Handle, pcfloat(Samples), Length(Samples), Callback));
end;
|
||||||
|
|
||||||
end.
|
end.
|
||||||
|
|||||||
Reference in New Issue
Block a user