Pascal API for VAD (#1249)

This commit is contained in:
Fangjun Kuang
2024-08-13 16:16:51 +08:00
committed by GitHub
parent a7dc6c2c16
commit 619279b162
24 changed files with 1199 additions and 14 deletions

View File

@@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) {
"capacity to: %d",
n, size, n + size, capacity, new_capacity);
Resize(new_capacity);
capacity = new_capacity;
}
int32_t start = tail_ % capacity;

View File

@@ -2,9 +2,11 @@
unit sherpa_onnx;
{$mode objfpc}
{$IFDEF FPC}
{$mode objfpc}
{$modeSwitch advancedRecords} { to support records with methods }
{$ENDIF}
{$modeSwitch advancedRecords} { to support records with methods }
(* {$LongStrings ON} *)
interface
@@ -45,18 +47,21 @@ type
ModelingUnit: AnsiString;
BpeVocab: AnsiString;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
end;
{ Feature configuration record.
  Its Initialize operator sets SampleRate to 16000 and FeatureDim to 80. }
TSherpaOnnxFeatureConfig = record
SampleRate: Integer;
FeatureDim: Integer;
{ Returns a textual representation of this record. }
function ToString: AnsiString;
{ Management operator that fills in the defaults listed above.
  The parameter is declared var under FPC and out otherwise. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
end;
{ Configuration record for the online CTC FST decoder.
  Its Initialize operator sets MaxActive to 3000; Graph is not assigned there. }
TSherpaOnnxOnlineCtcFstDecoderConfig = record
Graph: AnsiString;
MaxActive: Integer;
{ Returns a textual representation of this record. }
function ToString: AnsiString;
{ Management operator that fills in the default listed above.
  The parameter is declared var under FPC and out otherwise. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
end;
TSherpaOnnxOnlineRecognizerConfig = record
@@ -75,6 +80,7 @@ type
RuleFars: AnsiString;
BlankPenalty: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
end;
TSherpaOnnxOnlineRecognizerResult = record
@@ -97,6 +103,7 @@ type
TSherpaOnnxOnlineRecognizer = class
private
Handle: Pointer;
_Config: TSherpaOnnxOnlineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
destructor Destroy; override;
@@ -108,6 +115,7 @@ type
procedure Reset(Stream: TSherpaOnnxOnlineStream);
function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
end;
TSherpaOnnxOfflineTransducerModelConfig = record
@@ -134,6 +142,7 @@ type
Task: AnsiString;
TailPaddings: Integer;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
end;
TSherpaOnnxOfflineTdnnModelConfig = record
@@ -145,12 +154,14 @@ type
Model: AnsiString;
Scale: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
end;
{ Configuration record for the offline SenseVoice model.
  Its Initialize operator sets UseItn to True; Model and Language are
  not assigned there. }
TSherpaOnnxOfflineSenseVoiceModelConfig = record
Model: AnsiString;
Language: AnsiString;
UseItn: Boolean;
{ Management operator that fills in the default listed above.
  The parameter is declared var under FPC and out otherwise. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
{ Returns a textual representation of this record. }
function ToString: AnsiString;
end;
@@ -169,6 +180,7 @@ type
BpeVocab: AnsiString;
TeleSpeechCtc: AnsiString;
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString;
end;
@@ -183,6 +195,7 @@ type
RuleFsts: AnsiString;
RuleFars: AnsiString;
BlankPenalty: Single;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
function ToString: AnsiString;
end;
@@ -205,18 +218,83 @@ type
{ Object wrapper around an offline recognizer handle. }
TSherpaOnnxOfflineRecognizer = class
private
{ Opaque pointer to the underlying recognizer object. }
Handle: Pointer;
{ Configuration kept so that it can be read back through the Config property. }
_Config: TSherpaOnnxOfflineRecognizerConfig;
public
constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
destructor Destroy; override;
{ Returns a stream object to be used with Decode and GetResult. }
function CreateStream: TSherpaOnnxOfflineStream;
procedure Decode(Stream: TSherpaOnnxOfflineStream);
function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
{ Read-only view of the stored configuration. }
property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config;
end;
{ It supports reading a single channel wave with 16-bit encoded samples.
Samples are normalized to the range [-1, 1].
}
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
{ Configuration record for the Silero VAD model.
  Its Initialize operator sets Threshold to 0.5, MinSilenceDuration to 0.5,
  MinSpeechDuration to 0.25 and WindowSize to 512. Model is not assigned
  there and must be set by the caller. }
TSherpaOnnxSileroVadModelConfig = record
Model: AnsiString;
Threshold: Single;
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
{ Returns a textual representation listing all five fields. }
function ToString: AnsiString;
{ Management operator that fills in the defaults listed above.
  The parameter is declared var under FPC and out otherwise. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
{ Top-level VAD configuration record.
  Its Initialize operator sets SampleRate to 16000, NumThreads to 1,
  Provider to 'cpu' and Debug to False. SileroVad is a nested record with
  its own Initialize operator. }
TSherpaOnnxVadModelConfig = record
SileroVad: TSherpaOnnxSileroVadModelConfig;
SampleRate: Integer;
NumThreads: Integer;
Provider: AnsiString;
Debug: Boolean;
{ Returns a textual representation, including the nested SileroVad record. }
function ToString: AnsiString;
{ Management operator that fills in the defaults listed above.
  The parameter is declared var under FPC and out otherwise. }
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
{ Dynamic array of samples; it is the return type of TSherpaOnnxCircularBuffer.Get. }
TSherpaOnnxSamplesArray = array of Single;
{ Object wrapper around a circular buffer created by the library.
  Every method forwards to the matching SherpaOnnxCircularBuffer* function. }
TSherpaOnnxCircularBuffer = class
private
{ Opaque pointer returned by SherpaOnnxCreateCircularBuffer. }
Handle: Pointer;
public
{ Creates the underlying buffer with the given capacity. }
constructor Create(Capacity: Integer);
{ Destroys the underlying buffer. }
destructor Destroy; override;
{ Pushes all elements of Samples into the buffer. }
procedure Push(Samples: array of Single);
{ Returns a newly allocated array of length N holding the values the library
  returns for (StartIndex, N). }
function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
{ Forwards N to SherpaOnnxCircularBufferPop. }
procedure Pop(N: Integer);
{ Forwards to SherpaOnnxCircularBufferReset. }
procedure Reset;
{ Returns the value reported by SherpaOnnxCircularBufferSize. }
function Size: Integer;
{ Returns the value reported by SherpaOnnxCircularBufferHead. }
function Head: Integer;
end;
{ A speech segment as returned by TSherpaOnnxVoiceActivityDetector.Front.
  Samples is a Pascal-owned copy of the segment data and Start is copied
  from the Start field of the library's segment structure. }
TSherpaOnnxSpeechSegment = record
Samples: array of Single;
Start: Integer;
end;
{ Object wrapper around a voice activity detector created by the library.
  Every method forwards to the matching SherpaOnnxVoiceActivityDetector*
  function. }
TSherpaOnnxVoiceActivityDetector = class
private
{ Opaque pointer returned by SherpaOnnxCreateVoiceActivityDetector. }
Handle: Pointer;
{ Copy of the configuration passed to Create. }
_Config: TSherpaOnnxVadModelConfig;
public
{ Creates the detector from Config; BufferSizeInSeconds is passed through
  to the library unchanged. }
constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
{ Destroys the underlying detector. }
destructor Destroy; override;
{ Feeds the whole Samples array to the detector. }
procedure AcceptWaveform(Samples: array of Single); overload;
{ Feeds N samples starting at index Offset of Samples to the detector.
  Prints a message and does nothing when Offset + N exceeds the array length. }
procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload;
{ True when SherpaOnnxVoiceActivityDetectorEmpty returns 1. }
function IsEmpty: Boolean;
{ True when SherpaOnnxVoiceActivityDetectorDetected returns 1. }
function IsDetected: Boolean;
{ Forwards to SherpaOnnxVoiceActivityDetectorPop. }
procedure Pop;
{ Forwards to SherpaOnnxVoiceActivityDetectorClear. }
procedure Clear;
{ Returns a Pascal-owned copy of the segment reported by
  SherpaOnnxVoiceActivityDetectorFront. }
function Front: TSherpaOnnxSpeechSegment;
{ Forwards to SherpaOnnxVoiceActivityDetectorReset. }
procedure Reset;
{ Forwards to SherpaOnnxVoiceActivityDetectorFlush. }
procedure Flush;
{ Read-only view of the stored configuration. }
property Config: TSherpaOnnxVadModelConfig Read _Config;
end;
{ Reads a wave file. Only single-channel files with 16-bit encoded samples
  are supported. Samples are normalized to the range [-1, 1].
}
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
{ Writes Samples to the wave file Filename using the given SampleRate.
  Returns True when the underlying library call returns 1. }
function SherpaOnnxWriteWave(Filename: AnsiString;
Samples: array of Single; SampleRate: Integer): Boolean;
implementation
@@ -294,15 +372,15 @@ type
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
EnableEndpoint: cint32;
Rule1MinTrailingSilence: Single;
Rule2MinTrailingSilence: Single;
Rule3MinUtteranceLength: Single;
Rule1MinTrailingSilence: cfloat;
Rule2MinTrailingSilence: cfloat;
Rule3MinUtteranceLength: cfloat;
HotwordsFile: PAnsiChar;
HotwordsScore: Single;
HotwordsScore: cfloat;
CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: Single;
BlankPenalty: cfloat;
end;
PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
@@ -330,7 +408,7 @@ type
end;
SherpaOnnxOfflineLMConfig = record
Model: PAnsiChar;
Scale: Single;
Scale: cfloat;
end;
SherpaOnnxOfflineSenseVoiceModelConfig = record
Model: PAnsiChar;
@@ -361,14 +439,100 @@ type
DecodingMethod: PAnsiChar;
MaxActivePaths: cint32;
HotwordsFile: PAnsiChar;
HotwordsScore: Single;
HotwordsScore: cfloat;
RuleFsts: PAnsiChar;
RuleFars: PAnsiChar;
BlankPenalty: Single;
BlankPenalty: cfloat;
end;
PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
{ C-side counterpart of TSherpaOnnxSileroVadModelConfig, declared with
  ctypes field types. It is embedded in SherpaOnnxVadModelConfig.
  The order of the fields is part of the binary layout; do not reorder. }
SherpaOnnxSileroVadModelConfig = record
Model: PAnsiChar;
Threshold: cfloat;
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
end;
{ C-side counterpart of TSherpaOnnxVadModelConfig. A pointer to this record
  is what SherpaOnnxCreateVoiceActivityDetector receives.
  Debug is an integer here; the wrapper fills it with Ord of the Boolean.
  The order of the fields is part of the binary layout; do not reorder. }
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
SampleRate: cint32;
NumThreads: cint32;
Provider: PAnsiChar;
Debug: cint32;
end;
PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
{ C-side speech segment as returned by SherpaOnnxVoiceActivityDetectorFront.
  Samples points to N values; the wrapper copies them and then releases the
  segment with SherpaOnnxDestroySpeechSegment.
  The order of the fields is part of the binary layout; do not reorder. }
SherpaOnnxSpeechSegment = record
Start: cint32;
Samples: pcfloat;
N: cint32;
end;
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
{ Bindings to the voice-activity-detector functions exported by the library
  named by SherpaOnnxLibName. All of them use the cdecl calling convention.
  The Vad argument is the opaque pointer returned by
  SherpaOnnxCreateVoiceActivityDetector. }

{ Creates a detector from Config and returns its opaque handle. }
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
BufferSizeInSeconds: cfloat): Pointer; cdecl;
external SherpaOnnxLibName;
{ Destroys a detector created by SherpaOnnxCreateVoiceActivityDetector. }
procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
{ Feeds N samples starting at Samples to the detector. }
procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ The wrapper class treats a return value of 1 as True. }
function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ The wrapper class treats a return value of 1 as True. }
function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
external SherpaOnnxLibName;
{ Returns a segment that the caller releases with SherpaOnnxDestroySpeechSegment. }
function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
external SherpaOnnxLibName;
{ Releases a segment obtained from SherpaOnnxVoiceActivityDetectorFront. }
procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
external SherpaOnnxLibName;
{ Resets the detector. Vad is the opaque handle returned by
  SherpaOnnxCreateVoiceActivityDetector, the same handle type every other
  SherpaOnnxVoiceActivityDetector* binding takes; it is not a speech segment. }
procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

{ Flushes the detector. Vad is the opaque handle returned by
  SherpaOnnxCreateVoiceActivityDetector; it is not a speech segment. }
procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;
{ Bindings to the circular-buffer functions exported by the library named
  by SherpaOnnxLibName. All of them use the cdecl calling convention.
  The Buffer argument is the opaque pointer returned by
  SherpaOnnxCreateCircularBuffer. }

{ Creates a buffer with the given capacity and returns its opaque handle. }
function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
{ Destroys a buffer created by SherpaOnnxCreateCircularBuffer. }
procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
external SherpaOnnxLibName;
{ Pushes N samples starting at Samples into the buffer. }
procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
external SherpaOnnxLibName;
{ Returns a pointer that the caller releases with SherpaOnnxCircularBufferFree. }
function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
external SherpaOnnxLibName;
{ Releases a pointer obtained from SherpaOnnxCircularBufferGet. }
procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
external SherpaOnnxLibName;
{ Binding to the library function that creates an online recognizer from a
  C-side configuration record; the returned pointer is the opaque handle
  stored by TSherpaOnnxOnlineRecognizer. }
function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
external SherpaOnnxLibName;
@@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
{ Bindings to the wave-file helpers of the library. Each one is imported
  under a Wrapper suffix because the plain name is used by the public
  Pascal function declared in the interface section. }

{ Imported from the exported symbol 'SherpaOnnxReadWave'. }
function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxReadWave';
{ Imported from the exported symbol 'SherpaOnnxWriteWave'.
  The Pascal wrapper treats a return value of 1 as success. }
function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
{ Imported from the exported symbol 'SherpaOnnxFreeWave'. }
procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
{ Writes Samples to the wave file Filename with the given SampleRate by
  calling the library's SherpaOnnxWriteWave entry point.
  Returns True only when that call returns 1. }
function SherpaOnnxWriteWave(Filename: AnsiString;
  Samples: array of Single; SampleRate: Integer): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
    SampleRate, PAnsiChar(Filename));
  Result := (Status = 1);
end;
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
var
PFilename: PAnsiChar;
@@ -611,6 +786,7 @@ begin
C.BlankPenalty := Config.BlankPenalty;
Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
Self._Config := Config;
end;
destructor TSherpaOnnxOnlineRecognizer.Destroy;
@@ -877,6 +1053,7 @@ begin
C.BlankPenalty := Config.BlankPenalty;
Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
Self._Config := Config;
end;
destructor TSherpaOnnxOfflineRecognizer.Destroy;
@@ -984,5 +1161,255 @@ begin
[Self.Text, TokensStr, TimestampStr]);
end;
{ Returns a one-line textual representation of all five fields.
  The three Single fields are printed with two decimals. }
function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
const
  Template =
    'TSherpaOnnxSileroVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d' +
    ')';
begin
  Result := Format(Template,
    [Model, Threshold, MinSilenceDuration, MinSpeechDuration, WindowSize]);
end;
{ Management operator: assigns the default values of the numeric fields.
  Model is intentionally left untouched here. }
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
begin
  with Dest do
  begin
    WindowSize := 512;
    MinSpeechDuration := 0.25;
    MinSilenceDuration := 0.5;
    Threshold := 0.5;
  end;
end;
{ Returns a one-line textual representation of the record, embedding the
  text produced by the nested SileroVad record. }
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
const
  Template =
    'TSherpaOnnxVadModelConfig(' +
    'SileroVad := %s, ' +
    'SampleRate := %d, ' +
    'NumThreads := %d, ' +
    'Provider := %s, ' +
    'Debug := %s' +
    ')';
var
  SileroText: AnsiString;
  DebugText: AnsiString;
begin
  SileroText := Self.SileroVad.ToString;
  DebugText := Self.Debug.ToString;
  Result := Format(Template,
    [SileroText, Self.SampleRate, Self.NumThreads, Self.Provider, DebugText]);
end;
{ Management operator: assigns the default values of the top-level fields.
  The nested SileroVad record is not assigned here. }
class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
begin
  with Dest do
  begin
    Debug := False;
    Provider := 'cpu';
    NumThreads := 1;
    SampleRate := 16000;
  end;
end;
{ Management operator: assigns the default values of both fields. }
class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
begin
  with Dest do
  begin
    FeatureDim := 80;
    SampleRate := 16000;
  end;
end;
{ Management operator: assigns the default value of MaxActive.
  Graph is not assigned here. }
class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
begin
  with Dest do
    MaxActive := 3000;
end;
{ Management operator: assigns the default values of the decoding,
  endpointing and scoring fields.
  NOTE(review): unlike TSherpaOnnxOfflineRecognizerConfig.Initialize, no
  default is assigned to MaxActivePaths here - confirm whether that is
  intended. }
class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
begin
  with Dest do
  begin
    { Decoding. }
    DecodingMethod := 'greedy_search';
    HotwordsScore := 1.5;
    BlankPenalty := 0;

    { Endpointing. }
    EnableEndpoint := False;
    Rule1MinTrailingSilence := 2.4;
    Rule2MinTrailingSilence := 1.2;
    Rule3MinUtteranceLength := 20;
  end;
end;
{ Management operator: assigns the default values of NumThreads, Provider
  and Debug. }
class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
begin
  with Dest do
  begin
    Debug := False;
    Provider := 'cpu';
    NumThreads := 1;
  end;
end;
{ Management operator: assigns the default values of Task and TailPaddings. }
class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
begin
  with Dest do
  begin
    TailPaddings := -1;
    Task := 'transcribe';
  end;
end;
{ Management operator: assigns the default value of Scale.
  Model is not assigned here. }
class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
begin
  with Dest do
    Scale := 1.0;
end;
{ Management operator: assigns the default value of UseItn.
  Model and Language are not assigned here. }
class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
begin
  with Dest do
    UseItn := True;
end;
{ Management operator: assigns the default values of NumThreads, Debug
  and Provider. }
class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
begin
  with Dest do
  begin
    Provider := 'cpu';
    Debug := False;
    NumThreads := 1;
  end;
end;
{ Management operator: assigns the default values of the decoding and
  scoring fields. }
class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
begin
  with Dest do
  begin
    BlankPenalty := 0;
    HotwordsScore := 1.5;
    MaxActivePaths := 4;
    DecodingMethod := 'greedy_search';
  end;
end;
{ Creates the underlying buffer with the requested capacity and keeps the
  opaque pointer returned by the library in Handle. }
constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
var
  NewHandle: Pointer;
begin
  NewHandle := SherpaOnnxCreateCircularBuffer(Capacity);
  Handle := NewHandle;
end;
{ Releases the underlying buffer and then runs the inherited destructor.
  The nil check keeps the destructor safe when it runs on an instance whose
  Handle was never assigned, for example after a failed construction. }
destructor TSherpaOnnxCircularBuffer.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyCircularBuffer(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Pushes every element of Samples into the underlying buffer. }
procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
var
  Data: pcfloat;
  Count: cint32;
begin
  Data := pcfloat(Samples);
  Count := Length(Samples);
  SherpaOnnxCircularBufferPush(Handle, Data, Count);
end;
{ Returns a Pascal-owned array of length N filled with the values the
  library returns for (StartIndex, N).
  The pointer obtained from the library is always released, even when
  allocating the result array raises an exception. }
function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
var
  P: pcfloat;
begin
  Result := nil;
  P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);
  try
    SetLength(Result, N);
    { Result[0] does not exist for an empty array, so only copy when N > 0. }
    if N > 0 then
      Move(P^, Result[0], N * SizeOf(Single));
  finally
    SherpaOnnxCircularBufferFree(P);
  end;
end;
{ Forwards N to SherpaOnnxCircularBufferPop for the wrapped buffer. }
procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
var
  Count: cint32;
begin
  Count := N;
  SherpaOnnxCircularBufferPop(Handle, Count);
end;
{ Forwards to SherpaOnnxCircularBufferReset for the wrapped buffer. }
procedure TSherpaOnnxCircularBuffer.Reset;
begin
  SherpaOnnxCircularBufferReset(Handle);
end;
{ Returns the value reported by SherpaOnnxCircularBufferSize. }
function TSherpaOnnxCircularBuffer.Size: Integer;
var
  Count: cint32;
begin
  Count := SherpaOnnxCircularBufferSize(Handle);
  Result := Count;
end;
{ Returns the value reported by SherpaOnnxCircularBufferHead. }
function TSherpaOnnxCircularBuffer.Head: Integer;
var
  Position: cint32;
begin
  Position := SherpaOnnxCircularBufferHead(Handle);
  Result := Position;
end;
{ Creates the underlying detector.
  Config is copied field by field into the C-side record that the library
  expects, and is also stored so that it can be read back through the
  Config property. BufferSizeInSeconds is passed through unchanged. }
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
var
  Native: SherpaOnnxVadModelConfig;
begin
  Initialize(Native);

  { Top-level settings; Debug is an integer on the C side. }
  Native.SampleRate := Config.SampleRate;
  Native.NumThreads := Config.NumThreads;
  Native.Provider := PAnsiChar(Config.Provider);
  Native.Debug := Ord(Config.Debug);

  { Nested Silero VAD settings. }
  Native.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  Native.SileroVad.Threshold := Config.SileroVad.Threshold;
  Native.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
  Native.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
  Native.SileroVad.WindowSize := Config.SileroVad.WindowSize;

  Self._Config := Config;
  Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@Native, BufferSizeInSeconds);
end;
{ Releases the underlying detector and then runs the inherited destructor.
  The nil check keeps the destructor safe when it runs on an instance whose
  Handle was never assigned, for example after a failed construction. }
destructor TSherpaOnnxVoiceActivityDetector.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
    Self.Handle := nil;
  end;
  inherited Destroy;
end;
{ Feeds the whole Samples array to the underlying detector. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
var
  Data: pcfloat;
  Count: cint32;
begin
  Data := pcfloat(Samples);
  Count := Length(Samples);
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Handle, Data, Count);
end;
{ Feeds N samples of Samples, starting at index Offset, to the underlying
  detector.
  The request is rejected with a message on standard output, and nothing is
  passed to the library, when the range [Offset, Offset + N) does not lie
  inside the array. That includes a negative Offset or a negative N, which
  would otherwise make the library read outside the array. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
begin
  { N is compared against the remaining length instead of testing
    Offset + N, so that the check itself cannot overflow. }
  if (Offset < 0) or (N < 0) or (Offset > Length(Samples)) or
     (N > Length(Samples) - Offset) then
  begin
    WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
      [Length(Samples), Offset, N]
      ));
    Exit;
  end;

  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
    pcfloat(Samples) + Offset, N);
end;
{ Returns True when SherpaOnnxVoiceActivityDetectorEmpty returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
var
  Flag: cint32;
begin
  Flag := SherpaOnnxVoiceActivityDetectorEmpty(Handle);
  Result := (Flag = 1);
end;
{ Returns True when SherpaOnnxVoiceActivityDetectorDetected returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
var
  Flag: cint32;
begin
  Flag := SherpaOnnxVoiceActivityDetectorDetected(Handle);
  Result := (Flag = 1);
end;
{ Forwards to SherpaOnnxVoiceActivityDetectorPop for the wrapped detector. }
procedure TSherpaOnnxVoiceActivityDetector.Pop;
begin
  SherpaOnnxVoiceActivityDetectorPop(Handle);
end;
{ Forwards to SherpaOnnxVoiceActivityDetectorClear for the wrapped detector. }
procedure TSherpaOnnxVoiceActivityDetector.Clear;
begin
  SherpaOnnxVoiceActivityDetectorClear(Handle);
end;
{ Returns a Pascal-owned copy of the segment reported by
  SherpaOnnxVoiceActivityDetectorFront.
  The segment obtained from the library is always released, even when
  allocating the sample array raises an exception. }
function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
var
  P: PSherpaOnnxSpeechSegment;
begin
  Result.Samples := nil;
  P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
  try
    Result.Start := P^.Start;
    SetLength(Result.Samples, P^.N);
    { Result.Samples[0] does not exist for an empty array, so only copy
      when the segment holds at least one sample. }
    if P^.N > 0 then
      Move(P^.Samples^, Result.Samples[0], P^.N * SizeOf(Single));
  finally
    SherpaOnnxDestroySpeechSegment(P);
  end;
end;
{ Forwards to SherpaOnnxVoiceActivityDetectorReset for the wrapped detector. }
procedure TSherpaOnnxVoiceActivityDetector.Reset;
begin
  SherpaOnnxVoiceActivityDetectorReset(Handle);
end;
{ Forwards to SherpaOnnxVoiceActivityDetectorFlush for the wrapped detector. }
procedure TSherpaOnnxVoiceActivityDetector.Flush;
begin
  SherpaOnnxVoiceActivityDetectorFlush(Handle);
end;
end.