Text to speech API for Object Pascal. (#1273)

This commit is contained in:
Fangjun Kuang
2024-08-20 20:52:16 +08:00
committed by GitHub
parent e34a1a2aa3
commit 5a2aa110b8
14 changed files with 905 additions and 22 deletions

View File

@@ -18,6 +18,7 @@
#include "sherpa-onnx/csrc/offline-punctuation.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/online-recognizer.h"
#include "sherpa-onnx/csrc/resample.h"
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
#include "sherpa-onnx/csrc/speaker-embedding-manager.h"
#include "sherpa-onnx/csrc/spoken-language-identification.h"
@@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct(
}
void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; }
// Definition behind the opaque C handle SherpaOnnxLinearResampler. It owns
// the wrapped sherpa_onnx::LinearResample instance through a unique_ptr, so
// deleting the wrapper also destroys the resampler.
struct SherpaOnnxLinearResampler {
std::unique_ptr<sherpa_onnx::LinearResample> impl;
};
// Create a linear resampler and return it as an owning raw pointer.
//
// All four arguments are forwarded unchanged to the constructor of
// sherpa_onnx::LinearResample. The caller must release the returned pointer
// with SherpaOnnxDestroyLinearResampler().
SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
    int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
    int32_t num_zeros) {
  // Hold the wrapper in a unique_ptr while the implementation is being
  // constructed so that it is not leaked if that construction throws.
  auto p = std::make_unique<SherpaOnnxLinearResampler>();

  p->impl = std::make_unique<sherpa_onnx::LinearResample>(
      samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros);

  // Ownership is handed over to the C caller.
  return p.release();
}
// Destroy a resampler previously returned by
// SherpaOnnxCreateLinearResampler(). Deleting the wrapper also destroys the
// LinearResample object held by its `impl` member.
void SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) {
delete p;
}
// Run the wrapped resampler on `input` and return a heap-allocated copy of
// the produced samples.
//
// `input`, `input_dim` and `flush` are handed to LinearResample::Resample()
// as-is. The result and its sample buffer are allocated with new/new[] and
// must be released with SherpaOnnxLinearResamplerResampleFree().
const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush) {
  std::vector<float> resampled;
  p->impl->Resample(input, input_dim, flush, &resampled);

  // Copy the samples out of the temporary vector into a buffer whose
  // lifetime is controlled by the caller.
  const auto count = resampled.size();
  float *buffer = new float[count];
  std::copy_n(resampled.data(), count, buffer);

  auto *result = new SherpaOnnxResampleOut;
  result->n = static_cast<int32_t>(count);
  result->samples = buffer;

  return result;
}
// Free a result returned by SherpaOnnxLinearResamplerResample(): first the
// sample buffer (allocated with new[]), then the struct itself.
//
// A null pointer is accepted and ignored, so callers may pass the result of
// a failed or skipped call without checking it first.
void SherpaOnnxLinearResamplerResampleFree(const SherpaOnnxResampleOut *p) {
  if (p == nullptr) {
    return;
  }

  delete[] p->samples;
  delete p;
}
// Return the input sampling rate reported by the wrapped resampler
// (LinearResample::GetInputSamplingRate()).
int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
const SherpaOnnxLinearResampler *p) {
return p->impl->GetInputSamplingRate();
}
// Return the output sampling rate reported by the wrapped resampler
// (LinearResample::GetOutputSamplingRate()).
int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
const SherpaOnnxLinearResampler *p) {
return p->impl->GetOutputSamplingRate();
}
// Forward to LinearResample::Reset() on the wrapped resampler.
void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
p->impl->Reset();
}

View File

@@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct(
SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text);
// for resampling
// Opaque handle to a linear resampler. Its layout is not declared in this
// header; the API only exchanges pointers to it.
SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler
SherpaOnnxLinearResampler;
/*
Suggested values for the filter parameters:

float min_freq = min(samp_rate_in_hz, samp_rate_out_hz);
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
int32_t lowpass_filter_width = 6;

You can set filter_cutoff_hz to lowpass_cutoff
and set num_zeros to lowpass_filter_width.
*/
// The user has to invoke SherpaOnnxDestroyLinearResampler()
// to free the returned pointer to avoid memory leak
//
// The four arguments are forwarded unchanged to the underlying
// sherpa_onnx::LinearResample object by the implementation.
SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
int32_t num_zeros);
// Destroy a resampler returned by SherpaOnnxCreateLinearResampler().
SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler(
SherpaOnnxLinearResampler *p);
// Reset the resampler by calling Reset() on the underlying implementation.
SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset(
SherpaOnnxLinearResampler *p);
// Result of one call to SherpaOnnxLinearResamplerResample().
typedef struct SherpaOnnxResampleOut {
// Resampled output samples (a copy owned by this struct).
const float *samples;
// Number of elements in `samples`.
int32_t n;
} SherpaOnnxResampleOut;
// The user has to invoke SherpaOnnxLinearResamplerResampleFree()
// to free the returned pointer to avoid memory leak.
//
// If this is the last segment, you can set flush to 1; otherwise, please
// set flush to 0
//
// `input` and `input_dim` are passed to the underlying resampler as the
// input buffer and its size (presumably the number of float samples; confirm
// against sherpa_onnx::LinearResample). The returned struct holds a newly
// allocated copy of the resampled samples.
SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
int32_t flush);
// Free a result returned by SherpaOnnxLinearResamplerResample(), including
// its sample buffer.
SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree(
const SherpaOnnxResampleOut *p);
// Return the input sampling rate reported by the resampler.
SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
const SherpaOnnxLinearResampler *p);
// Return the output sampling rate reported by the resampler.
SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
const SherpaOnnxLinearResampler *p);
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

View File

@@ -1,4 +1,9 @@
{ Copyright (c) 2024 Xiaomi Corporation }
{ Copyright (c) 2024 Xiaomi Corporation
Please see
https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples
for how to use APIs in this file.
}
unit sherpa_onnx;
@@ -7,13 +12,105 @@ unit sherpa_onnx;
{$modeSwitch advancedRecords} { to support records with methods }
{$ENDIF}
(* {$LongStrings ON} *)
{$LongStrings ON}
interface
uses
ctypes;
type
{ Dynamic array of single-precision samples; used as the return type of
  TSherpaOnnxLinearResampler.Resample. }
TSherpaOnnxSamplesArray = array of Single;
{ Object Pascal wrapper around the C linear-resampler API.
  Create derives the low-pass filter settings from the two sample rates and
  Destroy releases the native resampler. }
TSherpaOnnxLinearResampler = class
private
{ Pointer returned by SherpaOnnxCreateLinearResampler. }
Handle: Pointer;
{ Copies of the rates passed to Create. }
InputSampleRate: Integer;
OutputSampleRate: Integer;
public
constructor Create(SampleRateIn: Integer; SampleRateOut: Integer);
destructor Destroy; override;
{ Resamples N samples starting at Samples. Flush is converted with Ord and
  passed to the C API as its flush argument. }
function Resample(Samples: pcfloat;
N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload;
{ Same as above, taking the samples as an open array. }
function Resample(Samples: array of Single;
Flush: Boolean): TSherpaOnnxSamplesArray; overload;
{ Calls SherpaOnnxLinearResamplerReset on the native resampler. }
procedure Reset;
property GetInputSampleRate: Integer Read InputSampleRate;
property GetOutputSampleRate: Integer Read OutputSampleRate;
end;
{ Callback type accepted by the Generate overload that takes a callback.
  The callback receives a pointer to N samples plus the user-supplied Arg.
  NOTE(review): the P... type is a pointer to the procedural type and is what
  gets passed to the C API; presumably callers cast the address of their
  callback to it - confirm against the examples. }
PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg;
TSherpaOnnxGeneratedAudioCallbackWithArg = function(
Samples: pcfloat; N: cint32;
Arg: Pointer): cint; cdecl;
{ Pascal-side configuration of a VITS text-to-speech model.
  TSherpaOnnxOfflineTts.Create copies the numeric fields and passes the
  string fields to the C API as PAnsiChar. }
TSherpaOnnxOfflineTtsVitsModelConfig = record
Model: AnsiString;
Lexicon: AnsiString;
Tokens: AnsiString;
DataDir: AnsiString;
{ Initialize sets this to 0.667. }
NoiseScale: Single;
{ Initialize sets this to 0.8. }
NoiseScaleW: Single;
{ Initialize sets this to 1.0. }
LengthScale: Single;
DictDir: AnsiString;
{ Returns all fields formatted as one line of text. }
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
end;
{ Pascal-side model configuration for offline text-to-speech. }
TSherpaOnnxOfflineTtsModelConfig = record
Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
{ Initialize sets this to 1. }
NumThreads: Integer;
{ Initialize sets this to False; converted with Ord for the C API. }
Debug: Boolean;
{ Initialize sets this to 'cpu'. }
Provider: AnsiString;
{ Returns all fields formatted as one line of text. }
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
end;
{ Top-level Pascal-side configuration passed to
  TSherpaOnnxOfflineTts.Create. }
TSherpaOnnxOfflineTtsConfig = record
Model: TSherpaOnnxOfflineTtsModelConfig;
RuleFsts: AnsiString;
{ Initialize sets this to 1. }
MaxNumSentences: Integer;
RuleFars: AnsiString;
{ Returns all fields formatted as one line of text. }
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
end;
{ Audio returned by TSherpaOnnxOfflineTts.Generate: a Pascal-owned copy of
  the samples produced by the C API together with their sample rate. }
TSherpaOnnxGeneratedAudio = record
Samples: array of Single;
SampleRate: Integer;
end;
{ Object Pascal wrapper around the C offline text-to-speech API. }
TSherpaOnnxOfflineTts = class
private
{ Pointer returned by SherpaOnnxCreateOfflineTts. }
Handle: Pointer;
{ Values queried once from the C API in Create. }
SampleRate: Integer;
NumSpeakers: Integer;
{ Copy of the configuration passed to Create. }
_Config: TSherpaOnnxOfflineTtsConfig;
public
constructor Create(Config: TSherpaOnnxOfflineTtsConfig);
destructor Destroy; override;
{ Generates audio for Text and returns a copy of the produced samples. }
function Generate(Text: AnsiString; SpeakerId: Integer;
Speed: Single): TSherpaOnnxGeneratedAudio; overload;
{ Same as above, additionally passing Callback and Arg to the C API. }
function Generate(Text: AnsiString; SpeakerId: Integer;
Speed: Single;
Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
Arg: Pointer
): TSherpaOnnxGeneratedAudio; overload;
property GetHandle: Pointer Read Handle;
property GetSampleRate: Integer Read SampleRate;
property GetNumSpeakers: Integer Read NumSpeakers;
end;
TSherpaOnnxWave = record
Samples: array of Single; { normalized to the range [-1, 1] }
SampleRate: Integer;
@@ -254,7 +351,6 @@ type
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
TSherpaOnnxSamplesArray = array of Single;
TSherpaOnnxCircularBuffer = class
private
@@ -508,6 +604,94 @@ type
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
{ C-compatible counterpart of TSherpaOnnxOfflineTtsVitsModelConfig: strings
  are PAnsiChar and numbers use ctypes types. Filled in by
  TSherpaOnnxOfflineTts.Create. }
SherpaOnnxOfflineTtsVitsModelConfig = record
Model: PAnsiChar;
Lexicon: PAnsiChar;
Tokens: PAnsiChar;
DataDir: PAnsiChar;
NoiseScale: cfloat;
NoiseScaleW: cfloat;
LengthScale: cfloat;
DictDir: PAnsiChar;
end;
{ C-compatible counterpart of TSherpaOnnxOfflineTtsModelConfig. Debug is an
  integer here; Create fills it with Ord of the Boolean field. }
SherpaOnnxOfflineTtsModelConfig = record
Vits: SherpaOnnxOfflineTtsVitsModelConfig;
NumThreads: cint32;
Debug: cint32;
Provider: PAnsiChar;
end;
{ C-compatible counterpart of TSherpaOnnxOfflineTtsConfig; a pointer to it is
  what SherpaOnnxCreateOfflineTts receives. }
SherpaOnnxOfflineTtsConfig = record
Model: SherpaOnnxOfflineTtsModelConfig;
RuleFsts: PAnsiChar;
MaxNumSentences: cint32;
RuleFars: PAnsiChar;
end;
PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
{ Audio as returned by the C text-to-speech functions: N samples starting at
  Samples, plus the sample rate. The wrapper copies the samples and then
  releases the record with SherpaOnnxDestroyOfflineTtsGeneratedAudio. }
SherpaOnnxGeneratedAudio = record
Samples: pcfloat;
N: cint32;
SampleRate: cint32;
end;
PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio;
{ Result of SherpaOnnxLinearResamplerResample: N samples starting at
  Samples. Released with SherpaOnnxLinearResamplerResampleFree. }
SherpaOnnxResampleOut = record
Samples: pcfloat;
N: cint32;
end;
PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
{ Binding to the C function SherpaOnnxCreateLinearResampler in
  SherpaOnnxLibName. The result is kept as an untyped pointer and must be
  released with SherpaOnnxDestroyLinearResampler. }
function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
SampleRateOutHz: cint32;
FilterCutoffHz: cfloat;
NumZeros: cint32): Pointer; cdecl;
external SherpaOnnxLibName;
{ Binding to the C function that destroys a resampler created above. }
procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl;
external SherpaOnnxLibName;
{ Binding to the C function SherpaOnnxLinearResamplerResample.
  N and Flush are declared as cint32 to match the int32_t parameters of the
  C prototype and the other bindings in this unit. The returned record must
  be released with SherpaOnnxLinearResamplerResampleFree. }
function SherpaOnnxLinearResamplerResample(P: Pointer;
  Samples: pcfloat;
  N: cint32;
  Flush: cint32): PSherpaOnnxResampleOut; cdecl;
  external SherpaOnnxLibName;
{ Binding to the C function that frees a result of
  SherpaOnnxLinearResamplerResample. }
procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl;
external SherpaOnnxLibName;
{ Binding to the C function that resets a resampler. }
procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
external SherpaOnnxLibName;
{ Bindings to the offline text-to-speech functions exported by
  SherpaOnnxLibName. The TTS object is handled as an untyped pointer. }

{ Creates a TTS object from a C-compatible configuration. }
function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
external SherpaOnnxLibName;
{ Destroys a TTS object created by SherpaOnnxCreateOfflineTts. }
procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer); cdecl;
external SherpaOnnxLibName;
{ Queried once by TSherpaOnnxOfflineTts.Create and cached. }
function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Queried once by TSherpaOnnxOfflineTts.Create and cached. }
function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32; cdecl;
external SherpaOnnxLibName;
{ Generates audio for Text. The result must be released with
  SherpaOnnxDestroyOfflineTtsGeneratedAudio. }
function SherpaOnnxOfflineTtsGenerate(Tts: Pointer;
Text: PAnsiChar; Sid: cint32; Speed: cfloat): PSherpaOnnxGeneratedAudio; cdecl;
external SherpaOnnxLibName;
{ Same as above with an additional callback and user argument. The result
  must be released with SherpaOnnxDestroyOfflineTtsGeneratedAudio. }
function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Tts: Pointer;
Text: PAnsiChar; Sid: cint32; Speed: cfloat;
Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
Arg: Pointer): PSherpaOnnxGeneratedAudio; cdecl;
external SherpaOnnxLibName;
{ Releases audio returned by either generate function. }
procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer); cdecl;
external SherpaOnnxLibName;
{ Binding to the C function SherpaOnnxCreateVoiceActivityDetector in
  SherpaOnnxLibName; the detector is returned as an untyped pointer. }
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
BufferSizeInSeconds: cfloat): Pointer; cdecl;
external SherpaOnnxLibName;
@@ -793,8 +977,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz
var
C: SherpaOnnxOnlineRecognizerConfig;
begin
Initialize(C);
C := Default(SherpaOnnxOnlineRecognizerConfig);
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
@@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn
var
C: SherpaOnnxOfflineRecognizerConfig;
begin
Initialize(C);
C := Default(SherpaOnnxOfflineRecognizerConfig);
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
@@ -1369,12 +1551,11 @@ end;
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
var
C: SherpaOnnxVadModelConfig;
C: SherpaOnnxVadModelConfig ;
begin
C := Default(SherpaOnnxVadModelConfig);
Self._Config := Config;
Initialize(C);
C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
C.SileroVad.Threshold := Config.SileroVad.Threshold;
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
@@ -1460,5 +1641,197 @@ begin
SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
end;
end.
{ Renders every field of the VITS model configuration as a single line of
  text; the three scale factors are printed with two decimals. }
function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsVitsModelConfig(' +
    'Model := %s, ' +
    'Lexicon := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'NoiseScale := %.2f, ' +
    'NoiseScaleW := %.2f, ' +
    'LengthScale := %.2f, ' +
    'DictDir := %s' +
    ')';
begin
  Result := Format(Fmt, [
    Self.Model,
    Self.Lexicon,
    Self.Tokens,
    Self.DataDir,
    Self.NoiseScale,
    Self.NoiseScaleW,
    Self.LengthScale,
    Self.DictDir]);
end;
{ Management operator that gives a new record its default scale factors. }
class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
begin
  { The three assignments are independent of each other. }
  Dest.LengthScale := 1.0;
  Dest.NoiseScaleW := 0.8;
  Dest.NoiseScale := 0.667;
end;
{ Renders the model configuration as a single line of text, embedding the
  textual form of the nested VITS configuration. }
function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsModelConfig(' +
    'Vits := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s' +
    ')';
var
  VitsText: AnsiString;
  DebugText: AnsiString;
begin
  VitsText := Self.Vits.ToString;
  DebugText := Self.Debug.ToString;
  Result := Format(Fmt, [VitsText, Self.NumThreads, DebugText, Self.Provider]);
end;
{ Management operator that gives a new record its defaults: CPU provider,
  debugging off and a single thread. }
class operator TSherpaOnnxOfflineTtsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
{ Renders the top-level TTS configuration as a single line of text,
  embedding the textual form of the nested model configuration. }
function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString;
const
  Fmt =
    'TSherpaOnnxOfflineTtsConfig(' +
    'Model := %s, ' +
    'RuleFsts := %s, ' +
    'MaxNumSentences := %d, ' +
    'RuleFars := %s' +
    ')';
var
  ModelText: AnsiString;
begin
  ModelText := Self.Model.ToString;
  Result := Format(Fmt, [ModelText, Self.RuleFsts, Self.MaxNumSentences,
    Self.RuleFars]);
end;
{ Management operator that gives a new record its default
  MaxNumSentences of 1. }
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
begin
Dest.MaxNumSentences := 1;
end;
{ Builds the C-compatible configuration from Config, creates the native TTS
  object and caches its sample rate and number of speakers.
  The PAnsiChar fields point into the strings of the Config parameter, which
  stays alive for the duration of the call into the C API. }
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
var
  Native: SherpaOnnxOfflineTtsConfig;
begin
  Self._Config := Config;
  Native := Default(SherpaOnnxOfflineTtsConfig);

  { VITS model: file locations }
  Native.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model);
  Native.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon);
  Native.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens);
  Native.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir);
  Native.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);

  { VITS model: scale factors }
  Native.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale;
  Native.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW;
  Native.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;

  { Runtime settings; the C struct stores Debug as an integer }
  Native.Model.NumThreads := Config.Model.NumThreads;
  Native.Model.Debug := Ord(Config.Model.Debug);
  Native.Model.Provider := PAnsiChar(Config.Model.Provider);

  { Text-processing settings }
  Native.RuleFsts := PAnsiChar(Config.RuleFsts);
  Native.RuleFars := PAnsiChar(Config.RuleFars);
  Native.MaxNumSentences := Config.MaxNumSentences;

  Self.Handle := SherpaOnnxCreateOfflineTts(@Native);

  Self.SampleRate := SherpaOnnxOfflineTtsSampleRate(Self.Handle);
  Self.NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Self.Handle);
end;
{ Releases the native TTS object and then runs the inherited destructor so
  that the destructor chain stays intact if the base class changes. }
destructor TSherpaOnnxOfflineTts.Destroy;
begin
  SherpaOnnxDestroyOfflineTts(Self.Handle);
  Self.Handle := nil;
  inherited Destroy;
end;
{ Generates audio for Text with the given speaker id and speed.
  Returns a Pascal-owned copy of the samples together with the sample rate.
  If the C API returns nil, the default (empty) record is returned. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text),
    SpeakerId, Speed);

  { Nothing to copy and nothing to free. }
  if Audio = nil then
    Exit;

  { try..finally guarantees that the native audio is released even when
    copying raises, e.g. because SetLength runs out of memory. }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;

    for I := Low(Result.Samples) to High(Result.Samples) do
      Result.Samples[I] := Audio^.Samples[I];
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
{ Generates audio for Text with the given speaker id and speed, passing
  Callback and Arg through to the C API.
  Returns a Pascal-owned copy of the samples together with the sample rate.
  If the C API returns nil, the default (empty) record is returned. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single;
  Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer
  ): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle,
    PAnsiChar(Text), SpeakerId, Speed, Callback, Arg);

  { Nothing to copy and nothing to free. }
  if Audio = nil then
    Exit;

  { try..finally guarantees that the native audio is released even when
    copying raises, e.g. because SetLength runs out of memory. }
  try
    SetLength(Result.Samples, Audio^.N);
    Result.SampleRate := Audio^.SampleRate;

    for I := Low(Result.Samples) to High(Result.Samples) do
      Result.Samples[I] := Audio^.Samples[I];
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
{ Creates the native resampler for the given pair of sample rates.
  The filter cutoff is 0.99 * 0.5 * min(SampleRateIn, SampleRateOut) and the
  number of zeros is fixed at 6. }
constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer);
const
  LowpassFilterWidth = 6;
var
  MinFreq: Single;
  LowpassCutoff: Single;
begin
  Self.InputSampleRate := SampleRateIn;
  Self.OutputSampleRate := SampleRateOut;

  { The lower of the two rates determines the cutoff. }
  MinFreq := SampleRateIn;
  if SampleRateOut < SampleRateIn then
    MinFreq := SampleRateOut;

  LowpassCutoff := 0.99 * 0.5 * MinFreq;

  Self.Handle := SherpaOnnxCreateLinearResampler(SampleRateIn, SampleRateOut,
    LowpassCutoff, LowpassFilterWidth);
end;
{ Releases the native resampler and then runs the inherited destructor so
  that the destructor chain stays intact if the base class changes. }
destructor TSherpaOnnxLinearResampler.Destroy;
begin
  SherpaOnnxDestroyLinearResampler(Self.Handle);
  Self.Handle := nil;
  inherited Destroy;
end;
{ Resamples N samples starting at Samples and returns a Pascal-owned copy of
  the output. Flush is converted with Ord and passed as the flush argument
  of the C API.
  If the C API returns nil, an empty array is returned. }
function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat;
  N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  P: PSherpaOnnxResampleOut;
  I: Integer;
begin
  Result := Default(TSherpaOnnxSamplesArray);

  P := SherpaOnnxLinearResamplerResample(Self.Handle, Samples, N, Ord(Flush));

  { Nothing to copy and nothing to free. }
  if P = nil then
    Exit;

  { try..finally guarantees that the native result is released even when
    copying raises, e.g. because SetLength runs out of memory. }
  try
    SetLength(Result, P^.N);

    for I := Low(Result) to High(Result) do
      Result[I] := P^.Samples[I];
  finally
    SherpaOnnxLinearResamplerResampleFree(P);
  end;
end;
{ Convenience overload: forwards the open array to the pointer-based
  Resample overload, passing a pointer cast of Samples and Length(Samples). }
function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray;
begin
Result := Self.Resample(pcfloat(Samples), Length(Samples), Flush);
end;
{ Resets the native resampler by calling SherpaOnnxLinearResamplerReset on
  the stored handle. }
procedure TSherpaOnnxLinearResampler.Reset;
begin
SherpaOnnxLinearResamplerReset(Self.Handle);
end;
end.