Text to speech API for Object Pascal. (#1273)
This commit is contained in:
26
.github/workflows/pascal.yaml
vendored
26
.github/workflows/pascal.yaml
vendored
@@ -119,13 +119,29 @@ jobs:
|
|||||||
cp -v install/lib/*.dll ../pascal-api-examples/vad
|
cp -v install/lib/*.dll ../pascal-api-examples/vad
|
||||||
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
|
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
|
||||||
|
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
|
||||||
|
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Run Pascal test (TTS)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
|
||||||
|
|
||||||
|
cd ./pascal-api-examples
|
||||||
|
pushd tts
|
||||||
|
|
||||||
|
./run-piper.sh
|
||||||
|
rm -rf vits-piper-*
|
||||||
|
ls -lh
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
popd
|
||||||
|
|
||||||
- name: Run Pascal test (VAD + non-streaming ASR)
|
- name: Run Pascal test (VAD + non-streaming ASR)
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
1
pascal-api-examples/.gitignore
vendored
Normal file
1
pascal-api-examples/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
link*.res
|
||||||
@@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|
|||||||
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
||||||
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
||||||
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|
||||||
|
|[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.|
|
||||||
|
|[tts](./tts)| It shows how to use the text-to-speech API.|
|
||||||
|
|||||||
4
pascal-api-examples/tts/.gitignore
vendored
Normal file
4
pascal-api-examples/tts/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
!run-*.sh
|
||||||
|
piper
|
||||||
|
piper-playback
|
||||||
|
link*.res
|
||||||
9
pascal-api-examples/tts/README.md
Normal file
9
pascal-api-examples/tts/README.md
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Introduction
|
||||||
|
|
||||||
|
This directory contains examples for how to use the TTS (text to speech) APIs.
|
||||||
|
|
||||||
|
|File| Description|
|
||||||
|
|---------|------------|
|
||||||
|
|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.|
|
||||||
|
|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the generated audio while it is still being generated. |
|
||||||
|
|
||||||
238
pascal-api-examples/tts/piper-playback.pas
Normal file
238
pascal-api-examples/tts/piper-playback.pas
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
program piper;
|
||||||
|
{
|
||||||
|
This file shows how to use the text to speech API of sherpa-onnx
|
||||||
|
with Piper models.
|
||||||
|
|
||||||
|
It generates speech from text and saves it to a wave file.
|
||||||
|
|
||||||
|
Note that it plays the audio back while it is still being generated.
|
||||||
|
}
|
||||||
|
|
||||||
|
{$mode objfpc}
|
||||||
|
|
||||||
|
uses
|
||||||
|
{$ifdef unix}
|
||||||
|
cthreads,
|
||||||
|
{$endif}
|
||||||
|
SysUtils,
|
||||||
|
dos,
|
||||||
|
ctypes,
|
||||||
|
portaudio,
|
||||||
|
sherpa_onnx;
|
||||||
|
|
||||||
|
var
|
||||||
|
CriticalSection: TRTLCriticalSection;
|
||||||
|
|
||||||
|
Tts: TSherpaOnnxOfflineTts;
|
||||||
|
Audio: TSherpaOnnxGeneratedAudio;
|
||||||
|
Resampler: TSherpaOnnxLinearResampler;
|
||||||
|
|
||||||
|
Text: AnsiString;
|
||||||
|
Speed: Single = 1.0; {Use a larger value to speak faster}
|
||||||
|
SpeakerId: Integer = 0;
|
||||||
|
Buffer: TSherpaOnnxCircularBuffer;
|
||||||
|
FinishedGeneration: Boolean = False;
|
||||||
|
FinishedPlaying: Boolean = False;
|
||||||
|
|
||||||
|
Version: String;
|
||||||
|
EnvStr: String;
|
||||||
|
Status: Integer;
|
||||||
|
NumDevices: Integer;
|
||||||
|
DeviceIndex: Integer;
|
||||||
|
DeviceInfo: PPaDeviceInfo;
|
||||||
|
|
||||||
|
{ If you get EDivByZero: Division by zero error, please change the sample rate
|
||||||
|
to the one supported by your output device (speaker).
|
||||||
|
}
|
||||||
|
DeviceSampleRate: Integer = 48000;
|
||||||
|
I: Integer;
|
||||||
|
Param: TPaStreamParameters;
|
||||||
|
Stream: PPaStream;
|
||||||
|
Wave: TSherpaOnnxWave;
|
||||||
|
|
||||||
|
{ Callback passed to Tts.Generate; it receives each chunk of generated audio.

  Samples points to N float samples. If a Resampler has been created, the
  chunk is resampled before it is pushed into Buffer; otherwise it is pushed
  unchanged. Buffer is also read by PlayCallback, so every access is made
  while holding CriticalSection.

  Arg is not used. The function always returns 1. }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||||||
|
|
||||||
|
{ PortAudio stream callback: fills output with frameCount float samples taken
  from the shared Buffer.

  If Buffer holds fewer than frameCount samples, everything available is
  consumed and the remainder of the output is filled with zeros (silence).

  Returns paContinue while Buffer still holds samples or generation has not
  finished; otherwise returns paComplete and sets FinishedPlaying so that the
  main program can stop waiting.

  input, timeInfo, statusFlags and userData are not used. All accesses to
  Buffer and the Finished* flags are made while holding CriticalSection. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Samples: TSherpaOnnxSamplesArray;
  NumFrames: Integer;
  Available: Integer;
  I: Integer;
begin
  { Convert the unsigned frame count to a signed value once. Computing
    "frameCount - 1" on the unsigned culong wraps around when frameCount
    is 0, and the unsigned value was also compared directly with
    Buffer.Size. }
  NumFrames := Integer(frameCount);

  EnterCriticalSection(CriticalSection);
  try
    { Take at most NumFrames samples from the buffer. }
    Available := Buffer.Size;
    if Available > NumFrames then
      Available := NumFrames;

    if Available > 0 then
    begin
      Samples := Buffer.Get(Buffer.Head, Available);
      Buffer.Pop(Available);
    end;

    { Growing a dynamic array zero-fills the new elements, so any part of the
      output that could not be served from Buffer becomes silence. }
    SetLength(Samples, NumFrames);

    for I := 0 to NumFrames - 1 do
      pcfloat(output)[I] := Samples[I];

    if (Buffer.Size > 0) or (not FinishedGeneration) then
      Result := paContinue
    else
    begin
      Result := paComplete;
      FinishedPlaying := True;
    end;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
|
||||||
|
|
||||||
|
{ Creates the text-to-speech object used by this example.

  The configuration points at the files of the
  vits-piper-en_US-libritts_r-medium model; all paths are relative to the
  current working directory. The main program frees the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { Runtime options }
  Config.MaxNumSentences := 1;
  Config.Model.NumThreads := 1;
  Config.Model.Debug := False;

  { Model files }
  Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
|
||||||
|
|
||||||
|
{ Main program.

  1. Create the TTS object and, if its sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initialize PortAudio, select the output device (the default one, or the
     index given in the SHERPA_ONNX_MIC_DEVICE environment variable) and open
     an output stream served by PlayCallback.
  3. Generate the audio; GenerateCallback feeds the chunks into Buffer while
     PlayCallback plays them.
  4. Save the complete audio to a wave file, wait until playback has
     finished, then release everything. }
begin
  Tts := GetOfflineTts;

  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;

  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { The index may come from the environment, so validate it before it is
    used: the result of Pa_GetDeviceInfo is dereferenced below. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index: ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Create the lock before the stream exists so that PlayCallback can never
    run against an uninitialized critical section. }
  InitCriticalSection(CriticalSection);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  Status := Pa_StartStream(stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { Release what has been acquired so far. }
    Pa_CloseStream(stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);

  { PlayCallback reads this flag while holding the lock, so write it under
    the same lock. }
  EnterCriticalSection(CriticalSection);
  try
    FinishedGeneration := True;
  finally
    LeaveCriticalSection(CriticalSection);
  end;

  SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./libritts_r-generated.wav');

  while not FinishedPlaying do
    Pa_Sleep(100); {sleep for 0.1 second }
  {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream before destroying the critical section: PlayCallback
    sets FinishedPlaying while it still holds the lock, so the callback
    thread may not have left the critical section yet when the loop above
    ends. }
  Status := Pa_CloseStream(stream);

  DoneCriticalSection(CriticalSection);

  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);

  if Status <> paNoError then
  begin
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
    Exit;
  end;

  Status := Pa_Terminate;
  if Status <> paNoError then
  begin
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;
end.
|
||||||
|
|
||||||
54
pascal-api-examples/tts/piper.pas
Normal file
54
pascal-api-examples/tts/piper.pas
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
program piper;
|
||||||
|
{
|
||||||
|
This file shows how to use the text to speech API of sherpa-onnx
|
||||||
|
with Piper models.
|
||||||
|
|
||||||
|
It generates speech from text and saves it to a wave file.
|
||||||
|
|
||||||
|
If you want to play it while it is generating, please see
|
||||||
|
./piper-playback.pas
|
||||||
|
}
|
||||||
|
|
||||||
|
{$mode objfpc}
|
||||||
|
|
||||||
|
uses
|
||||||
|
SysUtils,
|
||||||
|
sherpa_onnx;
|
||||||
|
|
||||||
|
{ Returns a TTS object configured for the
  vits-piper-en_US-libritts_r-medium model.

  The model, tokens and espeak-ng data paths are relative to the current
  working directory. The main program frees the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { General options }
  Config.MaxNumSentences := 1;
  Config.Model.Debug := False;
  Config.Model.NumThreads := 1;

  { Files that make up the model }
  Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
|
||||||
|
|
||||||
|
var
|
||||||
|
Tts: TSherpaOnnxOfflineTts;
|
||||||
|
Audio: TSherpaOnnxGeneratedAudio;
|
||||||
|
|
||||||
|
Text: AnsiString;
|
||||||
|
Speed: Single = 1.0; {Use a larger value to speak faster}
|
||||||
|
SpeakerId: Integer = 0;
|
||||||
|
|
||||||
|
{ Main program: create the TTS object, synthesize Text with the selected
  speaker and speed, and save the result to ./libritts_r-generated.wav. }
begin
  Tts := GetOfflineTts;
  try
    WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

    Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

    Audio := Tts.Generate(Text, SpeakerId, Speed);
    SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
    WriteLn('Saved to ./libritts_r-generated.wav');
  finally
    { Release the TTS object even if generating or saving raises. }
    FreeAndNil(Tts);
  end;
end.
|
||||||
|
|
||||||
45
pascal-api-examples/tts/run-piper-playback.sh
Executable file
45
pascal-api-examples/tts/run-piper-playback.sh
Executable file
@@ -0,0 +1,45 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the vits-piper-en_US-libritts_r-medium model if it is missing, compiles
# piper-playback.pas with fpc and runs the resulting program.
#
# Environment:
#   PORTAUDIO_LIB_DIR  directory containing the portaudio library
#                      (default: /usr/local/Cellar/portaudio/19.7.0/lib)

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model directory, the .pas source and the model paths compiled into the
# program are all relative to this script's directory, so run from there no
# matter where the script was invoked from.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
fi

# Please see ../portaudio-test/README.md
# for how to install portaudio on macOS
fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  -Fl"${PORTAUDIO_LIB_DIR:-/usr/local/Cellar/portaudio/19.7.0/lib}" \
  ./piper-playback.pas

export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./piper-playback
|
||||||
41
pascal-api-examples/tts/run-piper.sh
Executable file
41
pascal-api-examples/tts/run-piper.sh
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the vits-piper-en_US-libritts_r-medium model if it is missing, compiles
# piper.pas with fpc and runs the resulting program.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model directory, the .pas source and the model paths compiled into the
# program are all relative to this script's directory, so run from there no
# matter where the script was invoked from.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./piper.pas

export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./piper
|
||||||
@@ -190,9 +190,9 @@ def get_piper_models() -> List[TtsModel]:
|
|||||||
TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
|
TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
|
TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
|
||||||
TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
|
TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
|
||||||
TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
|
TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
|
||||||
TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
|
TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
|
||||||
TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
|
TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
|
||||||
|
|||||||
@@ -180,9 +180,9 @@ def get_piper_models() -> List[TtsModel]:
|
|||||||
TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
|
TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
|
TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
|
||||||
TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
|
TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
|
||||||
TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
|
# TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
|
||||||
TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
|
TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
|
||||||
TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
|
TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
|
||||||
TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
|
TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
#include "sherpa-onnx/csrc/offline-punctuation.h"
|
#include "sherpa-onnx/csrc/offline-punctuation.h"
|
||||||
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
#include "sherpa-onnx/csrc/offline-recognizer.h"
|
||||||
#include "sherpa-onnx/csrc/online-recognizer.h"
|
#include "sherpa-onnx/csrc/online-recognizer.h"
|
||||||
|
#include "sherpa-onnx/csrc/resample.h"
|
||||||
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
|
#include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
|
||||||
#include "sherpa-onnx/csrc/speaker-embedding-manager.h"
|
#include "sherpa-onnx/csrc/speaker-embedding-manager.h"
|
||||||
#include "sherpa-onnx/csrc/spoken-language-identification.h"
|
#include "sherpa-onnx/csrc/spoken-language-identification.h"
|
||||||
@@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; }
|
void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; }
|
||||||
|
|
||||||
|
struct SherpaOnnxLinearResampler {
|
||||||
|
std::unique_ptr<sherpa_onnx::LinearResample> impl;
|
||||||
|
};
|
||||||
|
|
||||||
|
SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
|
||||||
|
int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
|
||||||
|
int32_t num_zeros) {
|
||||||
|
SherpaOnnxLinearResampler *p = new SherpaOnnxLinearResampler;
|
||||||
|
p->impl = std::make_unique<sherpa_onnx::LinearResample>(
|
||||||
|
samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros);
|
||||||
|
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Releases a handle created by SherpaOnnxCreateLinearResampler().
// Deleting a null pointer has no effect, so passing nullptr is allowed.
void SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) { delete p; }
|
||||||
|
|
||||||
|
// Resamples `input_dim` samples starting at `input` and returns the result
// in a newly allocated SherpaOnnxResampleOut.
//
// `flush` is forwarded to LinearResample::Resample().
//
// Both the returned struct and its `samples` array are heap-allocated here;
// release them with SherpaOnnxLinearResamplerResampleFree().
const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush) {
  std::vector<float> resampled;
  p->impl->Resample(input, input_dim, flush, &resampled);

  // Copy the samples into a plain array that can outlive the vector.
  const auto num_samples = resampled.size();
  float *buffer = new float[num_samples];
  std::copy(resampled.begin(), resampled.end(), buffer);

  auto *out = new SherpaOnnxResampleOut;
  out->samples = buffer;
  out->n = static_cast<int32_t>(num_samples);

  return out;
}
|
||||||
|
|
||||||
|
// Frees a result returned by SherpaOnnxLinearResamplerResample(): first the
// sample array, then the struct itself. Passing nullptr is a no-op.
void SherpaOnnxLinearResamplerResampleFree(const SherpaOnnxResampleOut *p) {
  if (!p) {
    // Without this guard, p->samples would dereference a null pointer.
    return;
  }

  delete[] p->samples;
  delete p;
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
|
||||||
|
const SherpaOnnxLinearResampler *p) {
|
||||||
|
return p->impl->GetInputSamplingRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
|
||||||
|
const SherpaOnnxLinearResampler *p) {
|
||||||
|
return p->impl->GetOutputSamplingRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calls Reset() on the wrapped resampler.
// `p` is dereferenced without a check, so it must not be null.
void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
  auto &resampler = *p->impl;
  resampler.Reset();
}
|
||||||
|
|||||||
@@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct(
|
|||||||
|
|
||||||
SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text);
|
SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text);
|
||||||
|
|
||||||
|
// for resampling
|
||||||
|
SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler
|
||||||
|
SherpaOnnxLinearResampler;
|
||||||
|
|
||||||
|
/*
|
||||||
|
float min_freq = min(samp_rate_in_hz, samp_rate_out_hz);
|
||||||
|
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
||||||
|
int32_t lowpass_filter_width = 6;
|
||||||
|
|
||||||
|
You can set filter_cutoff_hz to lowpass_cutoff
|
||||||
|
and set num_zeros to lowpass_filter_width
|
||||||
|
*/
|
||||||
|
// The user has to invoke SherpaOnnxDestroyLinearResampler()
|
||||||
|
// to free the returned pointer to avoid memory leak
|
||||||
|
SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
|
||||||
|
int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
|
||||||
|
int32_t num_zeros);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler(
|
||||||
|
SherpaOnnxLinearResampler *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset(
|
||||||
|
SherpaOnnxLinearResampler *p);
|
||||||
|
|
||||||
|
typedef struct SherpaOnnxResampleOut {
|
||||||
|
const float *samples;
|
||||||
|
int32_t n;
|
||||||
|
} SherpaOnnxResampleOut;
|
||||||
|
// The user has to invoke SherpaOnnxLinearResamplerResampleFree()
|
||||||
|
// to free the returned pointer to avoid memory leak.
|
||||||
|
//
|
||||||
|
// If this is the last segment, you can set flush to 1; otherwise, please
|
||||||
|
// set flush to 0
|
||||||
|
SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
|
||||||
|
SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
|
||||||
|
int32_t flush);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree(
|
||||||
|
const SherpaOnnxResampleOut *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
|
||||||
|
const SherpaOnnxLinearResampler *p);
|
||||||
|
|
||||||
|
SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
|
||||||
|
const SherpaOnnxLinearResampler *p);
|
||||||
|
|
||||||
#if defined(__GNUC__)
|
#if defined(__GNUC__)
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -1,4 +1,9 @@
|
|||||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
{ Copyright (c) 2024 Xiaomi Corporation
|
||||||
|
|
||||||
|
Please see
|
||||||
|
https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples
|
||||||
|
for how to use APIs in this file.
|
||||||
|
}
|
||||||
|
|
||||||
unit sherpa_onnx;
|
unit sherpa_onnx;
|
||||||
|
|
||||||
@@ -7,13 +12,105 @@ unit sherpa_onnx;
|
|||||||
{$modeSwitch advancedRecords} { to support records with methods }
|
{$modeSwitch advancedRecords} { to support records with methods }
|
||||||
{$ENDIF}
|
{$ENDIF}
|
||||||
|
|
||||||
(* {$LongStrings ON} *)
|
{$LongStrings ON}
|
||||||
|
|
||||||
interface
|
interface
|
||||||
uses
|
uses
|
||||||
ctypes;
|
ctypes;
|
||||||
|
|
||||||
type
|
type
|
||||||
|
TSherpaOnnxSamplesArray = array of Single;
|
||||||
|
|
||||||
|
{ Object Pascal wrapper class for a linear resampler.

  Create takes the input and the output sample rate. The two Resample
  overloads accept either a pcfloat pointer plus a sample count N, or an
  array of Single; both return a TSherpaOnnxSamplesArray.

  GetInputSampleRate and GetOutputSampleRate are read-only properties backed
  by the private InputSampleRate and OutputSampleRate fields.

  NOTE(review): Handle is presumably the pointer returned by
  SherpaOnnxCreateLinearResampler, and Flush presumably should be True only
  for the last chunk - confirm both against the implementation section. }
TSherpaOnnxLinearResampler = class
|
||||||
|
private
|
||||||
|
Handle: Pointer;
|
||||||
|
InputSampleRate: Integer;
|
||||||
|
OutputSampleRate: Integer;
|
||||||
|
public
|
||||||
|
constructor Create(SampleRateIn: Integer; SampleRateOut: Integer);
|
||||||
|
destructor Destroy; override;
|
||||||
|
|
||||||
|
function Resample(Samples: pcfloat;
|
||||||
|
N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload;
|
||||||
|
|
||||||
|
function Resample(Samples: array of Single;
|
||||||
|
Flush: Boolean): TSherpaOnnxSamplesArray; overload;
|
||||||
|
|
||||||
|
procedure Reset;
|
||||||
|
|
||||||
|
property GetInputSampleRate: Integer Read InputSampleRate;
|
||||||
|
property GetOutputSampleRate: Integer Read OutputSampleRate;
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Callback type accepted by the callback overload of
  TSherpaOnnxOfflineTts.Generate, together with its pointer type.

  The callback uses the cdecl calling convention. It receives a pointer to
  float samples (Samples), a count (N) and a user pointer (Arg), and returns
  a cint.

  NOTE(review): the tts example returns 1 to continue generating and states
  that 0 stops generation - confirm against the C API. }
PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg;
|
||||||
|
|
||||||
|
TSherpaOnnxGeneratedAudioCallbackWithArg = function(
|
||||||
|
Samples: pcfloat; N: cint32;
|
||||||
|
Arg: Pointer): cint; cdecl;
|
||||||
|
|
||||||
|
{ Pascal-side configuration of a VITS text-to-speech model.

  The string fields are AnsiString and the scale fields are Single. The
  record declares a ToString method and an Initialize management operator;
  their implementations are not part of this declaration.

  NOTE(review): the tts examples set Model, Tokens and DataDir to file and
  directory paths; the meaning and defaults of the remaining fields are
  presumably defined by Initialize - confirm there. }
TSherpaOnnxOfflineTtsVitsModelConfig = record
|
||||||
|
Model: AnsiString;
|
||||||
|
Lexicon: AnsiString;
|
||||||
|
Tokens: AnsiString;
|
||||||
|
DataDir: AnsiString;
|
||||||
|
NoiseScale: Single;
|
||||||
|
NoiseScaleW: Single;
|
||||||
|
LengthScale: Single;
|
||||||
|
DictDir: AnsiString;
|
||||||
|
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Pascal-side model configuration for offline text to speech.

  It embeds the VITS model configuration (Vits) and adds NumThreads, Debug
  and Provider. The record declares a ToString method and an Initialize
  management operator; their implementations are not part of this
  declaration.

  NOTE(review): the default values are presumably assigned by Initialize -
  confirm there. }
TSherpaOnnxOfflineTtsModelConfig = record
|
||||||
|
Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
|
||||||
|
NumThreads: Integer;
|
||||||
|
Debug: Boolean;
|
||||||
|
Provider: AnsiString;
|
||||||
|
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Top-level Pascal-side configuration passed to
  TSherpaOnnxOfflineTts.Create.

  It embeds the model configuration (Model) and adds RuleFsts,
  MaxNumSentences and RuleFars. The record declares a ToString method and an
  Initialize management operator; their implementations are not part of this
  declaration.

  NOTE(review): the format of RuleFsts and RuleFars is not visible here -
  confirm against the C API documentation. }
TSherpaOnnxOfflineTtsConfig = record
|
||||||
|
Model: TSherpaOnnxOfflineTtsModelConfig;
|
||||||
|
RuleFsts: AnsiString;
|
||||||
|
MaxNumSentences: Integer;
|
||||||
|
RuleFars: AnsiString;
|
||||||
|
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Result type of TSherpaOnnxOfflineTts.Generate: the generated samples as a
  dynamic array of Single, plus their sample rate. }
TSherpaOnnxGeneratedAudio = record
|
||||||
|
Samples: array of Single;
|
||||||
|
SampleRate: Integer;
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Object Pascal wrapper class for offline text to speech.

  Create takes a TSherpaOnnxOfflineTtsConfig. Generate converts Text to
  audio for the given SpeakerId and Speed and returns a
  TSherpaOnnxGeneratedAudio; the second overload additionally takes a
  callback and a user pointer (Arg).

  GetHandle, GetSampleRate and GetNumSpeakers are read-only properties
  backed by the private Handle, SampleRate and NumSpeakers fields.

  NOTE(review): Handle is presumably the pointer returned by the C API and
  the callback is presumably invoked with each generated chunk - confirm
  both against the implementation section. }
TSherpaOnnxOfflineTts = class
|
||||||
|
private
|
||||||
|
Handle: Pointer;
|
||||||
|
SampleRate: Integer;
|
||||||
|
NumSpeakers: Integer;
|
||||||
|
_Config: TSherpaOnnxOfflineTtsConfig;
|
||||||
|
public
|
||||||
|
constructor Create(Config: TSherpaOnnxOfflineTtsConfig);
|
||||||
|
destructor Destroy; override;
|
||||||
|
|
||||||
|
function Generate(Text: AnsiString; SpeakerId: Integer;
|
||||||
|
Speed: Single): TSherpaOnnxGeneratedAudio; overload;
|
||||||
|
|
||||||
|
function Generate(Text: AnsiString; SpeakerId: Integer;
|
||||||
|
Speed: Single;
|
||||||
|
Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
|
||||||
|
Arg: Pointer
|
||||||
|
): TSherpaOnnxGeneratedAudio; overload;
|
||||||
|
|
||||||
|
property GetHandle: Pointer Read Handle;
|
||||||
|
property GetSampleRate: Integer Read SampleRate;
|
||||||
|
property GetNumSpeakers: Integer Read NumSpeakers;
|
||||||
|
end;
|
||||||
|
|
||||||
TSherpaOnnxWave = record
|
TSherpaOnnxWave = record
|
||||||
Samples: array of Single; { normalized to the range [-1, 1] }
|
Samples: array of Single; { normalized to the range [-1, 1] }
|
||||||
SampleRate: Integer;
|
SampleRate: Integer;
|
||||||
@@ -254,7 +351,6 @@ type
|
|||||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxSamplesArray = array of Single;
|
|
||||||
|
|
||||||
TSherpaOnnxCircularBuffer = class
|
TSherpaOnnxCircularBuffer = class
|
||||||
private
|
private
|
||||||
@@ -508,6 +604,94 @@ type
|
|||||||
|
|
||||||
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
|
PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
|
||||||
|
|
||||||
|
{ C-layout counterpart of TSherpaOnnxOfflineTtsVitsModelConfig.
  Field order is part of the binary layout and must not be changed. }
SherpaOnnxOfflineTtsVitsModelConfig = record
  Model, Lexicon, Tokens, DataDir: PAnsiChar;
  NoiseScale, NoiseScaleW, LengthScale: cfloat;
  DictDir: PAnsiChar;
end;
|
||||||
|
|
||||||
|
{ C-layout counterpart of TSherpaOnnxOfflineTtsModelConfig.
  Field order is part of the binary layout and must not be changed. }
SherpaOnnxOfflineTtsModelConfig = record
  Vits: SherpaOnnxOfflineTtsVitsModelConfig;
  NumThreads, Debug: cint32;  { Debug is filled with Ord of a Boolean by the wrapper }
  Provider: PAnsiChar;
end;
|
||||||
|
|
||||||
|
{ C-layout counterpart of TSherpaOnnxOfflineTtsConfig; a pointer to it is
  what SherpaOnnxCreateOfflineTts receives.
  Field order is part of the binary layout and must not be changed. }
SherpaOnnxOfflineTtsConfig = record
  Model: SherpaOnnxOfflineTtsModelConfig;
  RuleFsts: PAnsiChar;
  MaxNumSentences: cint32;
  RuleFars: PAnsiChar;
end;

PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
|
||||||
|
|
||||||
|
{ Audio buffer as returned by the C text-to-speech functions.
  The wrapper reads N entries from Samples and then hands the pointer back
  to SherpaOnnxDestroyOfflineTtsGeneratedAudio. }
SherpaOnnxGeneratedAudio = record
  Samples: pcfloat;
  N, SampleRate: cint32;
end;

PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio;
|
||||||
|
|
||||||
|
{ Output buffer of SherpaOnnxLinearResamplerResample: N samples starting at
  Samples. Released with SherpaOnnxLinearResamplerResampleFree. }
SherpaOnnxResampleOut = record
  Samples: pcfloat;
  N: cint32;
end;

PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
|
||||||
|
|
||||||
|
{ Bindings for the linear resampler of the sherpa-onnx C library.
  All integer parameters are declared as cint32 so that their width does not
  depend on the compiler mode (the size of Integer varies between modes). }

{ Creates a resampler and returns its opaque handle. }
function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
  SampleRateOutHz: cint32;
  FilterCutoffHz: cfloat;
  NumZeros: cint32): Pointer; cdecl;
  external SherpaOnnxLibName;

{ Releases a handle obtained from SherpaOnnxCreateLinearResampler. }
procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl;
  external SherpaOnnxLibName;

{ Resamples N input samples; Flush is passed as 0 or 1 by the wrapper.
  The result must be released with SherpaOnnxLinearResamplerResampleFree. }
function SherpaOnnxLinearResamplerResample(P: Pointer;
  Samples: pcfloat;
  N: cint32;
  Flush: cint32): PSherpaOnnxResampleOut; cdecl;
  external SherpaOnnxLibName;

{ Releases a buffer returned by SherpaOnnxLinearResamplerResample. }
procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl;
  external SherpaOnnxLibName;

{ Resets the resampler identified by P. }
procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
  external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
{ Bindings for the offline text-to-speech part of the sherpa-onnx C library. }

{ Creates a TTS engine from Config and returns its opaque handle. }
function SherpaOnnxCreateOfflineTts(
  Config: PSherpaOnnxOfflineTtsConfig): Pointer;
  cdecl; external SherpaOnnxLibName;

{ Releases a handle obtained from SherpaOnnxCreateOfflineTts. }
procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer);
  cdecl; external SherpaOnnxLibName;

{ Queries the sample rate of the engine. }
function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32;
  cdecl; external SherpaOnnxLibName;

{ Queries the number of speakers of the engine. }
function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32;
  cdecl; external SherpaOnnxLibName;

{ Synthesizes Text; the result must be released with
  SherpaOnnxDestroyOfflineTtsGeneratedAudio. }
function SherpaOnnxOfflineTtsGenerate(
  Tts: Pointer;
  Text: PAnsiChar;
  Sid: cint32;
  Speed: cfloat): PSherpaOnnxGeneratedAudio;
  cdecl; external SherpaOnnxLibName;

{ Like SherpaOnnxOfflineTtsGenerate, with a callback and a user argument. }
function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(
  Tts: Pointer;
  Text: PAnsiChar;
  Sid: cint32;
  Speed: cfloat;
  Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer): PSherpaOnnxGeneratedAudio;
  cdecl; external SherpaOnnxLibName;

{ Releases audio returned by either generate function. }
procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer);
  cdecl; external SherpaOnnxLibName;
|
||||||
|
|
||||||
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
|
function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
|
||||||
BufferSizeInSeconds: cfloat): Pointer; cdecl;
|
BufferSizeInSeconds: cfloat): Pointer; cdecl;
|
||||||
external SherpaOnnxLibName;
|
external SherpaOnnxLibName;
|
||||||
@@ -793,8 +977,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz
|
|||||||
var
|
var
|
||||||
C: SherpaOnnxOnlineRecognizerConfig;
|
C: SherpaOnnxOnlineRecognizerConfig;
|
||||||
begin
|
begin
|
||||||
Initialize(C);
|
C := Default(SherpaOnnxOnlineRecognizerConfig);
|
||||||
|
|
||||||
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
|
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
|
||||||
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
|
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
|
||||||
|
|
||||||
@@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn
|
|||||||
var
|
var
|
||||||
C: SherpaOnnxOfflineRecognizerConfig;
|
C: SherpaOnnxOfflineRecognizerConfig;
|
||||||
begin
|
begin
|
||||||
Initialize(C);
|
C := Default(SherpaOnnxOfflineRecognizerConfig);
|
||||||
|
|
||||||
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
|
C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
|
||||||
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
|
C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
|
||||||
|
|
||||||
@@ -1369,12 +1551,11 @@ end;
|
|||||||
|
|
||||||
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
|
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
|
||||||
var
|
var
|
||||||
C: SherpaOnnxVadModelConfig;
|
C: SherpaOnnxVadModelConfig ;
|
||||||
begin
|
begin
|
||||||
|
C := Default(SherpaOnnxVadModelConfig);
|
||||||
Self._Config := Config;
|
Self._Config := Config;
|
||||||
|
|
||||||
Initialize(C);
|
|
||||||
|
|
||||||
C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
|
C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
|
||||||
C.SileroVad.Threshold := Config.SileroVad.Threshold;
|
C.SileroVad.Threshold := Config.SileroVad.Threshold;
|
||||||
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
|
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
|
||||||
@@ -1460,5 +1641,197 @@ begin
|
|||||||
SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
|
SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
end.
|
{ Renders every field of the VITS model configuration as a single line of
  text; the three scale values are printed with two decimals. }
function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxOfflineTtsVitsModelConfig(' +
    'Model := %s, ' +
    'Lexicon := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'NoiseScale := %.2f, ' +
    'NoiseScaleW := %.2f, ' +
    'LengthScale := %.2f, ' +
    'DictDir := %s' +
    ')';
begin
  Result := Format(Layout, [
    Self.Model,
    Self.Lexicon,
    Self.Tokens,
    Self.DataDir,
    Self.NoiseScale,
    Self.NoiseScaleW,
    Self.LengthScale,
    Self.DictDir]);
end;
|
||||||
|
|
||||||
|
{ Management operator that assigns the default scale values to a new record. }
class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
begin
  { The three assignments are independent of each other. }
  Dest.LengthScale := 1.0;
  Dest.NoiseScaleW := 0.8;
  Dest.NoiseScale := 0.667;
end;
|
||||||
|
|
||||||
|
{ Renders the model configuration, including the nested VITS section, as a
  single line of text. }
function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxOfflineTtsModelConfig(' +
    'Vits := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s' +
    ')';
begin
  Result := Format(Layout, [
    Self.Vits.ToString,
    Self.NumThreads,
    Self.Debug.ToString,
    Self.Provider]);
end;
|
||||||
|
|
||||||
|
{ Management operator that assigns the defaults of a new model
  configuration: one thread, debugging off, provider 'cpu'. }
class operator TSherpaOnnxOfflineTtsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
begin
  { The three assignments are independent of each other. }
  Dest.Provider := 'cpu';
  Dest.Debug := False;
  Dest.NumThreads := 1;
end;
|
||||||
|
|
||||||
|
{ Renders the complete TTS configuration, including the nested model
  section, as a single line of text. }
function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxOfflineTtsConfig(' +
    'Model := %s, ' +
    'RuleFsts := %s, ' +
    'MaxNumSentences := %d, ' +
    'RuleFars := %s' +
    ')';
begin
  Result := Format(Layout, [
    Self.Model.ToString,
    Self.RuleFsts,
    Self.MaxNumSentences,
    Self.RuleFars]);
end;
|
||||||
|
|
||||||
|
{ Management operator for a new TTS configuration; the only field it sets
  explicitly is MaxNumSentences, which defaults to 1. }
class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
begin
  Dest.MaxNumSentences := 1;
end;
|
||||||
|
|
||||||
|
{ Builds the C-layout configuration from Config, creates the native TTS
  engine and caches its sample rate and number of speakers.
  The PAnsiChar fields point into the strings of Config; they are only
  handed to the C API for the duration of the create call made here. }
constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
var
  Native: SherpaOnnxOfflineTtsConfig;
begin
  Native := Default(SherpaOnnxOfflineTtsConfig);
  _Config := Config;

  { VITS model section: file paths first, then the numeric scales }
  Native.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model);
  Native.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon);
  Native.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens);
  Native.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir);
  Native.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);
  Native.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale;
  Native.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW;
  Native.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;

  { General model settings; the Boolean flag is passed as 0 or 1 }
  Native.Model.NumThreads := Config.Model.NumThreads;
  Native.Model.Debug := Ord(Config.Model.Debug);
  Native.Model.Provider := PAnsiChar(Config.Model.Provider);

  { Top-level settings }
  Native.RuleFsts := PAnsiChar(Config.RuleFsts);
  Native.RuleFars := PAnsiChar(Config.RuleFars);
  Native.MaxNumSentences := Config.MaxNumSentences;

  Handle := SherpaOnnxCreateOfflineTts(@Native);

  SampleRate := SherpaOnnxOfflineTtsSampleRate(Handle);
  NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Handle);
end;
|
||||||
|
|
||||||
|
{ Releases the native TTS engine and then runs the inherited destructor.
  The handle is checked first so that destroying an object whose handle was
  never created does not pass nil to the C library. }
destructor TSherpaOnnxOfflineTts.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyOfflineTts(Self.Handle);
    Self.Handle := nil;
  end;

  inherited Destroy;
end;
|
||||||
|
|
||||||
|
{ Synthesizes Text with the given speaker and speed and returns a copy of
  the generated samples together with their sample rate.

  If the C API returns nil, or an audio object without samples, the result
  has an empty Samples array. The native audio object is always released,
  even when copying the samples raises an exception. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text), SpeakerId, Speed);

  { Nothing to copy and nothing to free }
  if Audio = nil then
    Exit;

  try
    Result.SampleRate := Audio^.SampleRate;

    if (Audio^.Samples <> nil) and (Audio^.N > 0) then
    begin
      SetLength(Result.Samples, Audio^.N);

      for I := Low(Result.Samples) to High(Result.Samples) do
        Result.Samples[I] := Audio^.Samples[I];
    end;
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
|
||||||
|
|
||||||
|
{ Synthesizes Text with the given speaker and speed, forwarding Callback
  and Arg unchanged to the C API, and returns a copy of the generated
  samples together with their sample rate.

  If the C API returns nil, or an audio object without samples, the result
  has an empty Samples array. The native audio object is always released,
  even when copying the samples raises an exception. }
function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single;
  Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer
  ): TSherpaOnnxGeneratedAudio;
var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
begin
  Result := Default(TSherpaOnnxGeneratedAudio);

  Audio := SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle, PAnsiChar(Text),
    SpeakerId, Speed, Callback, Arg);

  { Nothing to copy and nothing to free }
  if Audio = nil then
    Exit;

  try
    Result.SampleRate := Audio^.SampleRate;

    if (Audio^.Samples <> nil) and (Audio^.N > 0) then
    begin
      SetLength(Result.Samples, Audio^.N);

      for I := Low(Result.Samples) to High(Result.Samples) do
        Result.Samples[I] := Audio^.Samples[I];
    end;
  finally
    SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
  end;
end;
|
||||||
|
|
||||||
|
{ Creates the native resampler for the given input and output rates.
  The low-pass cutoff passed to the C API is 0.99 * 0.5 times the smaller of
  the two rates, and the number of zeros is fixed at 6. }
constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer);
const
  NumZeros = 6;
var
  LowerRate: Single;
  CutoffHz: Single;
begin
  { Start from the input rate and replace it if the output rate is lower }
  LowerRate := SampleRateIn;
  if SampleRateIn > SampleRateOut then
    LowerRate := SampleRateOut;

  CutoffHz := 0.99 * 0.5 * LowerRate;

  InputSampleRate := SampleRateIn;
  OutputSampleRate := SampleRateOut;
  Handle := SherpaOnnxCreateLinearResampler(SampleRateIn,
    SampleRateOut, CutoffHz, NumZeros);
end;
|
||||||
|
|
||||||
|
{ Releases the native resampler and then runs the inherited destructor.
  The handle is checked first so that destroying an object whose handle was
  never created does not pass nil to the C library. }
destructor TSherpaOnnxLinearResampler.Destroy;
begin
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyLinearResampler(Self.Handle);
    Self.Handle := nil;
  end;

  inherited Destroy;
end;
|
||||||
|
|
||||||
|
{ Resamples N samples starting at Samples and returns a copy of the output.
  Flush is passed to the C API as 0 or 1.

  If the C API returns nil, or an output without samples, the result is an
  empty array. The native output buffer is always released, even when
  copying the samples raises an exception. }
function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat;
  N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  P: PSherpaOnnxResampleOut;
  I: Integer;
begin
  Result := Default(TSherpaOnnxSamplesArray);

  P := SherpaOnnxLinearResamplerResample(Self.Handle, Samples, N, Ord(Flush));

  { Nothing to copy and nothing to free }
  if P = nil then
    Exit;

  try
    if (P^.Samples <> nil) and (P^.N > 0) then
    begin
      SetLength(Result, P^.N);

      for I := Low(Result) to High(Result) do
        Result[I] := P^.Samples[I];
    end;
  finally
    SherpaOnnxLinearResamplerResampleFree(P);
  end;
end;
|
||||||
|
|
||||||
|
{ Convenience overload: resamples all elements of an open array by
  delegating to the pointer-based overload. }
function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray;
var
  Count: Integer;
begin
  Count := Length(Samples);
  Result := Self.Resample(pcfloat(Samples), Count, Flush);
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxLinearResamplerReset for the wrapped handle. }
procedure TSherpaOnnxLinearResampler.Reset;
begin
  SherpaOnnxLinearResamplerReset(Handle);
end;
|
||||||
|
|
||||||
|
end.
|
||||||
|
|||||||
Reference in New Issue
Block a user