2024-08-20 20:52:16 +08:00
{ Copyright (c) 2024 Xiaomi Corporation }
2025-01-06 10:04:35 +08:00
program piper_playback;
2024-08-20 20:52:16 +08:00
{
This file shows how to use the text-to-speech API of sherpa-onnx
with Piper models.
It generates speech from text and saves it to a wave file.
Note that the audio is played back while it is still being generated.
}
{$mode objfpc}
uses
{$ifdef unix}
cthreads,
{$endif}
SysUtils,
dos,
ctypes,
portaudio,
sherpa_onnx;
{ Program-wide state shared between the main program, GenerateCallback
  and PlayCallback. }
var
{ Lock taken by GenerateCallback and PlayCallback around every access to Buffer. }
CriticalSection: TRTLCriticalSection;
{ Text-to-speech engine returned by GetOfflineTts. }
Tts: TSherpaOnnxOfflineTts;
{ Result of Tts.Generate; its samples are written to the wave file. }
Audio: TSherpaOnnxGeneratedAudio;
{ Created only when the TTS sample rate differs from DeviceSampleRate;
  GenerateCallback tests it against nil before use. }
Resampler: TSherpaOnnxLinearResampler;
{ Sentence(s) to synthesize. }
Text : AnsiString ;
Speed: Single = 1.0 ; {Use a larger value to speak faster}
{ Speaker id passed to Tts.Generate. }
SpeakerId: Integer = 0 ;
{ Sample queue: filled by GenerateCallback, drained by PlayCallback. }
Buffer: TSherpaOnnxCircularBuffer;
{ Set to True by the main program once Tts.Generate has returned. }
FinishedGeneration: Boolean = False ;
{ Set to True by PlayCallback when the queue is empty and generation is done. }
FinishedPlaying: Boolean = False ;
{ PortAudio version text, printed at start-up. }
Version: String ;
{ Value of the SHERPA_ONNX_MIC_DEVICE environment variable. }
EnvStr: String ;
{ Return code of the most recent PortAudio call. }
Status: Integer ;
{ Number of devices reported by Pa_GetDeviceCount. }
NumDevices: Integer ;
{ Index of the device used for playback. }
DeviceIndex: Integer ;
{ Scratch pointer used while listing the devices. }
DeviceInfo: PPaDeviceInfo;
{ If you get an "EDivByZero: Division by zero" error, please change the
  sample rate to one supported by your output device. The stream is opened
  for output only, at this rate.
}
DeviceSampleRate: Integer = 4 8 0 0 0 ;
{ Loop counter for the device listing. }
I: Integer ;
{ Output stream parameters handed to Pa_OpenStream. }
Param: TPaStreamParameters;
{ Handle of the PortAudio output stream. }
Stream: PPaStream;
{ NOTE(review): Wave is not referenced anywhere in this program. }
Wave: TSherpaOnnxWave;
{ Callback handed to Tts.Generate. Each time it is invoked it appends the
  N samples at Samples to the shared playback Buffer, resampling them first
  when a Resampler has been created.

  Arg is not used.

  Returns 1, which tells the generator to continue; 0 would ask it to stop. }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { Always ask the generator to keep going. }
  Result := 1;

  { Buffer is shared with PlayCallback, so it is only touched under the lock. }
  EnterCriticalSection(CriticalSection);
  try
    if not Assigned(Resampler) then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
{ PortAudio stream callback. Copies frameCount samples from the shared
  Buffer into output. When fewer than frameCount samples are queued, the
  block is extended to frameCount with SetLength before it is copied.

  input, timeInfo, statusFlags and userData are not used.

  Returns paContinue while samples remain queued or generation is still in
  progress; otherwise sets FinishedPlaying and returns paComplete. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  K: Integer;
begin
  Dst := pcfloat(output);

  { Buffer is shared with GenerateCallback, so it is only touched under the lock. }
  EnterCriticalSection(CriticalSection);
  try
    if Buffer.Size >= frameCount then
    begin
      { Enough audio is queued: take exactly one full block. }
      Chunk := Buffer.Get(Buffer.Head, frameCount);
      Buffer.Pop(frameCount);
    end
    else
    begin
      { Underrun: take whatever is queued (possibly nothing) and
        extend the block to the requested length. }
      if Buffer.Size > 0 then
      begin
        Chunk := Buffer.Get(Buffer.Head, Buffer.Size);
        Buffer.Pop(Buffer.Size);
      end;
      SetLength(Chunk, frameCount);
    end;

    for K := 0 to frameCount - 1 do
      Dst[K] := Chunk[K];

    { Playback is complete only when nothing is queued AND no more
      samples will arrive. }
    if (Buffer.Size <= 0) and FinishedGeneration then
    begin
      FinishedPlaying := True;
      Result := paComplete;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
{ Builds the offline TTS engine for the Piper libritts_r-medium VITS model.
  All model files are expected under ./vits-piper-en_US-libritts_r-medium.

  Returns a newly created TSherpaOnnxOfflineTts instance. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Cfg: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings. }
  Cfg.MaxNumSentences := 1;
  Cfg.Model.NumThreads := 1;
  Cfg.Model.Debug := False;

  { VITS model files. }
  Cfg.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';
  Cfg.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  Cfg.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(Cfg);
end;
{ Main program.

  1. Creates the TTS engine and, when its sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio, selects the output device (the default one, or
     the index given in the SHERPA_ONNX_MIC_DEVICE environment variable)
     and opens an output stream driven by PlayCallback.
  3. Generates speech; GenerateCallback queues samples for playback while
     generation is still running.
  4. Saves the complete audio to a wave file, waits for playback to finish
     and releases all resources. }
begin
  Tts := GetOfflineTts;

  { Resample only when the model rate differs from the playback rate. }
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);

  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    { A value that is not a number leaves the default device selected. }
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { Reject an index outside the device list before it is ever used to
    look up device information. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index ', DeviceIndex,
            ', expected a value in 0..', NumDevices - 1);
    Pa_Terminate;
    Exit;
  end;

  { List all devices and mark the selected one with an asterisk. The device
    name is converted to AnsiString before it is handed to Format. }
  for I := 0 to NumDevices - 1 do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  { Look the selected device up once and make sure the lookup succeeded. }
  DeviceInfo := Pa_GetDeviceInfo(DeviceIndex);
  if DeviceInfo = nil then
  begin
    WriteLn('Failed to get information for device ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', DeviceInfo^.Name);
  WriteLn(' Max output channels ', DeviceInfo^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := DeviceInfo^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Everything the callbacks touch is set up before the stream exists. }
  InitCriticalSection(CriticalSection);
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate,
    paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);
  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { Release the stream that was opened above. }
    Pa_CloseStream(Stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;

  SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./libritts_r-generated.wav');

  while not FinishedPlaying do
    Pa_Sleep(100); {sleep for 0.1 second }
  {TODO(fangjun): Use an event to indicate the play is finished}

  { Shut PortAudio down BEFORE destroying the lock and the buffer.
    PlayCallback sets FinishedPlaying while it still holds the critical
    section, so the callback thread may not have left it yet when the
    loop above ends. }
  Status := Pa_CloseStream(Stream);
  if Status <> paNoError then
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));

  { Terminate even when closing the stream failed. }
  Status := Pa_Terminate;
  if Status <> paNoError then
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));

  DoneCriticalSection(CriticalSection);
  FreeAndNil(Buffer);
  FreeAndNil(Resampler);
  FreeAndNil(Tts);
end.