enginex-mr_series-sherpa-onnx/pascal-api-examples/tts/piper-playback.pas

{ Copyright (c)  2024  Xiaomi Corporation }
program piper_playback;
{
This file shows how to use the text to speech API of sherpa-onnx
with Piper models.

It generates speech from text and saves it to a wave file.

Note that it plays the audio back while it is still being generated.
}

{$mode objfpc}

uses
  {$ifdef unix}
  cthreads,
  {$endif}
  SysUtils,
  dos,
  ctypes,
  portaudio,
  sherpa_onnx;

var
  CriticalSection: TRTLCriticalSection;

  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  Resampler: TSherpaOnnxLinearResampler;

  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
  Buffer: TSherpaOnnxCircularBuffer;
  FinishedGeneration: Boolean = False;
  FinishedPlaying: Boolean = False;

  Version: String;
  EnvStr: String;
  Status: Integer;
  NumDevices: Integer;
  DeviceIndex: Integer;
  DeviceInfo: PPaDeviceInfo;

  { If you get EDivByZero: Division by zero error, please change the sample rate
    to one supported by your output (playback) device.
  }
  DeviceSampleRate: Integer = 48000;
  I: Integer;
  Param: TPaStreamParameters;
  Stream: PPaStream;
  Wave: TSherpaOnnxWave;

{ Callback handed to Tts.Generate in the main program.

  Samples/N describe the samples delivered by the generator. They are appended
  to the global circular Buffer while CriticalSection is held; when a global
  Resampler exists the samples are resampled first. Arg is not used.

  The return value is always 1 (1 = continue generating, 0 = stop). }
function GenerateCallback(
      Samples: pcfloat; N: cint32;
      Arg: Pointer): cint; cdecl;
begin
  { This callback never asks the generator to stop. }
  Result := 1;

  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;

{ Stream callback registered with Pa_OpenStream in the main program.

  Copies frameCount samples into output. The samples are taken from the global
  circular Buffer; when the buffer holds fewer than frameCount samples, the
  remainder of the chunk comes from SetLength growing the array.

  Returns paContinue while the buffer still holds samples or generation has not
  finished. Otherwise it sets FinishedPlaying and returns paComplete.
  input, timeInfo, statusFlags and userData are not used. }
function PlayCallback(
      input: Pointer; output: Pointer;
      frameCount: culong;
      timeInfo: PPaStreamCallbackTimeInfo;
      statusFlags: TPaStreamCallbackFlags;
      userData: Pointer ): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Dest: pcfloat;
  Available: Integer;
  K: Integer;
begin
  Dest := pcfloat(output);

  EnterCriticalSection(CriticalSection);
  try
    { The lock is held, so the size cannot change underneath us. }
    Available := Buffer.Size;

    if Available >= frameCount then
      begin
        Chunk := Buffer.Get(Buffer.Head, frameCount);
        Buffer.Pop(frameCount);
      end
    else
      begin
        { Take whatever is there (possibly nothing) ... }
        if Available > 0 then
          begin
            Chunk := Buffer.Get(Buffer.Head, Available);
            Buffer.Pop(Available);
          end;
        { ... and grow the chunk to a full frameCount samples. }
        SetLength(Chunk, frameCount);
      end;

    for K := 0 to frameCount - 1 do
      Dest[K] := Chunk[K];

    if (Buffer.Size <= 0) and FinishedGeneration then
      begin
        { Nothing left to play and nothing more will arrive. }
        FinishedPlaying := True;
        Result := paComplete;
      end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;

{ Creates the offline TTS object for the Piper VITS model stored in the
  ./vits-piper-en_US-libritts_r-medium directory (model, tokens and
  espeak-ng data), using one thread, debug output off and MaxNumSentences = 1.

  The caller owns the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
const
  ModelDir = './vits-piper-en_US-libritts_r-medium';
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { General settings. }
  Config.MaxNumSentences := 1;
  Config.Model.NumThreads := 1;
  Config.Model.Debug := False;

  { Files that make up the Piper model. }
  Config.Model.Vits.Model := ModelDir + '/en_US-libritts_r-medium.onnx';
  Config.Model.Vits.Tokens := ModelDir + '/tokens.txt';
  Config.Model.Vits.DataDir := ModelDir + '/espeak-ng-data';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;

{ Main program.

  1. Creates the TTS object and, if its sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio, selects the output device (the default one, or the
     index given in the SHERPA_ONNX_MIC_DEVICE environment variable) and opens
     a mono float32 output stream driven by PlayCallback.
  3. Generates speech; GenerateCallback feeds the samples into Buffer while
     PlayCallback drains it.
  4. Saves the generated audio to ./libritts_r-generated.wav, waits until
     PlayCallback reports that playback finished, and releases everything. }
begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
    begin
      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;

  if DeviceIndex = paNoDevice then
    begin
      WriteLn('No default output device found');
      Pa_Terminate;
      Exit;
    end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
    begin
      { I serves as a scratch variable here; it is re-initialized by the loop
        below. Only accept an index that refers to an existing device,
        otherwise Pa_GetDeviceInfo would return nil and be dereferenced. }
      I := StrToIntDef(EnvStr, -1);
      if (I >= 0) and (I < NumDevices) then
        begin
          DeviceIndex := I;
          WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
        end
      else
        WriteLn('Ignore invalid device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
    end;

  for I := 0 to (NumDevices - 1) do
    begin
      DeviceInfo := Pa_GetDeviceInfo(I);
      if I = DeviceIndex then
        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
      else
        WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
    end;

  DeviceInfo := Pa_GetDeviceInfo(DeviceIndex);
  if DeviceInfo = nil then
    begin
      WriteLn('Failed to get device info for device ', DeviceIndex);
      Pa_Terminate;
      Exit;
    end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', DeviceInfo^.Name);
  WriteLn(' Max output channels ', DeviceInfo^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := DeviceInfo^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { The lock must exist before any callback can possibly run. }
  InitCriticalSection(CriticalSection);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);

  if Status <> paNoError then
    begin
      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
      DoneCriticalSection(CriticalSection);
      Pa_Terminate;
      Exit;
    end;

  Status := Pa_StartStream(stream);
  if Status <> paNoError then
    begin
      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
      { Do not leave the opened stream and the lock behind. }
      Pa_CloseStream(stream);
      DoneCriticalSection(CriticalSection);
      Pa_Terminate;
      Exit;
    end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';

  Audio :=  Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);

  { PlayCallback reads this flag while holding the lock, so write it under
    the same lock. }
  EnterCriticalSection(CriticalSection);
  try
    FinishedGeneration := True;
  finally
    LeaveCriticalSection(CriticalSection);
  end;

  SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./libritts_r-generated.wav');

  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
    {TODO(fangjun): Use an event to indicate the play is finished}

  { Shut PortAudio down first. PlayCallback sets FinishedPlaying while it still
    holds the lock, so the lock and the objects the callback uses may only be
    released once the stream is gone. }
  Status := Pa_CloseStream(stream);
  if Status <> paNoError then
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));

  Status := Pa_Terminate;
  if Status <> paNoError then
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));

  DoneCriticalSection(CriticalSection);

  FreeAndNil(Buffer);
  FreeAndNil(Resampler);
  FreeAndNil(Tts);
end.
Text to speech API for Object Pascal. (#1273) 2024-08-20 20:52:16 +08:00			`{ Copyright (c) 2024 Xiaomi Corporation }`
Add Pascal API for MatchaTTS models. (#1686) 2025-01-06 10:04:35 +08:00			`program piper_playback;`
Text to speech API for Object Pascal. (#1273) 2024-08-20 20:52:16 +08:00			`{`
			`This file shows how to use the text to speech API of sherpa-onnx`
			`with Piper models.`

			`It generates speech from text and saves it to a wave file.`

			`Note that it plays the audio back as it is still generating.`
			`}`

			`{$mode objfpc}`

			`uses`
			`{$ifdef unix}`
			`cthreads,`
			`{$endif}`
			`SysUtils,`
			`dos,`
			`ctypes,`
			`portaudio,`
			`sherpa_onnx;`

			`var`
			`CriticalSection: TRTLCriticalSection;`

			`Tts: TSherpaOnnxOfflineTts;`
			`Audio: TSherpaOnnxGeneratedAudio;`
			`Resampler: TSherpaOnnxLinearResampler;`

			`Text: AnsiString;`
			`Speed: Single = 1.0; {Use a larger value to speak faster}`
			`SpeakerId: Integer = 0;`
			`Buffer: TSherpaOnnxCircularBuffer;`
			`FinishedGeneration: Boolean = False;`
			`FinishedPlaying: Boolean = False;`

			`Version: String;`
			`EnvStr: String;`
			`Status: Integer;`
			`NumDevices: Integer;`
			`DeviceIndex: Integer;`
			`DeviceInfo: PPaDeviceInfo;`

			`{ If you get EDivByZero: Division by zero error, please change the sample rate`
			`to the one supported by your microphone.`
			`}`
			`DeviceSampleRate: Integer = 48000;`
			`I: Integer;`
			`Param: TPaStreamParameters;`
			`Stream: PPaStream;`
			`Wave: TSherpaOnnxWave;`

			`function GenerateCallback(`
			`Samples: pcfloat; N: cint32;`
			`Arg: Pointer): cint; cdecl;`
			`begin`
			`EnterCriticalSection(CriticalSection);`
			`try`
			`if Resampler <> nil then`
			`Buffer.Push(Resampler.Resample(Samples, N, False))`
			`else`
			`Buffer.Push(Samples, N);`
			`finally`
			`LeaveCriticalSection(CriticalSection);`
			`end;`

			`{ 1 means to continue generating; 0 means to stop generating. }`
			`Result := 1;`
			`end;`

			`function PlayCallback(`
			`input: Pointer; output: Pointer;`
			`frameCount: culong;`
			`timeInfo: PPaStreamCallbackTimeInfo;`
			`statusFlags: TPaStreamCallbackFlags;`
			`userData: Pointer ): cint; cdecl;`
			`var`
			`Samples: TSherpaOnnxSamplesArray;`
			`I: Integer;`
			`begin`
			`EnterCriticalSection(CriticalSection);`
			`try`
			`if Buffer.Size >= frameCount then`
			`begin`
			`Samples := Buffer.Get(Buffer.Head, FrameCount);`
			`Buffer.Pop(FrameCount);`
			`end`
			`else if Buffer.Size > 0 then`
			`begin`
			`Samples := Buffer.Get(Buffer.Head, Buffer.Size);`
			`Buffer.Pop(Buffer.Size);`
			`SetLength(Samples, frameCount);`
			`end`
			`else`
			`SetLength(Samples, frameCount);`

			`for I := 0 to frameCount - 1 do`
			`pcfloat(output)[I] := Samples[I];`

			`if (Buffer.Size > 0) or (not FinishedGeneration) then`
			`Result := paContinue`
			`else`
			`begin`
			`Result := paComplete;`
			`FinishedPlaying := True;`
			`end;`
			`finally`
			`LeaveCriticalSection(CriticalSection);`
			`end;`
			`end;`

			`function GetOfflineTts: TSherpaOnnxOfflineTts;`
			`var`
			`Config: TSherpaOnnxOfflineTtsConfig;`
			`begin`
			`Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';`
			`Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';`
			`Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';`
			`Config.Model.NumThreads := 1;`
			`Config.Model.Debug := False;`
			`Config.MaxNumSentences := 1;`

			`Result := TSherpaOnnxOfflineTts.Create(Config);`
			`end;`

			`begin`
			`Tts := GetOfflineTts;`
			`if Tts.GetSampleRate <> DeviceSampleRate then`
			`Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);`

			`Version := String(Pa_GetVersionText);`
			`WriteLn('Version is ', Version);`
			`Status := Pa_Initialize;`
			`if Status <> paNoError then`
			`begin`
			`WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));`
			`Exit;`
			`end;`

			`NumDevices := Pa_GetDeviceCount;`
			`WriteLn('Num devices: ', NumDevices);`

			`DeviceIndex := Pa_GetDefaultOutputDevice;`

			`if DeviceIndex = paNoDevice then`
			`begin`
			`WriteLn('No default output device found');`
			`Pa_Terminate;`
			`Exit;`
			`end;`

			`EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');`
			`if EnvStr <> '' then`
			`begin`
			`DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);`
			`WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);`
			`end;`

			`for I := 0 to (NumDevices - 1) do`
			`begin`
			`DeviceInfo := Pa_GetDeviceInfo(I);`
			`if I = DeviceIndex then`
			`{ WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }`
			`WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))`
			`else`
			`WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));`
			`end;`

			`WriteLn('Use device ', DeviceIndex);`
			`WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);`
			`WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);`

			`Initialize(Param);`
			`Param.Device := DeviceIndex;`
			`Param.ChannelCount := 1;`
			`Param.SampleFormat := paFloat32;`
			`param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;`
			`param.HostApiSpecificStreamInfo := nil;`

			`Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);`


			`{ Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }`
			`Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,`
			`PPaStreamCallback(@PlayCallback), nil);`

			`if Status <> paNoError then`
			`begin`
			`WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));`
			`Pa_Terminate;`
			`Exit;`
			`end;`

			`InitCriticalSection(CriticalSection);`

			`Status := Pa_StartStream(stream);`
			`if Status <> paNoError then`
			`begin`
			`WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));`
			`Pa_Terminate;`
			`Exit;`
			`end;`

			`WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');`

			`Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';`

			`Audio := Tts.Generate(Text, SpeakerId, Speed,`
			`PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);`
			`FinishedGeneration := True;`
			`SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);`
			`WriteLn('Saved to ./libritts_r-generated.wav');`

			`while not FinishedPlaying do`
			`Pa_Sleep(100); {sleep for 0.1 second }`
			`{TODO(fangjun): Use an event to indicate the play is finished}`

			`DoneCriticalSection(CriticalSection);`

			`FreeAndNil(Tts);`
			`FreeAndNil(Resampler);`

			`Status := Pa_CloseStream(stream);`
			`if Status <> paNoError then`
			`begin`
			`WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));`
			`Exit;`
			`end;`

			`Status := Pa_Terminate;`
			`if Status <> paNoError then`
			`begin`
			`WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));`
			`Exit;`
			`end;`
			`end.`