Add Pascal API for MatchaTTS models. (#1686)

2025-01-06 10:04:35 +08:00
parent 46330b25cc
commit c6fcd32552
12 changed files with 875 additions and 3 deletions
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -152,6 +152,19 @@ jobs:
          ./run-piper.sh
          rm -rf vits-piper-*
          rm piper
          ls -lh
          echo "---"
          ./run-matcha-zh.sh
          rm -rf matcha-icefall-*
          rm matcha-zh
          ls -lh
          echo "---"
          ./run-matcha-en.sh
          rm -rf matcha-icefall-*
          rm matcha-en
          ls -lh
          echo "---"
--- a/pascal-api-examples/tts/.gitignore
+++ b/pascal-api-examples/tts/.gitignore
@@ -2,3 +2,7 @@
 piper
 piper-playback
 link*.res
 matcha-zh
 matcha-en
 matcha-zh-playback
 matcha-en-playback
--- a/pascal-api-examples/tts/matcha-en-playback.pas
+++ b/pascal-api-examples/tts/matcha-en-playback.pas
@@ -0,0 +1,239 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 program matcha_en_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
 Note that it plays the audio back as it is still generating.
 }
 {$mode objfpc}
 uses
  {$ifdef unix}
  cthreads,
  {$endif}
  SysUtils,
  dos,
  ctypes,
  portaudio,
  sherpa_onnx;
 { Program-wide state. Several of these variables are shared between the main
   thread and the two callbacks (GenerateCallback and PlayCallback). }
 var
  { Serializes access to Buffer between GenerateCallback and PlayCallback. }
  CriticalSection: TRTLCriticalSection;
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  { Stays nil unless the model sample rate differs from DeviceSampleRate. }
  Resampler: TSherpaOnnxLinearResampler;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
  { Samples waiting to be played; filled by GenerateCallback and drained by
    PlayCallback. }
  Buffer: TSherpaOnnxCircularBuffer;
  { Set by the main program once Tts.Generate has returned. }
  FinishedGeneration: Boolean = False;
  { Set by PlayCallback once the buffer is empty and generation has finished. }
  FinishedPlaying: Boolean = False;
  Version: String;
  EnvStr: String;
  Status: Integer;
  NumDevices: Integer;
  DeviceIndex: Integer;
  DeviceInfo: PPaDeviceInfo;
  { If you get EDivByZero: Division by zero error, please change the sample rate
    to one supported by your audio output device.
  }
  DeviceSampleRate: Integer = 48000;
  I: Integer;
  Param: TPaStreamParameters;
  Stream: PPaStream;
  { NOTE(review): not referenced anywhere in the visible part of this program. }
  Wave: TSherpaOnnxWave;
 { Callback handed to Tts.Generate. Each chunk of generated samples is appended
   to the shared playback Buffer, going through Resampler first when one has
   been created.

   Samples - pointer to N generated samples.
   N       - number of samples pointed to by Samples.
   Arg     - user pointer supplied to Tts.Generate; not used here.

   Returns 1 so that generation continues. }
 function GenerateCallback(
      Samples: pcfloat; N: cint32;
      Arg: Pointer): cint; cdecl;
 begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;
  { Buffer is also read by PlayCallback, hence the lock. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      { NOTE(review): the third argument is presumably the "flush" flag of the
        resampler; confirm against TSherpaOnnxLinearResampler. }
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { Stream callback registered with Pa_OpenStream. It copies up to frameCount
   samples from the shared Buffer into output. When fewer samples are
   available, the remainder of the block comes from the elements that
   SetLength appends to the local array.

   Returns paContinue while samples remain or generation is still running;
   otherwise sets FinishedPlaying and returns paComplete. }
 function PlayCallback(
      input: Pointer; output: Pointer;
      frameCount: culong;
      timeInfo: PPaStreamCallbackTimeInfo;
      statusFlags: TPaStreamCallbackFlags;
      userData: Pointer ): cint; cdecl;
 var
  Chunk: TSherpaOnnxSamplesArray;
  Available: Integer;
  K: Integer;
 begin
  EnterCriticalSection(CriticalSection);
  try
    { Take at most frameCount samples out of the buffer. }
    Available := Buffer.Size;
    if Available >= frameCount then
      Available := frameCount;
    if Available > 0 then
      begin
        Chunk := Buffer.Get(Buffer.Head, Available);
        Buffer.Pop(Available);
      end;
    { Grow the chunk to a full block when the buffer could not supply one. }
    SetLength(Chunk, frameCount);
    for K := 0 to frameCount - 1 do
      pcfloat(output)[K] := Chunk[K];
    if (Buffer.Size <= 0) and FinishedGeneration then
      begin
        FinishedPlaying := True;
        Result := paComplete;
      end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { Creates the offline TTS engine for the English MatchaTTS model
   (matcha-icefall-en_US-ljspeech) together with the hifigan_v2 vocoder.
   All paths are relative to the current working directory.
   The caller owns the returned object. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.NumThreads := 1;
  TtsConfig.Model.Debug := False;
  { Matcha model files }
  TtsConfig.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
  TtsConfig.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
  TtsConfig.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  TtsConfig.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 { Main program:
     1. create the TTS engine (and a resampler if the model sample rate differs
        from DeviceSampleRate),
     2. initialize PortAudio and open an output stream driven by PlayCallback,
     3. generate speech; GenerateCallback feeds the playback buffer as chunks
        are produced,
     4. save the complete audio to ./matcha-en-playback.wav,
     5. wait until playback has finished, then tear everything down. }
 begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
    begin
      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);
  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
    begin
      WriteLn('No default output device found');
      Pa_Terminate;
      Exit;
    end;
  { The default output device can be overridden through the environment. }
  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
    begin
      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
    end;
  { List all devices; the selected one is marked with '*'. }
  for I := 0 to (NumDevices - 1) do
    begin
      DeviceInfo := Pa_GetDeviceInfo(I);
      if I = DeviceIndex then
        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
      else
        WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
    end;
  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);
  if Status <> paNoError then
    begin
      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      Exit;
    end;
  { The lock must exist before the stream starts, because PlayCallback takes it. }
  InitCriticalSection(CriticalSection);
  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
    begin
      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      DoneCriticalSection(CriticalSection);
      Exit;
    end;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
  Audio :=  Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./matcha-en-playback.wav');
  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
    {TODO(fangjun): Use an event to indicate the play is finished}
  { Close the stream first: PlayCallback sets FinishedPlaying while it still
    holds the lock, so the lock may only be destroyed once the callback can
    no longer be running. }
  Status := Pa_CloseStream(Stream);
  DoneCriticalSection(CriticalSection);
  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);
  if Status <> paNoError then
    begin
      WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
      Exit;
    end;
  Status := Pa_Terminate;
  if Status <> paNoError then
    begin
      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
 end.
--- a/pascal-api-examples/tts/matcha-en.pas
+++ b/pascal-api-examples/tts/matcha-en.pas
@@ -0,0 +1,55 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 program matcha_en;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
 If you want to play it while it is generating, please see
 ./matcha-en-playback.pas
 }
 {$mode objfpc}
 uses
  SysUtils,
  sherpa_onnx;
 { Creates the offline TTS engine for the English MatchaTTS model
   (matcha-icefall-en_US-ljspeech) together with the hifigan_v2 vocoder.
   All paths are relative to the current working directory.
   The caller owns the returned object. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.NumThreads := 1;
  TtsConfig.Model.Debug := False;
  { Matcha model files }
  TtsConfig.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt';
  TtsConfig.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data';
  TtsConfig.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  TtsConfig.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx';
  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
 { Main program: generate speech for a fixed English sentence and save it to
   ./matcha-en.wav. }
 begin
  Tts := GetOfflineTts;
  { try/finally makes sure the engine is released even if generation or
    writing the wave file raises an exception. }
  try
    WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
    Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.';
    Audio :=  Tts.Generate(Text, SpeakerId, Speed);
    SherpaOnnxWriteWave('./matcha-en.wav', Audio.Samples, Audio.SampleRate);
    WriteLn('Saved to ./matcha-en.wav');
  finally
    FreeAndNil(Tts);
  end;
 end.
--- a/pascal-api-examples/tts/matcha-zh-playback.pas
+++ b/pascal-api-examples/tts/matcha-zh-playback.pas
@@ -0,0 +1,241 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 program matcha_zh_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
 Note that it plays the audio back as it is still generating.
 }
 {$mode objfpc}
 uses
  {$ifdef unix}
  cthreads,
  {$endif}
  SysUtils,
  dos,
  ctypes,
  portaudio,
  sherpa_onnx;
 { Program-wide state. Several of these variables are shared between the main
   thread and the two callbacks (GenerateCallback and PlayCallback). }
 var
  { Serializes access to Buffer between GenerateCallback and PlayCallback. }
  CriticalSection: TRTLCriticalSection;
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  { Stays nil unless the model sample rate differs from DeviceSampleRate. }
  Resampler: TSherpaOnnxLinearResampler;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
  { Samples waiting to be played; filled by GenerateCallback and drained by
    PlayCallback. }
  Buffer: TSherpaOnnxCircularBuffer;
  { Set by the main program once Tts.Generate has returned. }
  FinishedGeneration: Boolean = False;
  { Set by PlayCallback once the buffer is empty and generation has finished. }
  FinishedPlaying: Boolean = False;
  Version: String;
  EnvStr: String;
  Status: Integer;
  NumDevices: Integer;
  DeviceIndex: Integer;
  DeviceInfo: PPaDeviceInfo;
  { If you get EDivByZero: Division by zero error, please change the sample rate
    to one supported by your audio output device.
  }
  DeviceSampleRate: Integer = 48000;
  I: Integer;
  Param: TPaStreamParameters;
  Stream: PPaStream;
  { NOTE(review): not referenced anywhere in the visible part of this program. }
  Wave: TSherpaOnnxWave;
 { Callback handed to Tts.Generate. Each chunk of generated samples is appended
   to the shared playback Buffer, going through Resampler first when one has
   been created.

   Samples - pointer to N generated samples.
   N       - number of samples pointed to by Samples.
   Arg     - user pointer supplied to Tts.Generate; not used here.

   Returns 1 so that generation continues. }
 function GenerateCallback(
      Samples: pcfloat; N: cint32;
      Arg: Pointer): cint; cdecl;
 begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;
  { Buffer is also read by PlayCallback, hence the lock. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      { NOTE(review): the third argument is presumably the "flush" flag of the
        resampler; confirm against TSherpaOnnxLinearResampler. }
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { Stream callback registered with Pa_OpenStream. It copies up to frameCount
   samples from the shared Buffer into output. When fewer samples are
   available, the remainder of the block comes from the elements that
   SetLength appends to the local array.

   Returns paContinue while samples remain or generation is still running;
   otherwise sets FinishedPlaying and returns paComplete. }
 function PlayCallback(
      input: Pointer; output: Pointer;
      frameCount: culong;
      timeInfo: PPaStreamCallbackTimeInfo;
      statusFlags: TPaStreamCallbackFlags;
      userData: Pointer ): cint; cdecl;
 var
  Chunk: TSherpaOnnxSamplesArray;
  Available: Integer;
  K: Integer;
 begin
  EnterCriticalSection(CriticalSection);
  try
    { Take at most frameCount samples out of the buffer. }
    Available := Buffer.Size;
    if Available >= frameCount then
      Available := frameCount;
    if Available > 0 then
      begin
        Chunk := Buffer.Get(Buffer.Head, Available);
        Buffer.Pop(Available);
      end;
    { Grow the chunk to a full block when the buffer could not supply one. }
    SetLength(Chunk, frameCount);
    for K := 0 to frameCount - 1 do
      pcfloat(output)[K] := Chunk[K];
    if (Buffer.Size <= 0) and FinishedGeneration then
      begin
        FinishedPlaying := True;
        Result := paComplete;
      end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { Creates the offline TTS engine for the Chinese MatchaTTS model
   (matcha-icefall-zh-baker) together with the hifigan_v2 vocoder and the
   phone/date/number rule FSTs.
   All paths are relative to the current working directory.
   The caller owns the returned object. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
  TtsConfig.Model.NumThreads := 1;
  TtsConfig.Model.Debug := False;
  { Matcha model files }
  TtsConfig.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt';
  TtsConfig.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
  TtsConfig.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict';
  TtsConfig.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  TtsConfig.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 { Main program:
     1. create the TTS engine (and a resampler if the model sample rate differs
        from DeviceSampleRate),
     2. initialize PortAudio and open an output stream driven by PlayCallback,
     3. generate speech; GenerateCallback feeds the playback buffer as chunks
        are produced,
     4. save the complete audio to ./matcha-zh-playback.wav,
     5. wait until playback has finished, then tear everything down. }
 begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
    begin
      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);
  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
    begin
      WriteLn('No default output device found');
      Pa_Terminate;
      Exit;
    end;
  { The default output device can be overridden through the environment. }
  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
    begin
      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
    end;
  { List all devices; the selected one is marked with '*'. }
  for I := 0 to (NumDevices - 1) do
    begin
      DeviceInfo := Pa_GetDeviceInfo(I);
      if I = DeviceIndex then
        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
      else
        WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
    end;
  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);
  if Status <> paNoError then
    begin
      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      Exit;
    end;
  { The lock must exist before the stream starts, because PlayCallback takes it. }
  InitCriticalSection(CriticalSection);
  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
    begin
      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      DoneCriticalSection(CriticalSection);
      Exit;
    end;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  Text := '某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。';
  Audio :=  Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./matcha-zh-playback.wav');
  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
    {TODO(fangjun): Use an event to indicate the play is finished}
  { Close the stream first: PlayCallback sets FinishedPlaying while it still
    holds the lock, so the lock may only be destroyed once the callback can
    no longer be running. }
  Status := Pa_CloseStream(Stream);
  DoneCriticalSection(CriticalSection);
  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);
  if Status <> paNoError then
    begin
      WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
      Exit;
    end;
  Status := Pa_Terminate;
  if Status <> paNoError then
    begin
      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
 end.
--- a/pascal-api-examples/tts/matcha-zh.pas
+++ b/pascal-api-examples/tts/matcha-zh.pas
@@ -0,0 +1,57 @@
 { Copyright (c)  2025  Xiaomi Corporation }
 program matcha_zh;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with MatchaTTS models.
 It generates speech from text and saves it to a wave file.
 If you want to play it while it is generating, please see
 ./matcha-zh-playback.pas
 }
 {$mode objfpc}
 uses
  SysUtils,
  sherpa_onnx;
 { Creates the offline TTS engine for the Chinese MatchaTTS model
   (matcha-icefall-zh-baker) together with the hifigan_v2 vocoder and the
   phone/date/number rule FSTs.
   All paths are relative to the current working directory.
   The caller owns the returned object. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';
  TtsConfig.Model.NumThreads := 1;
  TtsConfig.Model.Debug := False;
  { Matcha model files }
  TtsConfig.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt';
  TtsConfig.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
  TtsConfig.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict';
  TtsConfig.Model.Matcha.Vocoder := './hifigan_v2.onnx';
  TtsConfig.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';
  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
 { Main program: generate speech for a fixed Chinese sentence and save it to
   ./matcha-zh.wav. }
 begin
  Tts := GetOfflineTts;
  { try/finally makes sure the engine is released even if generation or
    writing the wave file raises an exception. }
  try
    WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
    Text := '某某银行的副行长和一些行政领导表示，他们去过长江和长白山; 经济不断增长。2024年12月31号，拨打110或者18920240511。123456块钱。';
    Audio :=  Tts.Generate(Text, SpeakerId, Speed);
    SherpaOnnxWriteWave('./matcha-zh.wav', Audio.Samples, Audio.SampleRate);
    WriteLn('Saved to ./matcha-zh.wav');
  finally
    FreeAndNil(Tts);
  end;
 end.
--- a/pascal-api-examples/tts/piper-playback.pas
+++ b/pascal-api-examples/tts/piper-playback.pas
@@ -1,5 +1,5 @@
 { Copyright (c)  2024  Xiaomi Corporation }
-program piper;
+program piper_playback;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with Piper models.
--- a/pascal-api-examples/tts/run-matcha-en-playback.sh
+++ b/pascal-api-examples/tts/run-matcha-en-playback.sh
@@ -0,0 +1,53 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx shared library if it is missing, downloads the English
 # MatchaTTS model and the vocoder, then compiles and runs matcha-en-playback.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 # Every relative path below is relative to this script's directory, so make
 # the script work no matter where it is invoked from.
 cd "$SCRIPT_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 # please visit
 # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
 # to download more models
 if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
 fi
 if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
 fi
 # Please see ../portaudio-test/README.md
 # for how to install portaudio on macOS
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  ./matcha-en-playback.pas
 # Only append the previous value when it is non-empty; a trailing ':' would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./matcha-en-playback
--- a/pascal-api-examples/tts/run-matcha-en.sh
+++ b/pascal-api-examples/tts/run-matcha-en.sh
@@ -0,0 +1,49 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx shared library if it is missing, downloads the English
 # MatchaTTS model and the vocoder, then compiles and runs matcha-en.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 # Every relative path below is relative to this script's directory, so make
 # the script work no matter where it is invoked from.
 cd "$SCRIPT_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 # please visit
 # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
 # to download more models
 if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
 fi
 if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
 fi
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./matcha-en.pas
 # Only append the previous value when it is non-empty; a trailing ':' would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./matcha-en
--- a/pascal-api-examples/tts/run-matcha-zh-playback.sh
+++ b/pascal-api-examples/tts/run-matcha-zh-playback.sh
@@ -0,0 +1,52 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx shared library if it is missing, downloads the Chinese
 # MatchaTTS model and the vocoder, then compiles and runs matcha-zh-playback.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 # Every relative path below is relative to this script's directory, so make
 # the script work no matter where it is invoked from.
 cd "$SCRIPT_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 # please visit
 # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
 # to download more models
 if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
 fi
 if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
 fi
 # Please see ../portaudio-test/README.md
 # for how to install portaudio on macOS
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  ./matcha-zh-playback.pas
 # Only append the previous value when it is non-empty; a trailing ':' would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./matcha-zh-playback
--- a/pascal-api-examples/tts/run-matcha-zh.sh
+++ b/pascal-api-examples/tts/run-matcha-zh.sh
@@ -0,0 +1,48 @@
 #!/usr/bin/env bash
 # Builds the sherpa-onnx shared library if it is missing, downloads the Chinese
 # MatchaTTS model and the vocoder, then compiles and runs matcha-zh.pas.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 # Every relative path below is relative to this script's directory, so make
 # the script work no matter where it is invoked from.
 cd "$SCRIPT_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 # please visit
 # https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
 # to download more models
 if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
 fi
 if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
 fi
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
  ./matcha-zh.pas
 # Only append the previous value when it is non-empty; a trailing ':' would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./matcha-zh
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -62,11 +62,26 @@ type
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
  end;
  { Pascal-side configuration of a MatchaTTS model. The string fields are
    passed to the C API as PAnsiChar when the TTS object is created.
    The Initialize operator sets NoiseScale to 0.667 and LengthScale to 1.0. }
  TSherpaOnnxOfflineTtsMatchaModelConfig = record
    AcousticModel: AnsiString;
    Vocoder: AnsiString;
    Lexicon: AnsiString;
    Tokens: AnsiString;
    DataDir: AnsiString;
    NoiseScale: Single;
    LengthScale: Single;
    DictDir: AnsiString;
    { Returns a printable representation of all fields. }
    function ToString: AnsiString;
    { Management operator that fills in the default NoiseScale/LengthScale. }
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
  end;
  TSherpaOnnxOfflineTtsModelConfig = record
    Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
    NumThreads: Integer;
    Debug: Boolean;
    Provider: AnsiString;
    Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig;
    function ToString: AnsiString;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
@@ -713,11 +728,23 @@ type
    DictDir: PAnsiChar;
  end;
  { C-compatible counterpart of TSherpaOnnxOfflineTtsMatchaModelConfig, using
    PAnsiChar/cfloat fields.
    NOTE(review): the field order presumably has to match the corresponding
    struct of the sherpa-onnx C API exactly - verify against c-api.h before
    reordering or inserting fields. }
  SherpaOnnxOfflineTtsMatchaModelConfig = record
    AcousticModel: PAnsiChar;
    Vocoder: PAnsiChar;
    Lexicon: PAnsiChar;
    Tokens: PAnsiChar;
    DataDir: PAnsiChar;
    NoiseScale: cfloat;
    LengthScale: cfloat;
    DictDir: PAnsiChar;
  end;
  { C-compatible TTS model configuration holding the VITS settings, the
    general options and the Matcha settings.
    NOTE(review): the field order presumably has to match the corresponding
    struct of the sherpa-onnx C API exactly - verify against c-api.h before
    reordering or inserting fields. }
  SherpaOnnxOfflineTtsModelConfig = record
    Vits: SherpaOnnxOfflineTtsVitsModelConfig;
    NumThreads: cint32;
    { Boolean flag carried as an integer; the conversion code assigns
      Ord(Config.Model.Debug) to it. }
    Debug: cint32;
    Provider: PAnsiChar;
    Matcha: SherpaOnnxOfflineTtsMatchaModelConfig;
  end;
  SherpaOnnxOfflineTtsConfig = record
@@ -1853,15 +1880,40 @@ begin
  Dest.LengthScale := 1.0;
 end;
 { Returns a printable, single-line representation of the Matcha model
   configuration. The two scale values are printed with two decimals. }
 function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString;
 const
  Template =
    'TSherpaOnnxOfflineTtsMatchaModelConfig(' +
    'AcousticModel := %s, ' +
    'Vocoder := %s, ' +
    'Lexicon := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'NoiseScale := %.2f, ' +
    'LengthScale := %.2f, ' +
    'DictDir := %s' +
    ')';
 begin
  { The argument order must follow the placeholders in Template. }
  Result := Format(Template,
    [Self.AcousticModel,
     Self.Vocoder,
     Self.Lexicon,
     Self.Tokens,
     Self.DataDir,
     Self.NoiseScale,
     Self.LengthScale,
     Self.DictDir]);
 end;
 { Management operator: gives a freshly created Matcha configuration its
   default scale values. }
 class operator TSherpaOnnxOfflineTtsMatchaModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig);
 begin
  Dest.LengthScale := 1.0;
  Dest.NoiseScale := 0.667;
 end;
 { Returns a printable representation of the model configuration: the nested
   Vits configuration, NumThreads, Debug, Provider and the nested Matcha
   configuration (the Matcha part is what this change adds). }
 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
 begin
  Result := Format('TSherpaOnnxOfflineTtsModelConfig(' +
    'Vits := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
-    'Provider := %s' +
+    'Provider := %s, ' +
    'Matcha := %s' +
    ')',
-    [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider
+    [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider,
     Self.Matcha.ToString
    ]);
 end;
@@ -1905,6 +1957,15 @@ begin
  C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;
  C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);
  C.Model.Matcha.AcousticModel := PAnsiChar(Config.Model.Matcha.AcousticModel);
  C.Model.Matcha.Vocoder := PAnsiChar(Config.Model.Matcha.Vocoder);
  C.Model.Matcha.Lexicon := PAnsiChar(Config.Model.Matcha.Lexicon);
  C.Model.Matcha.Tokens := PAnsiChar(Config.Model.Matcha.Tokens);
  C.Model.Matcha.DataDir := PAnsiChar(Config.Model.Matcha.DataDir);
  C.Model.Matcha.NoiseScale := Config.Model.Matcha.NoiseScale;
  C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale;
  C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir);
  C.Model.NumThreads := Config.Model.NumThreads;
  C.Model.Provider := PAnsiChar(Config.Model.Provider);
  C.Model.Debug := Ord(Config.Model.Debug);