Text to speech API for Object Pascal. (#1273)

2024-08-20 20:52:16 +08:00
parent e34a1a2aa3
commit 5a2aa110b8
14 changed files with 905 additions and 22 deletions
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -119,13 +119,29 @@ jobs:
            cp -v install/lib/*.dll ../pascal-api-examples/vad
            cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
-            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
+            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav
-            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr
-            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr
-            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
+            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad
-            cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
+            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr
            cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
          fi
      - name:  Run Pascal test (TTS)
        shell: bash
        run: |
          export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
          cd ./pascal-api-examples
          pushd tts
          ./run-piper.sh
          rm -rf vits-piper-*
          ls -lh
          echo "---"
          popd
      - name:  Run Pascal test (VAD + non-streaming ASR)
        shell: bash
        run: |
--- a/pascal-api-examples/.gitignore
+++ b/pascal-api-examples/.gitignore
@@ -0,0 +1 @@
 link*.res
--- a/pascal-api-examples/README.md
+++ b/pascal-api-examples/README.md
@@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
 |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
 |[vad](./vad)| It shows how to use the voice activity detection API.|
 |[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
 |[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.|
 |[tts](./tts)| It shows how to use the text-to-speech API.|
--- a/pascal-api-examples/tts/.gitignore
+++ b/pascal-api-examples/tts/.gitignore
@@ -0,0 +1,4 @@
 !run-*.sh
 piper
 piper-playback
 link*.res
--- a/pascal-api-examples/tts/README.md
+++ b/pascal-api-examples/tts/README.md
@@ -0,0 +1,9 @@
 # Introduction
 This directory contains examples for how to use the TTS (text to speech) APIs.
 |File| Description|
 |---------|------------|
 |[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.|
 |[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the generated audio while it is still being generated.|
--- a/pascal-api-examples/tts/piper-playback.pas
+++ b/pascal-api-examples/tts/piper-playback.pas
@@ -0,0 +1,238 @@
 { Copyright (c)  2024  Xiaomi Corporation }
 program piper;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with Piper models.
 It generates speech from text and saves it to a wave file.
 Note that it plays the audio back as it is still generating.
 }
 {$mode objfpc}
 uses
  {$ifdef unix}
  cthreads,
  {$endif}
  SysUtils,
  dos,
  ctypes,
  portaudio,
  sherpa_onnx;
 var
  CriticalSection: TRTLCriticalSection;
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  Resampler: TSherpaOnnxLinearResampler;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
  Buffer: TSherpaOnnxCircularBuffer;
  FinishedGeneration: Boolean = False;
  FinishedPlaying: Boolean = False;
  Version: String;
  EnvStr: String;
  Status: Integer;
  NumDevices: Integer;
  DeviceIndex: Integer;
  DeviceInfo: PPaDeviceInfo;
  { If you get EDivByZero: Division by zero error, please change the sample rate
    to one supported by your output (playback) device.
  }
  DeviceSampleRate: Integer = 48000;
  I: Integer;
  Param: TPaStreamParameters;
  Stream: PPaStream;
  Wave: TSherpaOnnxWave;
 { Callback handed to Tts.Generate. It appends the N samples pointed to by
   Samples to the shared playback Buffer, resampling them first when a
   Resampler has been created. Arg is not used.
   Always returns 1, which asks the generator to keep going. }
 function GenerateCallback(
      Samples: pcfloat; N: cint32;
      Arg: Pointer): cint; cdecl;
 begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  { Buffer is also accessed by PlayCallback, so guard it with the lock. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { PortAudio stream callback. Copies frameCount samples from the shared
   Buffer into output, padding with zeros when fewer samples are buffered.
   input, timeInfo, statusFlags and userData are not used.
   Returns paContinue while samples remain or generation is still running;
   otherwise sets FinishedPlaying and returns paComplete. }
 function PlayCallback(
      input: Pointer; output: Pointer;
      frameCount: culong;
      timeInfo: PPaStreamCallbackTimeInfo;
      statusFlags: TPaStreamCallbackFlags;
      userData: Pointer ): cint; cdecl;
 var
  Chunk: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  K: Integer;
 begin
  Dst := pcfloat(output);

  EnterCriticalSection(CriticalSection);
  try
    if (Buffer.Size < frameCount) and (Buffer.Size > 0) then
      begin
        { Partial data: take what is there; SetLength zero-fills the rest. }
        Chunk := Buffer.Get(Buffer.Head, Buffer.Size);
        Buffer.Pop(Buffer.Size);
        SetLength(Chunk, frameCount);
      end
    else if Buffer.Size >= frameCount then
      begin
        { Enough data for a full period. }
        Chunk := Buffer.Get(Buffer.Head, frameCount);
        Buffer.Pop(frameCount);
      end
    else
      { Nothing buffered: emit silence. }
      SetLength(Chunk, frameCount);

    for K := 0 to frameCount - 1 do
      Dst[K] := Chunk[K];

    if (Buffer.Size <= 0) and FinishedGeneration then
      begin
        FinishedPlaying := True;
        Result := paComplete;
      end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
 end;
 { Builds a TSherpaOnnxOfflineTts for the
   vits-piper-en_US-libritts_r-medium model located in the current directory. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.Debug := False;
  TtsConfig.Model.NumThreads := 1;

  { Files of the piper VITS model }
  TtsConfig.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  TtsConfig.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  TtsConfig.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 { Main program: creates the TTS engine (and a resampler when the model's
   sample rate differs from DeviceSampleRate), opens a mono float32 PortAudio
   output stream driven by PlayCallback, generates speech while
   GenerateCallback feeds the shared Buffer, writes the result to a wave file
   and waits for playback to finish before releasing everything. }
 begin
  Tts := GetOfflineTts;
  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);
  Status := Pa_Initialize;
  if Status <> paNoError then
    begin
      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);
  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
    begin
      WriteLn('No default output device found');
      Pa_Terminate;
      Exit;
    end;
  { The environment variable, when set to a valid integer, overrides the
    default output device. }
  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
    begin
      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
    end;
  for I := 0 to (NumDevices - 1) do
    begin
      DeviceInfo := Pa_GetDeviceInfo(I);
      if I = DeviceIndex then
        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
      else
        WriteLn(Format('   %d %s', [I, AnsiString(DeviceInfo^.Name)]));
    end;
  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
  param.HostApiSpecificStreamInfo := nil;
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);
  if Status <> paNoError then
    begin
      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      Exit;
    end;
  { The lock must exist before the stream starts invoking PlayCallback. }
  InitCriticalSection(CriticalSection);
  Status := Pa_StartStream(stream);
  if Status <> paNoError then
    begin
      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
      Pa_Terminate;
      Exit;
    end;
  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
  Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
  Audio :=  Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  { Lets PlayCallback finish once the buffer has drained. }
  FinishedGeneration := True;
  SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./libritts_r-generated.wav');
  while not FinishedPlaying do
    Pa_Sleep(100);  {sleep for 0.1 second }
    {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream BEFORE destroying the lock and the shared objects:
    PlayCallback sets FinishedPlaying while it still holds the critical
    section, so the audio thread may not have left it yet when the loop
    above exits. }
  Status := Pa_CloseStream(stream);
  DoneCriticalSection(CriticalSection);
  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);
  if Status <> paNoError then
    begin
      WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
      Exit;
    end;
  Status := Pa_Terminate;
  if Status <> paNoError then
    begin
      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
      Exit;
    end;
 end.
--- a/pascal-api-examples/tts/piper.pas
+++ b/pascal-api-examples/tts/piper.pas
@@ -0,0 +1,54 @@
 { Copyright (c)  2024  Xiaomi Corporation }
 program piper;
 {
 This file shows how to use the text to speech API of sherpa-onnx
 with Piper models.
 It generates speech from text and saves it to a wave file.
 If you want to play it while it is generating, please see
 ./piper-playback.pas
 }
 {$mode objfpc}
 uses
  SysUtils,
  sherpa_onnx;
 { Builds a TSherpaOnnxOfflineTts for the
   vits-piper-en_US-libritts_r-medium model located in the current directory. }
 function GetOfflineTts: TSherpaOnnxOfflineTts;
 var
  TtsConfig: TSherpaOnnxOfflineTtsConfig;
 begin
  { General settings }
  TtsConfig.MaxNumSentences := 1;
  TtsConfig.Model.Debug := False;
  TtsConfig.Model.NumThreads := 1;

  { Files of the piper VITS model }
  TtsConfig.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data';
  TtsConfig.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt';
  TtsConfig.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx';

  Result := TSherpaOnnxOfflineTts.Create(TtsConfig);
 end;
 var
  Tts: TSherpaOnnxOfflineTts;
  Audio: TSherpaOnnxGeneratedAudio;
  Text: AnsiString;
  Speed: Single = 1.0;  {Use a larger value to speak faster}
  SpeakerId: Integer = 0;
 { Main program: generates speech for a fixed sentence with speaker SpeakerId
   at speed Speed and writes it to ./libritts_r-generated.wav. }
 begin
  Tts := GetOfflineTts;
  try
    WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
    Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.';
    Audio :=  Tts.Generate(Text, SpeakerId, Speed);
    SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate);
    WriteLn('Saved to ./libritts_r-generated.wav');
  finally
    { Release the engine even if generation or writing raises. }
    FreeAndNil(Tts);
  end;
 end.
--- a/pascal-api-examples/tts/run-piper-playback.sh
+++ b/pascal-api-examples/tts/run-piper-playback.sh
@@ -0,0 +1,45 @@
 #!/usr/bin/env bash
 #
 # Builds the sherpa-onnx shared library if it is not installed yet, downloads
 # the vits-piper-en_US-libritts_r-medium model if missing, compiles
 # ./piper-playback.pas with fpc and runs it.
 #
 # Note: the relative paths below assume the script is run from its own
 # directory.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 # Quote every expansion so a checkout path containing spaces still works.
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
 fi
 # The portaudio library path below is a fixed Homebrew location; adjust it
 # if portaudio is installed elsewhere.
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$SHERPA_ONNX_DIR/build/install/lib" \
  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
  ./piper-playback.pas
 # Please see ../portaudio-test/README.md
 # for how to install portaudio on macOS
 # Append the previous value only when it is non-empty: a trailing ":" would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./piper-playback
--- a/pascal-api-examples/tts/run-piper.sh
+++ b/pascal-api-examples/tts/run-piper.sh
@@ -0,0 +1,41 @@
 #!/usr/bin/env bash
 #
 # Builds the sherpa-onnx shared library if it is not installed yet, downloads
 # the vits-piper-en_US-libritts_r-medium model if missing, compiles
 # ./piper.pas with fpc and runs it.
 #
 # Note: the relative paths below assume the script is run from its own
 # directory.
 set -ex
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 # Quote every expansion so a checkout path containing spaces still works.
 SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
 echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
 if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib  && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
  mkdir -p ../../build
  pushd ../../build
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..
  cmake --build . --target install --config Release
  popd
 fi
 if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2
  tar xf vits-piper-en_US-libritts_r-medium.tar.bz2
  rm vits-piper-en_US-libritts_r-medium.tar.bz2
 fi
 fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$SHERPA_ONNX_DIR/build/install/lib" \
  ./piper.pas
 # Append the previous value only when it is non-empty: a trailing ":" would
 # add the current directory to the library search path.
 export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
 ./piper
--- a/scripts/apk/generate-tts-apk-script.py
+++ b/scripts/apk/generate-tts-apk-script.py
@@ -190,9 +190,9 @@ def get_piper_models() -> List[TtsModel]:
        TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
        TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
        TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
        TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
        TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
        TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
--- a/scripts/flutter/generate-tts.py
+++ b/scripts/flutter/generate-tts.py
@@ -180,9 +180,9 @@ def get_piper_models() -> List[TtsModel]:
        TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"),
        TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"),
        TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls-medium"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"),
-        TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
+        #  TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"),
        TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"),
        TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"),
        TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"),
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -18,6 +18,7 @@
 #include "sherpa-onnx/csrc/offline-punctuation.h"
 #include "sherpa-onnx/csrc/offline-recognizer.h"
 #include "sherpa-onnx/csrc/online-recognizer.h"
 #include "sherpa-onnx/csrc/resample.h"
 #include "sherpa-onnx/csrc/speaker-embedding-extractor.h"
 #include "sherpa-onnx/csrc/speaker-embedding-manager.h"
 #include "sherpa-onnx/csrc/spoken-language-identification.h"
@@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct(
 }
 void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; }
 // Handle type behind the C API's SherpaOnnxLinearResampler pointer.
 // It owns the underlying sherpa_onnx::LinearResample instance.
 struct SherpaOnnxLinearResampler {
  std::unique_ptr<sherpa_onnx::LinearResample> impl;
 };
 // Allocates a resampler handle wrapping a sherpa_onnx::LinearResample that is
 // constructed from the four given arguments. The returned pointer must be
 // released with SherpaOnnxDestroyLinearResampler().
 SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
    int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
    int32_t num_zeros) {
  auto *resampler = new SherpaOnnxLinearResampler;
  resampler->impl.reset(new sherpa_onnx::LinearResample(
      samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros));
  return resampler;
 }
 // Deletes a handle created by SherpaOnnxCreateLinearResampler(); the owned
 // LinearResample is released by the handle's unique_ptr member.
 void SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) {
  delete p;
 }
 // Resamples input_dim samples from input and returns them in a newly
 // allocated SherpaOnnxResampleOut. `flush` is forwarded to
 // LinearResample::Resample(). The result must be released with
 // SherpaOnnxLinearResamplerResampleFree().
 const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush) {
  std::vector<float> resampled;
  p->impl->Resample(input, input_dim, flush, &resampled);

  // Copy into a plain heap array so ownership can cross the C boundary.
  float *buffer = new float[resampled.size()];
  std::copy(resampled.begin(), resampled.end(), buffer);

  auto *result = new SherpaOnnxResampleOut;
  result->n = static_cast<int32_t>(resampled.size());
  result->samples = buffer;
  return result;
 }
 // Releases a result returned by SherpaOnnxLinearResamplerResample(): both
 // the sample array and the struct itself. Passing a null pointer is a no-op,
 // so callers may free unconditionally.
 void SherpaOnnxLinearResamplerResampleFree(const SherpaOnnxResampleOut *p) {
  if (p == nullptr) {
    return;
  }
  delete[] p->samples;
  delete p;
 }
 // Returns the value reported by LinearResample::GetInputSamplingRate().
 int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
    const SherpaOnnxLinearResampler *p) {
  return p->impl->GetInputSamplingRate();
 }
 // Returns the value reported by LinearResample::GetOutputSamplingRate().
 int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
    const SherpaOnnxLinearResampler *p) {
  return p->impl->GetOutputSamplingRate();
 }
 // Forwards to LinearResample::Reset() on the wrapped resampler.
 void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) {
  p->impl->Reset();
 }
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct(
 SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text);
 // for resampling
 SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler
    SherpaOnnxLinearResampler;
 /*
      float min_freq = min(samp_rate_in_hz, samp_rate_out_hz);
      float lowpass_cutoff = 0.99 * 0.5 * min_freq;
      int32_t lowpass_filter_width = 6;
      You can set filter_cutoff_hz to lowpass_cutoff
      and set num_zeros to lowpass_filter_width
 */
 // The user has to invoke SherpaOnnxDestroyLinearResampler()
 // to free the returned pointer to avoid memory leak
 SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler(
    int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz,
    int32_t num_zeros);
 SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler(
    SherpaOnnxLinearResampler *p);
 SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset(
    SherpaOnnxLinearResampler *p);
 // Result of SherpaOnnxLinearResamplerResample().
 typedef struct SherpaOnnxResampleOut {
  const float *samples;  // array holding the resampled samples
  int32_t n;             // number of elements in `samples`
 } SherpaOnnxResampleOut;
 // The user has to invoke SherpaOnnxLinearResamplerResampleFree()
 // to free the returned pointer to avoid memory leak.
 //
 // If this is the last segment, you can set flush to 1; otherwise, please
 // set flush to 0
 SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample(
    SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim,
    int32_t flush);
 SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree(
    const SherpaOnnxResampleOut *p);
 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate(
    const SherpaOnnxLinearResampler *p);
 SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate(
    const SherpaOnnxLinearResampler *p);
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -1,4 +1,9 @@
-{ Copyright (c)  2024  Xiaomi Corporation }
+{ Copyright (c)  2024  Xiaomi Corporation
 Please see
 https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples
 for how to use APIs in this file.
 }
 unit sherpa_onnx;
@@ -7,13 +12,105 @@ unit sherpa_onnx;
  {$modeSwitch advancedRecords} { to support records with methods }
 {$ENDIF}
-(* {$LongStrings ON} *)
+{$LongStrings ON}
 interface
 uses
  ctypes;
 type
  TSherpaOnnxSamplesArray = array of Single;
  { Object wrapper around the native linear resampler.
    Create takes the input and output sample rates; Resample converts a
    chunk of samples and returns the converted samples as a new array. }
  TSherpaOnnxLinearResampler = class
  private
    Handle: Pointer;           { native resampler handle }
    InputSampleRate: Integer;  { value passed to Create as SampleRateIn }
    OutputSampleRate: Integer; { value passed to Create as SampleRateOut }
  public
    constructor Create(SampleRateIn: Integer; SampleRateOut: Integer);
    destructor Destroy; override;
    { Resamples N samples starting at Samples. }
    function Resample(Samples: pcfloat;
      N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload;
    { Convenience overload taking an array instead of pointer + count. }
    function Resample(Samples: array of Single;
      Flush: Boolean): TSherpaOnnxSamplesArray; overload;
    procedure Reset;
    property GetInputSampleRate: Integer Read InputSampleRate;
    property GetOutputSampleRate: Integer Read OutputSampleRate;
  end;
  { Signature of the callback accepted by TSherpaOnnxOfflineTts.Generate:
    it receives a pointer to N samples plus the user-supplied Arg.
    NOTE(review): the P-type is declared as a pointer to the procedural type,
    yet callers cast the callback's address to it directly - confirm that
    this is intended. }
  PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg;
  TSherpaOnnxGeneratedAudioCallbackWithArg = function(
      Samples: pcfloat; N: cint32;
      Arg: Pointer): cint; cdecl;
  { Pascal-side configuration of a VITS text-to-speech model.
    The Initialize operator sets NoiseScale = 0.667, NoiseScaleW = 0.8 and
    LengthScale = 1.0. }
  TSherpaOnnxOfflineTtsVitsModelConfig = record
    Model: AnsiString;
    Lexicon: AnsiString;
    Tokens: AnsiString;
    DataDir: AnsiString;
    NoiseScale: Single;
    NoiseScaleW: Single;
    LengthScale: Single;
    DictDir: AnsiString;
    { Human-readable dump of all fields. }
    function ToString: AnsiString;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
  end;
  { Pascal-side TTS model configuration.
    The Initialize operator sets NumThreads = 1, Debug = False and
    Provider = 'cpu'. }
  TSherpaOnnxOfflineTtsModelConfig = record
    Vits: TSherpaOnnxOfflineTtsVitsModelConfig;
    NumThreads: Integer;
    Debug: Boolean;
    Provider: AnsiString;
    { Human-readable dump of all fields. }
    function ToString: AnsiString;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
  end;
  { Top-level configuration passed to TSherpaOnnxOfflineTts.Create.
    The Initialize operator sets MaxNumSentences = 1. }
  TSherpaOnnxOfflineTtsConfig = record
    Model: TSherpaOnnxOfflineTtsModelConfig;
    RuleFsts: AnsiString;
    MaxNumSentences: Integer;
    RuleFars: AnsiString;
    { Human-readable dump of all fields. }
    function ToString: AnsiString;
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
  end;
  { Result of TSherpaOnnxOfflineTts.Generate: the generated samples copied
    into a Pascal array, plus their sample rate. }
  TSherpaOnnxGeneratedAudio = record
    Samples: array of Single;
    SampleRate: Integer;
  end;
  { Object wrapper around the native offline text-to-speech engine. }
  TSherpaOnnxOfflineTts = class
  private
   Handle: Pointer;       { native engine handle }
   SampleRate: Integer;   { queried from the native engine in Create }
   NumSpeakers: Integer;  { queried from the native engine in Create }
   _Config: TSherpaOnnxOfflineTtsConfig; { copy of the config given to Create }
  public
    constructor Create(Config: TSherpaOnnxOfflineTtsConfig);
    destructor Destroy; override;
    { Generates audio for Text with the given speaker id and speed. }
    function Generate(Text: AnsiString; SpeakerId: Integer;
      Speed: Single): TSherpaOnnxGeneratedAudio; overload;
    { Same as above, additionally passing Callback and Arg to the native
      generate call. }
    function Generate(Text: AnsiString; SpeakerId: Integer;
      Speed: Single;
      Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
      Arg: Pointer
      ): TSherpaOnnxGeneratedAudio; overload;
    property GetHandle: Pointer Read Handle;
    property GetSampleRate: Integer Read SampleRate;
    property GetNumSpeakers: Integer Read NumSpeakers;
  end;
  TSherpaOnnxWave = record
    Samples: array of Single; { normalized to the range [-1, 1] }
    SampleRate: Integer;
@@ -254,7 +351,6 @@ type
    class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
  end;
  TSherpaOnnxSamplesArray = array of Single;
  TSherpaOnnxCircularBuffer = class
  private
@@ -508,6 +604,94 @@ type
  PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
  { C-compatible counterpart of TSherpaOnnxOfflineTtsVitsModelConfig, using
    PAnsiChar/cfloat fields; filled in by TSherpaOnnxOfflineTts.Create.
    NOTE(review): field order presumably has to match the C struct, which is
    not visible here - confirm against c-api.h. }
  SherpaOnnxOfflineTtsVitsModelConfig = record
    Model: PAnsiChar;
    Lexicon: PAnsiChar;
    Tokens: PAnsiChar;
    DataDir: PAnsiChar;
    NoiseScale: cfloat;
    NoiseScaleW: cfloat;
    LengthScale: cfloat;
    DictDir: PAnsiChar;
  end;
  { C-compatible counterpart of TSherpaOnnxOfflineTtsModelConfig.
    Debug is an integer here; Create stores Ord() of the Boolean in it. }
  SherpaOnnxOfflineTtsModelConfig = record
    Vits: SherpaOnnxOfflineTtsVitsModelConfig;
    NumThreads: cint32;
    Debug: cint32;
    Provider: PAnsiChar;
  end;
  { C-compatible counterpart of TSherpaOnnxOfflineTtsConfig; a pointer to it
    is passed to SherpaOnnxCreateOfflineTts. }
  SherpaOnnxOfflineTtsConfig = record
    Model: SherpaOnnxOfflineTtsModelConfig;
    RuleFsts: PAnsiChar;
    MaxNumSentences: cint32;
    RuleFars: PAnsiChar;
  end;
  PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig;
  { Layout of the audio record returned by the native generate functions:
    a pointer to N samples and their sample rate. }
  SherpaOnnxGeneratedAudio = record
    Samples: pcfloat;
    N: cint32;
    SampleRate: cint32;
  end;
  PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio;
  { Layout of the record returned by SherpaOnnxLinearResamplerResample:
    a pointer to N resampled samples. }
  SherpaOnnxResampleOut = record
    Samples: pcfloat;
    N: cint32;
  end;
  PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut;
 function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32;
  SampleRateOutHz: cint32;
  FilterCutoffHz: cfloat;
  NumZeros: cint32): Pointer; cdecl;
  external SherpaOnnxLibName;
 procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl;
  external SherpaOnnxLibName;
 { Binding for the C function of the same name. The count and flush
   arguments are 32-bit C integers (int32_t in c-api.h), so they are declared
   as cint32 like the other bindings instead of the mode-dependent Integer. }
 function SherpaOnnxLinearResamplerResample(P: Pointer;
  Samples: pcfloat;
  N: cint32;
  Flush: cint32): PSherpaOnnxResampleOut; cdecl;
  external SherpaOnnxLibName;
 procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl;
  external SherpaOnnxLibName;
 procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl;
  external SherpaOnnxLibName;
 procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer); cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32; cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32; cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxOfflineTtsGenerate(Tts: Pointer;
  Text: PAnsiChar; Sid: cint32; Speed: cfloat): PSherpaOnnxGeneratedAudio; cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Tts: Pointer;
  Text: PAnsiChar; Sid: cint32; Speed: cfloat;
  Callback: PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer): PSherpaOnnxGeneratedAudio; cdecl;
  external SherpaOnnxLibName;
 procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer); cdecl;
  external SherpaOnnxLibName;
 function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
  BufferSizeInSeconds: cfloat): Pointer; cdecl;
  external SherpaOnnxLibName;
@@ -793,8 +977,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz
 var
  C: SherpaOnnxOnlineRecognizerConfig;
 begin
-  Initialize(C);
+  C := Default(SherpaOnnxOnlineRecognizerConfig);
  C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
  C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
@@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn
 var
  C: SherpaOnnxOfflineRecognizerConfig;
 begin
-  Initialize(C);
+  C := Default(SherpaOnnxOfflineRecognizerConfig);
  C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate;
  C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim;
@@ -1369,12 +1551,11 @@ end;
 constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
 var
-  C: SherpaOnnxVadModelConfig;
+  C: SherpaOnnxVadModelConfig ;
 begin
  C := Default(SherpaOnnxVadModelConfig);
  Self._Config := Config;
  Initialize(C);
  C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  C.SileroVad.Threshold := Config.SileroVad.Threshold;
  C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
@@ -1460,5 +1641,197 @@ begin
  SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
 end;
-end.
+function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString;
 { Returns a single-line, human-readable dump of every field. }
 const
  Layout = 'TSherpaOnnxOfflineTtsVitsModelConfig(' +
    'Model := %s, ' +
    'Lexicon := %s, ' +
    'Tokens := %s, ' +
    'DataDir := %s, ' +
    'NoiseScale := %.2f, ' +
    'NoiseScaleW := %.2f, ' +
    'LengthScale := %.2f, ' +
    'DictDir := %s' +
    ')';
 begin
  Result := Format(Layout,
    [Self.Model, Self.Lexicon, Self.Tokens, Self.DataDir,
     Self.NoiseScale, Self.NoiseScaleW, Self.LengthScale,
     Self.DictDir]);
 end;
 { Management operator: gives a new record its default numeric values.
   The string fields are not assigned here. }
 class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig);
 begin
  Dest.NoiseScale := 0.667;
  Dest.NoiseScaleW := 0.8;
  Dest.LengthScale := 1.0;
 end;
 { Returns a single-line, human-readable dump of every field, including the
   nested Vits configuration. }
 function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString;
 const
  Layout = 'TSherpaOnnxOfflineTtsModelConfig(' +
    'Vits := %s, ' +
    'NumThreads := %d, ' +
    'Debug := %s, ' +
    'Provider := %s' +
    ')';
 begin
  Result := Format(Layout,
    [Self.Vits.ToString, Self.NumThreads,
     Self.Debug.ToString, Self.Provider]);
 end;
 { Management operator: gives a new record its default values
   (one thread, debugging off, 'cpu' provider). }
 class operator TSherpaOnnxOfflineTtsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig);
 begin
  Dest.NumThreads := 1;
  Dest.Debug := False;
  Dest.Provider := 'cpu';
 end;
 { Returns a single-line, human-readable dump of every field, including the
   nested Model configuration. }
 function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString;
 const
  Layout = 'TSherpaOnnxOfflineTtsConfig(' +
    'Model := %s, ' +
    'RuleFsts := %s, ' +
    'MaxNumSentences := %d, ' +
    'RuleFars := %s' +
    ')';
 begin
  Result := Format(Layout,
    [Self.Model.ToString, Self.RuleFsts,
     Self.MaxNumSentences, Self.RuleFars]);
 end;
 { Management operator: a new record defaults to MaxNumSentences = 1. }
 class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig);
 begin
  Dest.MaxNumSentences := 1;
 end;
 { Translates Config into its C-compatible record, creates the native TTS
   engine from it and caches the engine's sample rate and speaker count. }
 constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig);
 var
  Native: SherpaOnnxOfflineTtsConfig;
 begin
  Self._Config := Config;
  Native := Default(SherpaOnnxOfflineTtsConfig);

  { Top-level options }
  Native.RuleFsts := PAnsiChar(Config.RuleFsts);
  Native.RuleFars := PAnsiChar(Config.RuleFars);
  Native.MaxNumSentences := Config.MaxNumSentences;

  { Model options; the Boolean is passed as 0/1. }
  Native.Model.NumThreads := Config.Model.NumThreads;
  Native.Model.Debug := Ord(Config.Model.Debug);
  Native.Model.Provider := PAnsiChar(Config.Model.Provider);

  { VITS options }
  Native.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model);
  Native.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon);
  Native.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens);
  Native.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir);
  Native.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir);
  Native.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale;
  Native.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW;
  Native.Model.Vits.LengthScale := Config.Model.Vits.LengthScale;

  Self.Handle := SherpaOnnxCreateOfflineTts(@Native);
  Self.SampleRate := SherpaOnnxOfflineTtsSampleRate(Self.Handle);
  Self.NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Self.Handle);
 end;
 { Releases the native TTS engine, then runs the inherited destructor so the
   class stays correct if it ever gets a non-trivial ancestor. }
 destructor TSherpaOnnxOfflineTts.Destroy;
 begin
  SherpaOnnxDestroyOfflineTts(Self.Handle);
  Self.Handle := nil;
  inherited Destroy;
 end;
 { Generates audio for Text with speaker SpeakerId at speed Speed and returns
   a Pascal-owned copy of the samples together with their sample rate.
   The native result is freed before returning. If the native call returns
   nil, an empty result (no samples, SampleRate = 0) is returned.
   NOTE(review): whether the C API can actually return nil is not visible
   here; the check is defensive. }
 function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single): TSherpaOnnxGeneratedAudio;
 var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
 begin
  Result := Default(TSherpaOnnxGeneratedAudio);
  Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text), SpeakerId, Speed);
  { Avoid dereferencing a nil result. }
  if Audio = nil then
    Exit;
  SetLength(Result.Samples, Audio^.N);
  Result.SampleRate := Audio^.SampleRate;
  for I := Low(Result.Samples) to High(Result.Samples) do
  begin
    Result.Samples[I] := Audio^.Samples[I];
  end;
  SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
 end;
 { Same as the three-argument Generate, but additionally hands Callback and
   Arg to the native generate call. Returns a Pascal-owned copy of the
   samples together with their sample rate; the native result is freed
   before returning. If the native call returns nil, an empty result
   (no samples, SampleRate = 0) is returned.
   NOTE(review): whether the C API can actually return nil is not visible
   here; the check is defensive. }
 function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer;
  Speed: Single;
  Callback:PSherpaOnnxGeneratedAudioCallbackWithArg;
  Arg: Pointer
  ): TSherpaOnnxGeneratedAudio;
 var
  Audio: PSherpaOnnxGeneratedAudio;
  I: Integer;
 begin
  Result := Default(TSherpaOnnxGeneratedAudio);
  Audio := SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle, PAnsiChar(Text),
    SpeakerId, Speed, Callback, Arg);
  { Avoid dereferencing a nil result. }
  if Audio = nil then
    Exit;
  SetLength(Result.Samples, Audio^.N);
  Result.SampleRate := Audio^.SampleRate;
  for I := Low(Result.Samples) to High(Result.Samples) do
  begin
    Result.Samples[I] := Audio^.Samples[I];
  end;
  SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio);
 end;
 { Creates the native resampler for converting SampleRateIn to SampleRateOut.
   The filter cutoff is 0.99 * 0.5 * min(SampleRateIn, SampleRateOut) and the
   filter width (num_zeros) is 6. }
 constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer);
 const
  FilterWidth = 6;
 var
  LowerRate: Single;
  Cutoff: Single;
 begin
  Self.InputSampleRate := SampleRateIn;
  Self.OutputSampleRate := SampleRateOut;

  { The smaller of the two rates. }
  if SampleRateIn < SampleRateOut then
    LowerRate := SampleRateIn
  else
    LowerRate := SampleRateOut;

  Cutoff := 0.99 * 0.5 * LowerRate;

  Self.Handle := SherpaOnnxCreateLinearResampler(SampleRateIn,
    SampleRateOut, Cutoff, FilterWidth);
 end;
 { Releases the native resampler, then runs the inherited destructor so the
   class stays correct if it ever gets a non-trivial ancestor. }
 destructor TSherpaOnnxLinearResampler.Destroy;
 begin
  SherpaOnnxDestroyLinearResampler(Self.Handle);
  Self.Handle := nil;
  inherited Destroy;
 end;
 { Resamples the N samples starting at Samples and returns the converted
   samples as a new dynamic array. Flush is passed to the native call as 0/1.
   The native result is freed before returning. }
 function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat;
  N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray;
 var
  Native: PSherpaOnnxResampleOut;
  Idx: Integer;
 begin
  Native := SherpaOnnxLinearResamplerResample(Self.Handle, Samples, N, Ord(Flush));

  Result := nil;
  SetLength(Result, Native^.N);

  Idx := 0;
  while Idx < Length(Result) do
  begin
    Result[Idx] := Native^.Samples[Idx];
    Inc(Idx);
  end;

  SherpaOnnxLinearResamplerResampleFree(Native);
 end;
 { Convenience overload: forwards the array's data and length to the
   pointer-based Resample.
   NOTE(review): relies on casting the open array parameter to pcfloat -
   confirm this yields the address of the first element on all targets. }
 function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray;
 begin
  Result := Self.Resample(pcfloat(Samples), Length(Samples), Flush);
 end;
 { Forwards to the native SherpaOnnxLinearResamplerReset for this handle. }
 procedure TSherpaOnnxLinearResampler.Reset;
 begin
  SherpaOnnxLinearResamplerReset(Self.Handle);
 end;
 end.