diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml
index 45378141..aed04e28 100644
--- a/.github/workflows/pascal.yaml
+++ b/.github/workflows/pascal.yaml
@@ -154,6 +154,12 @@ jobs:
           ls -lh
           echo "---"
 
+          ./run-kokoro-zh-en.sh
+          rm -rf kokoro-multi-*
+          rm kokoro-zh-en
+          ls -lh
+          echo "---"
+
           ./run-kokoro-en.sh
           rm -rf kokoro-en-*
           rm kokoro-en
diff --git a/pascal-api-examples/tts/.gitignore b/pascal-api-examples/tts/.gitignore
index 42900595..a4f0ef0d 100644
--- a/pascal-api-examples/tts/.gitignore
+++ b/pascal-api-examples/tts/.gitignore
@@ -8,3 +8,5 @@ matcha-zh-playback
 matcha-en-playback
 kokoro-en
 kokoro-en-playback
+kokoro-zh-en
+kokoro-zh-en-playback
diff --git a/pascal-api-examples/tts/kokoro-zh-en-playback.pas b/pascal-api-examples/tts/kokoro-zh-en-playback.pas
new file mode 100644
index 00000000..22d36dea
--- /dev/null
+++ b/pascal-api-examples/tts/kokoro-zh-en-playback.pas
@@ -0,0 +1,241 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+program kokoro_zh_en_playback;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro models (Chinese + English).
+
+It generates speech from text and saves it to a wave file.
+
+Note that it plays the audio back as it is still generating.
+}
+
+{$mode objfpc}
+
+uses
+  {$ifdef unix}
+  cthreads,
+  {$endif}
+  SysUtils,
+  dos,
+  ctypes,
+  portaudio,
+  sherpa_onnx;
+
+var
+  CriticalSection: TRTLCriticalSection;
+
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+  Resampler: TSherpaOnnxLinearResampler;
+
+  Text: AnsiString;
+  Speed: Single = 1.0; {Use a larger value to speak faster}
+  SpeakerId: Integer = 47;
+  Buffer: TSherpaOnnxCircularBuffer;
+  FinishedGeneration: Boolean = False;
+  FinishedPlaying: Boolean = False;
+
+  Version: String;
+  EnvStr: String;
+  Status: Integer;
+  NumDevices: Integer;
+  DeviceIndex: Integer;
+  DeviceInfo: PPaDeviceInfo;
+
+  { If you get EDivByZero: Division by zero error, please change the sample rate
+    to the one supported by your output device (speaker).
+  }
+  DeviceSampleRate: Integer = 48000;
+  I: Integer;
+  Param: TPaStreamParameters;
+  Stream: PPaStream;
+  Wave: TSherpaOnnxWave;
+
+function GenerateCallback(
+  Samples: pcfloat; N: cint32;
+  Arg: Pointer): cint; cdecl;
+begin
+  EnterCriticalSection(CriticalSection);
+  try
+    if Resampler <> nil then
+      Buffer.Push(Resampler.Resample(Samples, N, False))
+    else
+      Buffer.Push(Samples, N);
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+
+  { 1 means to continue generating; 0 means to stop generating. }
+  Result := 1;
+end;
+
+function PlayCallback(
+  input: Pointer; output: Pointer;
+  frameCount: culong;
+  timeInfo: PPaStreamCallbackTimeInfo;
+  statusFlags: TPaStreamCallbackFlags;
+  userData: Pointer ): cint; cdecl;
+var
+  Samples: TSherpaOnnxSamplesArray;
+  I: Integer;
+begin
+  EnterCriticalSection(CriticalSection);
+  try
+    if Buffer.Size >= frameCount then
+      begin
+        Samples := Buffer.Get(Buffer.Head, FrameCount);
+        Buffer.Pop(FrameCount);
+      end
+    else if Buffer.Size > 0 then
+      begin
+        Samples := Buffer.Get(Buffer.Head, Buffer.Size);
+        Buffer.Pop(Buffer.Size);
+        SetLength(Samples, frameCount);
+      end
+    else
+      SetLength(Samples, frameCount);
+
+    for I := 0 to frameCount - 1 do
+      pcfloat(output)[I] := Samples[I];
+
+    if (Buffer.Size > 0) or (not FinishedGeneration) then
+      Result := paContinue
+    else
+      begin
+        Result := paComplete;
+        FinishedPlaying := True;
+      end;
+  finally
+    LeaveCriticalSection(CriticalSection);
+  end;
+end;
+
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  Config.Model.Kokoro.Model := './kokoro-multi-lang-v1_0/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-multi-lang-v1_0/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-multi-lang-v1_0/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-multi-lang-v1_0/espeak-ng-data';
+  Config.Model.Kokoro.DictDir := './kokoro-multi-lang-v1_0/dict';
+  Config.Model.Kokoro.Lexicon := './kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt';
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+  Config.MaxNumSentences := 1;
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+begin
+  Tts := GetOfflineTts;
+  if Tts.GetSampleRate <> DeviceSampleRate then
+    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);
+
+  Version := String(Pa_GetVersionText);
+  WriteLn('Version is ', Version);
+  Status := Pa_Initialize;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+
+  NumDevices := Pa_GetDeviceCount;
+  WriteLn('Num devices: ', NumDevices);
+
+  DeviceIndex := Pa_GetDefaultOutputDevice;
+
+  if DeviceIndex = paNoDevice then
+    begin
+      WriteLn('No default output device found');
+      Pa_Terminate;
+      Exit;
+    end;
+
+  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
+  if EnvStr <> '' then
+    begin
+      DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
+      WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
+    end;
+
+  for I := 0 to (NumDevices - 1) do
+    begin
+      DeviceInfo := Pa_GetDeviceInfo(I);
+      if I = DeviceIndex then
+        { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) }
+        WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
+      else
+        WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
+    end;
+
+  WriteLn('Use device ', DeviceIndex);
+  WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name);
+  WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels);
+
+  Initialize(Param);
+  Param.Device := DeviceIndex;
+  Param.ChannelCount := 1;
+  Param.SampleFormat := paFloat32;
+  param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency;
+  param.HostApiSpecificStreamInfo := nil;
+
+  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);
+
+
+  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
+  Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
+    PPaStreamCallback(@PlayCallback), nil);
+
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
+      Pa_Terminate;
+      Exit;
+    end;
+
+  InitCriticalSection(CriticalSection);
+
+  Status := Pa_StartStream(stream);
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
+      Pa_Terminate;
+      Exit;
+    end;
+
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  Text := '中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?';
+
+  Audio := Tts.Generate(Text, SpeakerId, Speed,
+    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
+  FinishedGeneration := True;
+  SherpaOnnxWriteWave('./kokoro-zh-en-playback-47.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-zh-en-playback-47.wav');
+
+  while not FinishedPlaying do
+    Pa_Sleep(100); {sleep for 0.1 second }
+  {TODO(fangjun): Use an event to indicate the play is finished}
+
+  DoneCriticalSection(CriticalSection);
+
+  FreeAndNil(Tts);
+  FreeAndNil(Resampler);
+
+  Status := Pa_CloseStream(stream);
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+
+  Status := Pa_Terminate;
+  if Status <> paNoError then
+    begin
+      WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
+      Exit;
+    end;
+end.
+
diff --git a/pascal-api-examples/tts/kokoro-zh-en.pas b/pascal-api-examples/tts/kokoro-zh-en.pas
new file mode 100644
index 00000000..d34e8648
--- /dev/null
+++ b/pascal-api-examples/tts/kokoro-zh-en.pas
@@ -0,0 +1,57 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+program kokoro_zh_en;
+{
+This file shows how to use the text to speech API of sherpa-onnx
+with Kokoro TTS models (Chinese + English).
+
+It generates speech from text and saves it to a wave file.
+
+If you want to play it while it is generating, please see
+./kokoro-zh-en-playback.pas
+}
+
+{$mode objfpc}
+
+uses
+  SysUtils,
+  sherpa_onnx;
+
+function GetOfflineTts: TSherpaOnnxOfflineTts;
+var
+  Config: TSherpaOnnxOfflineTtsConfig;
+begin
+  Config.Model.Kokoro.Model := './kokoro-multi-lang-v1_0/model.onnx';
+  Config.Model.Kokoro.Voices := './kokoro-multi-lang-v1_0/voices.bin';
+  Config.Model.Kokoro.Tokens := './kokoro-multi-lang-v1_0/tokens.txt';
+  Config.Model.Kokoro.DataDir := './kokoro-multi-lang-v1_0/espeak-ng-data';
+  Config.Model.Kokoro.DictDir := './kokoro-multi-lang-v1_0/dict';
+  Config.Model.Kokoro.Lexicon := './kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt';
+  Config.Model.NumThreads := 2;
+  Config.Model.Debug := False;
+  Config.MaxNumSentences := 1;
+
+  Result := TSherpaOnnxOfflineTts.Create(Config);
+end;
+
+var
+  Tts: TSherpaOnnxOfflineTts;
+  Audio: TSherpaOnnxGeneratedAudio;
+
+  Text: AnsiString;
+  Speed: Single = 1.0; {Use a larger value to speak faster}
+  SpeakerId: Integer = 46;
+
+begin
+  Tts := GetOfflineTts;
+
+  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');
+
+  Text := '中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?';
+
+  Audio := Tts.Generate(Text, SpeakerId, Speed);
+  SherpaOnnxWriteWave('./kokoro-zh-en-46.wav', Audio.Samples, Audio.SampleRate);
+  WriteLn('Saved to ./kokoro-zh-en-46.wav');
+
+  FreeAndNil(Tts);
+end.
+
diff --git a/pascal-api-examples/tts/run-kokoro-zh-en-playback.sh b/pascal-api-examples/tts/run-kokoro-zh-en-playback.sh
new file mode 100755
index 00000000..cc6e2526
--- /dev/null
+++ b/pascal-api-examples/tts/run-kokoro-zh-en-playback.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+  tar xf kokoro-multi-lang-v1_0.tar.bz2
+  rm kokoro-multi-lang-v1_0.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  -Fl/usr/local/Cellar/portaudio/19.7.0/lib \
+  ./kokoro-zh-en-playback.pas
+
+# Please see ../portaudio-test/README.md
+# for how to install portaudio on macOS
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-zh-en-playback
diff --git a/pascal-api-examples/tts/run-kokoro-zh-en.sh b/pascal-api-examples/tts/run-kokoro-zh-en.sh
new file mode 100755
index 00000000..36b58348
--- /dev/null
+++ b/pascal-api-examples/tts/run-kokoro-zh-en.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
+  mkdir -p ../../build
+  pushd ../../build
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+# please visit
+# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/kokoro.html
+if [ ! -f ./kokoro-multi-lang-v1_0/model.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2
+  tar xf kokoro-multi-lang-v1_0.tar.bz2
+  rm kokoro-multi-lang-v1_0.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
+  -Fl$SHERPA_ONNX_DIR/build/install/lib \
+  ./kokoro-zh-en.pas
+
+export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
+export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
+
+./kokoro-zh-en
diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas
index 182d440a..9163a63c 100644
--- a/sherpa-onnx/pascal-api/sherpa_onnx.pas
+++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas
@@ -82,6 +82,8 @@ type
     Tokens: AnsiString;
     DataDir: AnsiString;
     LengthScale: Single;
+    DictDir: AnsiString;
+    Lexicon: AnsiString;
 
     function ToString: AnsiString;
     class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -757,6 +759,8 @@ type
     Tokens: PAnsiChar;
     DataDir: PAnsiChar;
     LengthScale: cfloat;
+    DictDir: PAnsiChar;
+    Lexicon: PAnsiChar;
   end;
 
   SherpaOnnxOfflineTtsModelConfig = record
@@ -1931,9 +1935,12 @@ begin
     'Voices := %s, ' +
     'Tokens := %s, ' +
     'DataDir := %s, ' +
-    'LengthScale := %.2f' +
+    'LengthScale := %.2f, ' +
+    'DictDir := %s, ' +
+    'Lexicon := %s' +
     ')',
-    [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale]);
+    [Self.Model, Self.Voices, Self.Tokens, Self.DataDir, Self.LengthScale,
+    Self.DictDir, Self.Lexicon]);
 end;
 
 class operator TSherpaOnnxOfflineTtsKokoroModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsKokoroModelConfig);
@@ -2010,6 +2017,8 @@ begin
   C.Model.Kokoro.Tokens := PAnsiChar(Config.Model.Kokoro.Tokens);
   C.Model.Kokoro.DataDir := PAnsiChar(Config.Model.Kokoro.DataDir);
   C.Model.Kokoro.LengthScale := Config.Model.Kokoro.LengthScale;
+  C.Model.Kokoro.DictDir := PAnsiChar(Config.Model.Kokoro.DictDir);
+  C.Model.Kokoro.Lexicon := PAnsiChar(Config.Model.Kokoro.Lexicon);
 
   C.Model.NumThreads := Config.Model.NumThreads;
   C.Model.Provider := PAnsiChar(Config.Model.Provider);