This repository has been archived on 2025-08-26. You can view files and clone it, but cannot push or open issues or pull requests.
Files
enginex_bi_series-sherpa-onnx/pascal-api-examples/tts/matcha-zh-playback.pas
2025-03-17 17:05:15 +08:00

242 lines
6.6 KiB
ObjectPascal
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{ Copyright (c) 2025 Xiaomi Corporation }
program matcha_zh_playback;
{
This file shows how to use the text to speech API of sherpa-onnx
with MatchaTTS models.
It generates speech from text and saves it to a wave file.
Note that it plays the audio back as it is still generating.
}
{$mode objfpc}
uses
{$ifdef unix}
cthreads,
{$endif}
SysUtils,
dos,
ctypes,
portaudio,
sherpa_onnx;
{ Program-wide state. Several of these variables are shared between the main
thread and the two callbacks (GenerateCallback and PlayCallback). }
var
{ Guards every access to Buffer made from GenerateCallback and PlayCallback. }
CriticalSection: TRTLCriticalSection;
Tts: TSherpaOnnxOfflineTts;
Audio: TSherpaOnnxGeneratedAudio;
{ Created only when the TTS sample rate differs from DeviceSampleRate;
GenerateCallback compares it against nil to decide whether to resample. }
Resampler: TSherpaOnnxLinearResampler;
Text: AnsiString;
Speed: Single = 1.0; {Use a larger value to speak faster}
SpeakerId: Integer = 0;
{ Samples waiting to be played: filled by GenerateCallback, drained by
PlayCallback. }
Buffer: TSherpaOnnxCircularBuffer;
{ Set by the main program once Tts.Generate has returned; read by PlayCallback. }
FinishedGeneration: Boolean = False;
{ Set by PlayCallback when nothing is left to play; polled by the main program. }
FinishedPlaying: Boolean = False;
Version: String;
EnvStr: String;
Status: Integer;
NumDevices: Integer;
DeviceIndex: Integer;
DeviceInfo: PPaDeviceInfo;
{ Sample rate used to open the output stream.
If you get an "EDivByZero: Division by zero" error, change this value to a
sample rate that is supported by your audio output device.
}
DeviceSampleRate: Integer = 48000;
I: Integer;
Param: TPaStreamParameters;
Stream: PPaStream;
{ NOTE(review): Wave is not referenced anywhere in the visible part of this
file. }
Wave: TSherpaOnnxWave;
{ Callback passed to Tts.Generate; it receives each newly generated chunk.

  Samples: pointer to the N freshly generated samples.
  N:       number of samples in this chunk.
  Arg:     user pointer supplied to Tts.Generate (not used here).

  The chunk is appended to the shared Buffer, going through Resampler first
  when one has been created. Always returns 1. }
function GenerateCallback(
  Samples: pcfloat; N: cint32;
  Arg: Pointer): cint; cdecl;
begin
  { 1 means to continue generating; 0 means to stop generating. }
  Result := 1;

  { Buffer is also accessed by PlayCallback, so hold the lock while pushing. }
  EnterCriticalSection(CriticalSection);
  try
    if Resampler = nil then
      Buffer.Push(Samples, N)
    else
      Buffer.Push(Resampler.Resample(Samples, N, False));
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
{ PortAudio stream callback: fills `output` with frameCount float samples
  taken from the shared Buffer.

  input, timeInfo, statusFlags and userData are not used.

  When Buffer holds fewer than frameCount samples, whatever is available is
  consumed and the local array is extended to frameCount elements with
  SetLength before copying (the added elements are expected to be zero,
  i.e. silence).

  Returns paContinue while samples remain or generation is still running;
  otherwise returns paComplete and sets FinishedPlaying. }
function PlayCallback(
  input: Pointer; output: Pointer;
  frameCount: culong;
  timeInfo: PPaStreamCallbackTimeInfo;
  statusFlags: TPaStreamCallbackFlags;
  userData: Pointer ): cint; cdecl;
var
  Chunk: TSherpaOnnxSamplesArray;
  Dst: pcfloat;
  K: Integer;
begin
  Dst := pcfloat(output);

  { Buffer is filled concurrently by GenerateCallback. }
  EnterCriticalSection(CriticalSection);
  try
    if Buffer.Size >= frameCount then
    begin
      { Enough data: take exactly one full block. }
      Chunk := Buffer.Get(Buffer.Head, frameCount);
      Buffer.Pop(frameCount);
    end
    else
    begin
      { Not enough data: drain what is there (if anything) ... }
      if Buffer.Size > 0 then
      begin
        Chunk := Buffer.Get(Buffer.Head, Buffer.Size);
        Buffer.Pop(Buffer.Size);
      end;
      { ... and grow the array to a full block. }
      SetLength(Chunk, frameCount);
    end;

    for K := 0 to frameCount - 1 do
      Dst[K] := Chunk[K];

    if (Buffer.Size <= 0) and FinishedGeneration then
    begin
      { Nothing left and no more data will arrive: stop the stream. }
      Result := paComplete;
      FinishedPlaying := True;
    end
    else
      Result := paContinue;
  finally
    LeaveCriticalSection(CriticalSection);
  end;
end;
{ Builds the configuration for the matcha-icefall-zh-baker model and returns
  a new TSherpaOnnxOfflineTts created from it.

  All model files are referenced by paths relative to the current working
  directory. The caller owns the returned object. }
function GetOfflineTts: TSherpaOnnxOfflineTts;
var
  Config: TSherpaOnnxOfflineTtsConfig;
begin
  { Top-level options. }
  Config.MaxNumSentences := 1;
  Config.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst';

  { Runtime options of the model. }
  Config.Model.Debug := False;
  Config.Model.NumThreads := 1;

  { Files that make up the Matcha model. }
  Config.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict';
  Config.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt';
  Config.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt';
  Config.Model.Matcha.Vocoder := './vocos-22khz-univ.onnx';
  Config.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx';

  Result := TSherpaOnnxOfflineTts.Create(Config);
end;
{ Program entry point.

  1. Creates the TTS engine and, if its sample rate differs from
     DeviceSampleRate, a resampler.
  2. Initializes PortAudio, selects the output device (the default one, or the
     index given in the environment variable SHERPA_ONNX_MIC_DEVICE) and opens
     a mono float32 output stream driven by PlayCallback.
  3. Runs Tts.Generate; GenerateCallback pushes samples into Buffer while
     PlayCallback plays them.
  4. Saves the complete audio to ./matcha-zh-playback.wav, waits until
     PlayCallback reports that playback has finished, then releases all
     resources. }
begin
  Tts := GetOfflineTts;

  if Tts.GetSampleRate <> DeviceSampleRate then
    Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate);

  Version := String(Pa_GetVersionText);
  WriteLn('Version is ', Version);

  Status := Pa_Initialize;
  if Status <> paNoError then
  begin
    WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status));
    Exit;
  end;

  NumDevices := Pa_GetDeviceCount;
  WriteLn('Num devices: ', NumDevices);

  DeviceIndex := Pa_GetDefaultOutputDevice;
  if DeviceIndex = paNoDevice then
  begin
    WriteLn('No default output device found');
    Pa_Terminate;
    Exit;
  end;

  EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE');
  if EnvStr <> '' then
  begin
    { Falls back to the default device when the value is not an integer. }
    DeviceIndex := StrToIntDef(EnvStr, DeviceIndex);
    WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr);
  end;

  { List all devices and mark the selected one with '*'. }
  for I := 0 to (NumDevices - 1) do
  begin
    DeviceInfo := Pa_GetDeviceInfo(I);
    if I = DeviceIndex then
      WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)]))
    else
      WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)]));
  end;

  { The index may come from the environment, so validate it before the
    device info pointer is dereferenced below. }
  if (DeviceIndex < 0) or (DeviceIndex >= NumDevices) then
  begin
    WriteLn('Invalid output device index: ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  DeviceInfo := Pa_GetDeviceInfo(DeviceIndex);
  if DeviceInfo = nil then
  begin
    WriteLn('Failed to get device info for device ', DeviceIndex);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('Use device ', DeviceIndex);
  WriteLn(' Name ', DeviceInfo^.Name);
  WriteLn(' Max output channels ', DeviceInfo^.MaxOutputChannels);

  Initialize(Param);
  Param.Device := DeviceIndex;
  Param.ChannelCount := 1;
  Param.SampleFormat := paFloat32;
  Param.SuggestedLatency := DeviceInfo^.DefaultHighOutputLatency;
  Param.HostApiSpecificStreamInfo := nil;

  { Room for 30 seconds of audio at the device sample rate. }
  Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate);

  { The lock must exist before any callback can run. }
  InitCriticalSection(CriticalSection);

  { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. }
  Status := Pa_OpenStream(Stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag,
    PPaStreamCallback(@PlayCallback), nil);
  if Status <> paNoError then
  begin
    WriteLn('Failed to open stream, ', Pa_GetErrorText(Status));
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  Status := Pa_StartStream(Stream);
  if Status <> paNoError then
  begin
    WriteLn('Failed to start stream, ', Pa_GetErrorText(Status));
    { Release what has been acquired so far. }
    Pa_CloseStream(Stream);
    DoneCriticalSection(CriticalSection);
    Pa_Terminate;
    Exit;
  end;

  WriteLn('There are ', Tts.GetNumSpeakers, ' speakers');

  Text := '某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号拨打110或者18920240511。123456块钱。';

  { GenerateCallback is invoked with each chunk while generation is running. }
  Audio := Tts.Generate(Text, SpeakerId, Speed,
    PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil);
  FinishedGeneration := True;

  SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate);
  WriteLn('Saved to ./matcha-zh-playback.wav');

  while not FinishedPlaying do
    Pa_Sleep(100); {sleep for 0.1 second }
  {TODO(fangjun): Use an event to indicate the play is finished}

  { Close the stream BEFORE destroying the critical section: PlayCallback sets
    FinishedPlaying while it still holds the lock, so the callback thread may
    not have left the critical section yet when the loop above exits. Once
    Pa_CloseStream has returned, PortAudio no longer invokes PlayCallback.
    NOTE(review): closing an active stream may discard buffers that are still
    pending; stopping the stream first would let the tail play out - confirm
    that the portaudio unit exposes Pa_StopStream. }
  Status := Pa_CloseStream(Stream);
  if Status <> paNoError then
    WriteLn('Failed to close stream, ', Pa_GetErrorText(Status));

  DoneCriticalSection(CriticalSection);

  FreeAndNil(Tts);
  FreeAndNil(Resampler);
  FreeAndNil(Buffer);

  { Always shut PortAudio down, even when closing the stream failed. }
  Status := Pa_Terminate;
  if Status <> paNoError then
    WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status));
end.