Add Lazarus example for generating subtitles using Silero VAD with non-streaming ASR (#1251)

This commit is contained in:
Fangjun Kuang
2024-08-15 22:19:45 +08:00
committed by GitHub
parent 97a6a2a16a
commit fbe35ba736
32 changed files with 1697 additions and 14 deletions

30
lazarus-examples/.gitignore vendored Normal file
View File

@@ -0,0 +1,30 @@
# Lazarus compiler-generated binaries (safe to delete)
*.exe
*.dll
*.so
*.dylib
*.lrs
*.res
*.compiled
*.dbg
*.ppu
*.o
*.or
*.a
# Lazarus autogenerated files (duplicated info)
*.rst
*.rsj
*.lrt
# Lazarus local files (user-specific info)
*.lps
# Lazarus backups and unit output folders.
# These can be changed by user in Lazarus/project options.
backup/
*.bak
lib/
# Application bundle for Mac OS
*.app/

View File

@@ -0,0 +1,3 @@
generate_subtitles.app
generate_subtitles
generate_subtitles.dSYM

Binary file not shown.

After

Width:  |  Height:  |  Size: 130 KiB

View File

@@ -0,0 +1,208 @@
<?xml version="1.0" encoding="UTF-8"?>
<CONFIG>
<ProjectOptions>
<Version Value="12"/>
<PathDelim Value="\"/>
<General>
<SessionStorage Value="InProjectDir"/>
<Title Value="generate_subtitles"/>
<Scaled Value="True"/>
<ResourceType Value="res"/>
<UseXPManifest Value="True"/>
<XPManifest>
<DpiAware Value="True"/>
</XPManifest>
<Icon Value="0"/>
</General>
<BuildModes>
<Item Name="Default" Default="True"/>
<Item Name="Debug">
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="generate_subtitles"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir)"/>
<Libraries Value="..\..\build-static\install\lib;..\..\build\install\lib"/>
<OtherUnitFiles Value="..\..\sherpa-onnx\pascal-api"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<Parsing>
<SyntaxOptions>
<IncludeAssertionCode Value="True"/>
</SyntaxOptions>
</Parsing>
<CodeGeneration>
<Checks>
<IOChecks Value="True"/>
<RangeChecks Value="True"/>
<OverflowChecks Value="True"/>
<StackChecks Value="True"/>
</Checks>
<VerifyObjMethodCallValidity Value="True"/>
</CodeGeneration>
<Linking>
<Debugging>
<DebugInfoType Value="dsDwarf3"/>
<UseHeaptrc Value="True"/>
<TrashVariables Value="True"/>
<StripSymbols Value="True"/>
<UseExternalDbgSyms Value="True"/>
</Debugging>
<Options>
<Win32>
<GraphicApplication Value="True"/>
</Win32>
</Options>
</Linking>
</CompilerOptions>
</Item>
<Item Name="Release">
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="generate_subtitles"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir)"/>
<Libraries Value="..\..\build-static\install\lib;..\..\build\install\lib"/>
<OtherUnitFiles Value="..\..\sherpa-onnx\pascal-api"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<CodeGeneration>
<SmartLinkUnit Value="True"/>
<Optimizations>
<OptimizationLevel Value="3"/>
</Optimizations>
</CodeGeneration>
<Linking>
<Debugging>
<GenerateDebugInfo Value="False"/>
<RunWithoutDebug Value="True"/>
<StripSymbols Value="True"/>
</Debugging>
<LinkSmart Value="True"/>
<Options>
<Win32>
<GraphicApplication Value="True"/>
</Win32>
</Options>
</Linking>
</CompilerOptions>
</Item>
<Item Name="Release-Linux">
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="generate_subtitles"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir)"/>
<Libraries Value="..\..\build-static\install\lib;..\..\build\install\lib"/>
<OtherUnitFiles Value="..\..\sherpa-onnx\pascal-api"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<CodeGeneration>
<SmartLinkUnit Value="True"/>
<Optimizations>
<OptimizationLevel Value="3"/>
</Optimizations>
</CodeGeneration>
<Linking>
<Debugging>
<GenerateDebugInfo Value="False"/>
<RunWithoutDebug Value="True"/>
<StripSymbols Value="True"/>
</Debugging>
<LinkSmart Value="True"/>
<Options>
<Win32>
<GraphicApplication Value="True"/>
</Win32>
</Options>
</Linking>
<Other>
<CustomOptions Value="-dSHERPA_ONNX_USE_SHARED_LIBS"/>
</Other>
</CompilerOptions>
</Item>
</BuildModes>
<PublishOptions>
<Version Value="2"/>
<UseFileFilters Value="True"/>
</PublishOptions>
<RunParams>
<FormatVersion Value="2"/>
</RunParams>
<RequiredPackages>
<Item>
<PackageName Value="LCL"/>
</Item>
</RequiredPackages>
<Units>
<Unit>
<Filename Value="generate_subtitles.lpr"/>
<IsPartOfProject Value="True"/>
</Unit>
<Unit>
<Filename Value="unit1.pas"/>
<IsPartOfProject Value="True"/>
<ComponentName Value="Form1"/>
<HasResources Value="True"/>
<ResourceBaseClass Value="Form"/>
<UnitName Value="Unit1"/>
</Unit>
<Unit>
<Filename Value="my_worker.pas"/>
<IsPartOfProject Value="True"/>
</Unit>
</Units>
</ProjectOptions>
<CompilerOptions>
<Version Value="11"/>
<PathDelim Value="\"/>
<Target>
<Filename Value="generate_subtitles"/>
</Target>
<SearchPaths>
<IncludeFiles Value="$(ProjOutDir)"/>
<Libraries Value="..\..\build-static\install\lib;..\..\build\install\lib"/>
<OtherUnitFiles Value="..\..\sherpa-onnx\pascal-api"/>
<UnitOutputDirectory Value="lib\$(TargetCPU)-$(TargetOS)"/>
</SearchPaths>
<CodeGeneration>
<Optimizations>
<OptimizationLevel Value="2"/>
</Optimizations>
</CodeGeneration>
<Linking>
<Debugging>
<GenerateDebugInfo Value="False"/>
<DebugInfoType Value="dsDwarf3"/>
<StripSymbols Value="True"/>
</Debugging>
<Options>
<Win32>
<GraphicApplication Value="True"/>
</Win32>
</Options>
</Linking>
</CompilerOptions>
<Debugging>
<Exceptions>
<Item>
<Name Value="EAbort"/>
</Item>
<Item>
<Name Value="ECodetoolError"/>
</Item>
<Item>
<Name Value="EFOpenError"/>
</Item>
</Exceptions>
</Debugging>
</CONFIG>

View File

@@ -0,0 +1,26 @@
{ Entry point of the generate_subtitles example.
  It creates the main form (TForm1, declared in unit1) and runs the
  application loop. The my_worker unit is listed so that it is part of the
  program. }
program generate_subtitles;
{$mode objfpc}{$H+}
uses
{ These units are only compiled in on UNIX targets. }
{$IFDEF UNIX}
cthreads,
cmem,
{$ENDIF}
{ This unit is only compiled in when HASAMIGA is defined. }
{$IFDEF HASAMIGA}
athreads,
{$ENDIF}
Interfaces, // this includes the LCL widgetset
Forms, unit1, my_worker
{ you can add units after this };
{ Link the resource file that shares the program's base name. }
{$R *.res}
begin
RequireDerivedFormResource:=True;
Application.Scaled:=True;
Application.Initialize;
{ Form1 is the global variable declared in unit1. }
Application.CreateForm(TForm1, Form1);
Application.Run;
end.

View File

@@ -0,0 +1,160 @@
unit my_worker;
{$mode ObjFPC}{$H+}
{
See
https://wiki.lazarus.freepascal.org/Multithreaded_Application_Tutorial
https://www.freepascal.org/docs-html/rtl/classes/tthread.html
}
interface
uses
{$IFDEF UNIX}
cthreads,
cmem,
{$ENDIF}
{$IFDEF HASAMIGA}
athreads,
{$ENDIF}
Classes, SysUtils;
type
  { Background thread that reads one wave file, feeds it to Form1.Vad window
    by window, decodes every detected speech segment with
    Form1.OfflineRecognizer and reports results to Form1 via Synchronize.

    The thread frees itself when it finishes (FreeOnTerminate = True). Right
    before Execute returns it clears the global MyWorkerThread variable on the
    main thread, so main-thread code that tests "MyWorkerThread <> nil" never
    dereferences an already freed thread object. }
  TMyWorkerThread = class(TThread)
  private
    { Values handed over to the main thread by ShowStatus/ShowProgress. }
    Status: AnsiString;
    StartTime: Single;
    StopTime: Single;
    TotalDuration: Single;
    procedure ShowStatus;
    procedure ShowProgress;
    procedure ReleaseGlobalReference;
    procedure DecodePendingSegments(SampleRate: Integer);
  protected
    procedure Execute; override;
  public
    WaveFilename: AnsiString;
    Constructor Create(CreateSuspended : boolean; Filename: AnsiString);
  end;

var
  { The worker that is currently running, or nil once that worker has
    finished. Only read and written on the main thread. }
  MyWorkerThread: TMyWorkerThread;

implementation

uses
  unit1, sherpa_onnx;

{ Creates the worker for the given wave file.
  CreateSuspended is forwarded unchanged to TThread.Create. }
constructor TMyWorkerThread.Create(CreateSuspended : boolean; Filename: AnsiString);
begin
  inherited Create(CreateSuspended);
  WaveFilename := Filename;
  FreeOnTerminate := True;
end;

{ Runs on the main thread (through Synchronize): pushes the current status
  text and timing information to the form. }
procedure TMyWorkerThread.ShowStatus;
begin
  Form1.UpdateResult(Status, StartTime, StopTime, TotalDuration);
end;

{ Runs on the main thread (through Synchronize): updates the progress bar. }
procedure TMyWorkerThread.ShowProgress;
begin
  Form1.UpdateProgress(StopTime, TotalDuration);
end;

{ Runs on the main thread (through Synchronize): forgets the global reference
  to this thread, unless a newer worker has already replaced it. Because the
  thread is FreeOnTerminate, leaving the global untouched would turn it into
  a dangling pointer as soon as Execute returns. }
procedure TMyWorkerThread.ReleaseGlobalReference;
begin
  if MyWorkerThread = Self then
    MyWorkerThread := nil;
end;

{ Pops every speech segment currently queued in Form1.Vad, decodes it and
  reports the recognized text together with its start/stop time.
  SampleRate is the sample rate of the wave being processed; it is used both
  for decoding and for converting sample counts to seconds.
  Stops early when the thread has been asked to terminate. }
procedure TMyWorkerThread.DecodePendingSegments(SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Duration: Single;
begin
  while (not Terminated) and (not Form1.Vad.IsEmpty) do
  begin
    SpeechSegment := Form1.Vad.Front;
    Form1.Vad.Pop;

    Stream := Form1.OfflineRecognizer.CreateStream;
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
      Form1.OfflineRecognizer.Decode(Stream);
      RecognitionResult := Form1.OfflineRecognizer.GetResult(Stream);

      StartTime := SpeechSegment.Start / SampleRate;
      Duration := Length(SpeechSegment.Samples) / SampleRate;
      StopTime := StartTime + Duration;
      Status := RecognitionResult.Text;
      Synchronize(@ShowStatus);
    finally
      { Release the stream even if decoding or reporting raised. }
      FreeAndNil(Stream);
    end;
  end;
end;

{ Thread body. Validates the wave file, runs VAD over it in windows of
  Form1.Vad.Config.SileroVad.WindowSize samples, decodes detected segments as
  they appear and finally reports 'DONE!' or 'Cancelled!'.
  Note: the status strings are matched by TForm1.UpdateResult and therefore
  must not be changed here alone. }
procedure TMyWorkerThread.Execute;
const
  { A progress update is sent whenever Offset is a multiple of this value. }
  ProgressStride = 20480;
var
  Wave: TSherpaOnnxWave;
  WindowSize: Integer;
  Offset: Integer;
begin
  try
    TotalDuration := 0;
    StartTime := 0;
    StopTime := 0;

    Wave := SherpaOnnxReadWave(WaveFilename);

    if (Wave.Samples = nil) or (Length(Wave.Samples) = 0) then
    begin
      Status := Format('Failed to read %s. We only support 1 channel, 16000Hz, 16-bit encoded wave files',
        [WaveFilename]);
      Synchronize(@ShowStatus);
      Exit;
    end;

    if Wave.SampleRate <> 16000 then
    begin
      Status := Format('Expected sample rate 16000. Given %d. Please select a new file', [Wave.SampleRate]);
      Synchronize(@ShowStatus);
      Exit;
    end;

    TotalDuration := Length(Wave.Samples) / Wave.SampleRate;
    WindowSize := Form1.Vad.Config.SileroVad.WindowSize;
    Offset := 0;
    Form1.Vad.Reset;

    { Only complete windows are fed to the detector; a shorter tail is skipped. }
    while (not Terminated) and (Offset + WindowSize <= Length(Wave.Samples)) do
    begin
      Form1.Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Offset += WindowSize;
      StopTime := Offset / Wave.SampleRate;

      if (Offset mod ProgressStride) = 0 then
        Synchronize(@ShowProgress);

      DecodePendingSegments(Wave.SampleRate);
    end;

    { Flush the detector and decode whatever it still had buffered. }
    Form1.Vad.Flush;
    DecodePendingSegments(Wave.SampleRate);

    if Terminated then
      Status := 'Cancelled!'
    else
      Status := 'DONE!';
    Synchronize(@ShowStatus);
  finally
    { Must be the last thing that happens: after Execute returns the thread
      object is freed automatically. }
    Synchronize(@ReleaseGlobalReference);
  end;
end;
end.

View File

@@ -0,0 +1,74 @@
object Form1: TForm1
Left = 366
Height = 623
Top = 117
Width = 852
Caption = 'Next-gen Kaldi: Generate Subtitles'
ClientHeight = 623
ClientWidth = 852
OnClose = FormClose
OnCreate = FormCreate
LCLVersion = '3.4.0.0'
object FileNameEdt: TEdit
Left = 200
Height = 22
Top = 40
Width = 440
TabOrder = 0
OnChange = FileNameEdtChange
end
object SelectFileBtn: TButton
Left = 96
Height = 25
Top = 40
Width = 97
Caption = 'Select a file...'
TabOrder = 1
OnClick = SelectFileBtnClick
end
object StartBtn: TButton
Left = 656
Height = 25
Top = 37
Width = 75
Caption = 'Start'
TabOrder = 2
OnClick = StartBtnClick
end
object InitBtn: TButton
Left = 280
Height = 25
Top = 8
Width = 280
Caption = 'Click me to initialize models before you start'
TabOrder = 3
OnClick = InitBtnClick
end
object ResultMemo: TMemo
Left = 24
Height = 488
Top = 72
Width = 800
ScrollBars = ssAutoBoth
TabOrder = 4
end
object ProgressBar: TProgressBar
Left = 32
Height = 16
Top = 592
Width = 792
TabOrder = 5
end
object ProgressLabel: TLabel
Left = 770
Height = 16
Top = 568
Width = 8
Caption = '0'
end
object SelectFileDlg: TOpenDialog
Title = 'Open a wave file'
Left = 600
Top = 488
end
end

View File

@@ -0,0 +1,502 @@
unit Unit1;
{$mode objfpc}{$H+}
{$IFDEF DARWIN}
{$modeswitch objectivec1} {For getting resource directory}
{$ENDIF}
interface
uses
Classes, SysUtils, StrUtils, Forms, Controls,
Graphics, Dialogs, StdCtrls,
sherpa_onnx, ComCtrls;
type
{ TForm1 }
{ Main window of the subtitle generation example.
  The component fields and event handlers in the first section correspond to
  the objects and On* entries of the form definition loaded by R *.lfm. }
TForm1 = class(TForm)
InitBtn: TButton;
ProgressBar: TProgressBar;
ResultMemo: TMemo;
StartBtn: TButton;
SelectFileDlg: TOpenDialog;
SelectFileBtn: TButton;
FileNameEdt: TEdit;
ProgressLabel: TLabel;
procedure FileNameEdtChange(Sender: TObject);
procedure FormClose(Sender: TObject; var CloseAction: TCloseAction);
procedure InitBtnClick(Sender: TObject);
procedure SelectFileBtnClick(Sender: TObject);
procedure FormCreate(Sender: TObject);
procedure StartBtnClick(Sender: TObject);
private
public
{ Appends a message or a recognized segment to ResultMemo and refreshes the
  progress display. Called by the worker thread through Synchronize. }
procedure UpdateResult(
Msg: AnsiString;
StartTime: Single;
StopTime: Single;
TotalDuration: Single);
{ Sets the progress bar and label to StopTime / TotalDuration in percent;
  does nothing when either argument is 0. }
procedure UpdateProgress(StopTime: Single; TotalDuration: Single);
public
{ Both objects are created in InitBtnClick, released in FormClose and used
  by the worker thread in the my_worker unit. }
Vad: TSherpaOnnxVoiceActivityDetector;
OfflineRecognizer: TSherpaOnnxOfflineRecognizer;
end;
var
Form1: TForm1;
implementation
uses
my_worker
{$IFDEF DARWIN}
,MacOSAll
,CocoaAll
{$ENDIF}
;
{See https://wiki.lazarus.freepascal.org/Locating_the_macOS_application_resources_directory}
{$IFDEF DARWIN}
{Note: The returned path contains a trailing /}
{ Returns the resources directory of the running application bundle as a
  UTF-8 encoded string with a trailing path delimiter.
  The path is taken from NSString.UTF8String, so it is neither limited to the
  255 bytes of a shortstring nor converted through the legacy system encoding.
  Raises Exception when the bundle reports no resource path. }
function GetResourcesPath(): AnsiString;
var
  ResourcePath: NSString;
begin
  ResourcePath := NSBundle.mainBundle.resourcePath;
  if ResourcePath = nil then
    raise Exception.Create('Error in GetResourcesPath()');
  Result := AnsiString(ResourcePath.UTF8String) + PathDelim;
end;
{$ENDIF}
{ Builds a voice activity detector that uses the silero VAD model stored in
  VadFilename. The caller owns the returned object. }
function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector;
const
  {Please don't change these two unless you know the details}
  ExpectedSampleRate = 16000;
  SamplesPerWindow = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  { General settings }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.Provider := 'cpu';
  VadConfig.NumThreads := 2;
  VadConfig.Debug := True;

  { Silero specific settings }
  VadConfig.SileroVad.Model := VadFilename;
  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
{ Builds a non-streaming recognizer from the three files of a transducer
  model. ModelType is stored unchanged in the model configuration; this file
  passes 'transducer' or 'nemo_transducer'. The caller owns the result. }
function CreateOfflineRecognizerTransducer(
  Tokens: AnsiString;
  Encoder: AnsiString;
  Decoder: AnsiString;
  Joiner: AnsiString;
  ModelType: AnsiString): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Settings shared by every recognizer in this unit }
  RecognizerConfig.ModelConfig.Tokens := Tokens;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 2;
  RecognizerConfig.ModelConfig.Debug := False;

  { Transducer specific settings }
  RecognizerConfig.ModelConfig.ModelType := ModelType;
  RecognizerConfig.ModelConfig.Transducer.Encoder := Encoder;
  RecognizerConfig.ModelConfig.Transducer.Decoder := Decoder;
  RecognizerConfig.ModelConfig.Transducer.Joiner := Joiner;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{ Builds a non-streaming recognizer from a TeleSpeech CTC model file and its
  tokens file. The caller owns the result. }
function CreateOfflineRecognizerTeleSpeech(
  Tokens: AnsiString;
  TeleSpeech: AnsiString): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Settings shared by every recognizer in this unit }
  RecognizerConfig.ModelConfig.Tokens := Tokens;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 2;
  RecognizerConfig.ModelConfig.Debug := False;

  { TeleSpeech specific setting }
  RecognizerConfig.ModelConfig.TeleSpeechCtc := TeleSpeech;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{ Builds a non-streaming recognizer from a Paraformer model file and its
  tokens file. The caller owns the result. }
function CreateOfflineRecognizerParaformer(
  Tokens: AnsiString;
  Paraformer: AnsiString): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Settings shared by every recognizer in this unit }
  RecognizerConfig.ModelConfig.Tokens := Tokens;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 2;
  RecognizerConfig.ModelConfig.Debug := False;

  { Paraformer specific setting }
  RecognizerConfig.ModelConfig.Paraformer.Model := Paraformer;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{ Builds a non-streaming recognizer from a SenseVoice model file and its
  tokens file. Language is set to 'auto' and UseItn is switched on.
  The caller owns the result. }
function CreateOfflineRecognizerSenseVoice(
  Tokens: AnsiString;
  SenseVoice: AnsiString): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Settings shared by every recognizer in this unit }
  RecognizerConfig.ModelConfig.Tokens := Tokens;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 2;
  RecognizerConfig.ModelConfig.Debug := False;

  { SenseVoice specific settings }
  RecognizerConfig.ModelConfig.SenseVoice.Model := SenseVoice;
  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
  RecognizerConfig.ModelConfig.SenseVoice.UseItn := True;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{ Builds a non-streaming recognizer from the encoder and decoder files of a
  Whisper model plus its tokens file. The caller owns the result. }
function CreateOfflineRecognizerWhisper(
  Tokens: AnsiString;
  WhisperEncoder: AnsiString;
  WhisperDecoder: AnsiString): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Settings shared by every recognizer in this unit }
  RecognizerConfig.ModelConfig.Tokens := Tokens;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 2;
  RecognizerConfig.ModelConfig.Debug := False;

  { Whisper specific settings }
  RecognizerConfig.ModelConfig.Whisper.Encoder := WhisperEncoder;
  RecognizerConfig.ModelConfig.Whisper.Decoder := WhisperDecoder;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{$R *.lfm}
{ TForm1 }
{ Puts the form into its initial state: everything except the Init button is
  disabled until the models have been loaded, the usage hints are shown and
  the progress display is cleared. }
procedure TForm1.FormCreate(Sender: TObject);
begin
  StartBtn.Enabled := False;
  FileNameEdt.Enabled := False;
  SelectFileBtn.Enabled := False;

  { The dialog only lists *.wav files, so label the filter accordingly. }
  SelectFileDlg.Filter := 'Wave Files (*.wav)|*.wav';

  ResultMemo.Lines.Add('1. It supports only 1 channel, 16-bit, 16000Hz wav files');
  ResultMemo.Lines.Add('2. There should be no Chinese characters in the file path.');

  ProgressBar.Position := 0;
  ProgressLabel.Caption := '';
end;
{ Toggles between starting and stopping a recognition run.
  The button caption doubles as the state: 'Stop' means a run was started. }
procedure TForm1.StartBtnClick(Sender: TObject);
var
  RunWasStarted: Boolean;
begin
  RunWasStarted := (StartBtn.Caption = 'Stop');

  if not RunWasStarted then
  begin
    { Start a new run on the file named in the edit box. }
    ResultMemo.Lines.Clear();
    ResultMemo.Lines.Add('Start processing');
    ProgressBar.Position := 0;
    ProgressLabel.Caption := Format('%d%%', [ProgressBar.Position]);

    MyWorkerThread := TMyWorkerThread.Create(False, FileNameEdt.Text);
    StartBtn.Caption := 'Stop';
  end
  else
  begin
    { Ask the running worker to stop.
      NOTE(review): the worker is created with FreeOnTerminate = True in
      my_worker; confirm MyWorkerThread cannot refer to a freed thread here. }
    if Assigned(MyWorkerThread) and (not MyWorkerThread.Finished) then
      MyWorkerThread.Terminate;
    StartBtn.Caption := 'Start';
  end;
end;
{ Lets the user pick a file; the chosen name is copied into the edit box.
  Nothing changes when the dialog is cancelled. }
procedure TForm1.SelectFileBtnClick(Sender: TObject);
begin
  if not SelectFileDlg.Execute then
    Exit;
  FileNameEdt.Text := SelectFileDlg.FileName;
end;
{ The Start button is available only while the edit box names an existing file. }
procedure TForm1.FileNameEdtChange(Sender: TObject);
begin
  StartBtn.Enabled := FileExists(FileNameEdt.Text);
end;
{ Stops a still running worker, waits for it, and then releases the
  detector and the recognizer owned by the form. }
procedure TForm1.FormClose(Sender: TObject; var CloseAction: TCloseAction);
var
  Worker: TMyWorkerThread;
begin
  Worker := MyWorkerThread;
  if Assigned(Worker) and (not Worker.Finished) then
  begin
    Worker.Terminate;
    Worker.WaitFor;
  end;

  { Only release these after the worker is done with them. }
  FreeAndNil(Vad);
  FreeAndNil(OfflineRecognizer);
end;
{ Shows StopTime / TotalDuration as a percentage in the progress bar and its
  label. The display is left untouched when either argument is 0. }
procedure TForm1.UpdateProgress(StopTime: Single; TotalDuration: Single);
var
  Pct: Single;
begin
  if (StopTime = 0) or (TotalDuration = 0) then
    Exit;

  Pct := StopTime / TotalDuration * 100;
  ProgressBar.Position := Round(Pct);
  ProgressLabel.Caption := Format('%d%%', [ProgressBar.Position]);
end;
{ Appends one line to the result memo and refreshes the progress display.

  Msg is either a status message produced by the worker thread ('DONE!',
  'Cancelled!' or one of its two error messages) or the recognized text of a
  speech segment. Status messages are shown verbatim and switch the Start
  button caption back to 'Start'; recognized text is prefixed with the
  segment's start and stop time in seconds.

  The method works on its own controls (Self) instead of going through the
  global Form1 variable. }
procedure TForm1.UpdateResult(
  Msg: AnsiString;
  StartTime: Single;
  StopTime: Single;
  TotalDuration: Single);
var
  NewResult: AnsiString;
  IsFinalMessage: Boolean;
begin
  UpdateProgress(StopTime, TotalDuration);

  { These checks mirror the status strings assigned in TMyWorkerThread.Execute. }
  IsFinalMessage := (Msg = 'DONE!') or
    (Msg = 'Cancelled!') or
    EndsStr('16-bit encoded wave files', Msg) or
    EndsStr('. Please select a new file', Msg);

  if IsFinalMessage then
  begin
    StartBtn.Caption := 'Start';
    NewResult := Msg;
  end
  else
    NewResult := Format('%.3f -- %.3f %s', [StartTime, StopTime, Msg]);

  if Msg = 'DONE!' then
  begin
    ProgressBar.Position := 100;
    ProgressLabel.Caption := '100%';
  end;

  ResultMemo.Lines.Add(NewResult);
end;
{ Loads the VAD model and the first non-streaming ASR model that is found in
  the model directory, then enables the file selection controls.

  The model directory is the application bundle's resources directory on
  macOS and the current directory elsewhere. Models are looked up under
  normalized file names (see the comments below) in this order: Whisper,
  SenseVoice, Paraformer, TeleSpeech, Zipformer transducer, NeMo transducer.

  When a required file is missing a message is shown and the button stays
  enabled so the user can try again; a detector left over from such an
  attempt is released before a new one is created. }
procedure TForm1.InitBtnClick(Sender: TObject);
var
  Msg: AnsiString;
  ModelDir: AnsiString;
  VadFilename: AnsiString;
  Tokens: AnsiString;
  WhisperEncoder: AnsiString;
  WhisperDecoder: AnsiString;
  SenseVoice: AnsiString;
  Paraformer: AnsiString;
  TeleSpeech: AnsiString;
  TransducerEncoder: AnsiString; // from icefall
  TransducerDecoder: AnsiString;
  TransducerJoiner: AnsiString;
  NeMoTransducerEncoder: AnsiString;
  NeMoTransducerDecoder: AnsiString;
  NeMoTransducerJoiner: AnsiString;
begin
  {$IFDEF DARWIN}
  ModelDir := GetResourcesPath;
  {$ELSE}
  ModelDir := './';
  {$ENDIF}

  VadFilename := ModelDir + 'silero_vad.onnx';
  Tokens := ModelDir + 'tokens.txt';

  {
    Please refer to
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/export-onnx.html#available-models
    for a list of whisper models.
    In the code, we use the normalized filename whisper-encoder.onnx, whisper-decoder.onnx, and tokens.txt
    You need to rename the existing model files.
    For instance, if you use sherpa-onnx-whisper-tiny.en, you have to do
      mv tiny.en-tokens.txt tokens.txt
      mv tiny.en-encoder.onnx whisper-encoder.onnx
      mv tiny.en-decoder.onnx whisper-decoder.onnx
      // or use the int8.onnx
      mv tiny.en-encoder.int8.onnx whisper-encoder.onnx
      mv tiny.en-decoder.int8.onnx whisper-decoder.onnx
  }
  WhisperEncoder := ModelDir + 'whisper-encoder.onnx';
  WhisperDecoder := ModelDir + 'whisper-decoder.onnx';

  {
    Please refer to
    https://k2-fsa.github.io/sherpa/onnx/sense-voice/pretrained.html#pre-trained-models
    to download models for SenseVoice.
    In the code, we use the normalized model name sense-voice.onnx. You have
    to rename the downloaded model files.
    For example, you need to use
      mv model.onnx sense-voice.onnx
      // or use the int8.onnx
      mv model.int8.onnx sense-voice.onnx
  }
  SenseVoice := ModelDir + 'sense-voice.onnx';

  {
    Please refer to
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/index.html
    to download paraformer models.
    Note that you have to rename model.onnx or model.int8.onnx to paraformer.onnx.
    An example is given below for the rename:
      cp model.onnx paraformer.onnx
      // or use int8.onnx
      cp model.int8.onnx paraformer.onnx
  }
  Paraformer := ModelDir + 'paraformer.onnx';

  {
    Please refer to
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/telespeech/models.html
    to download TeleSpeech models.
    Note that you have to rename model files after downloading. The following
    is an example
      mv model.onnx telespeech.onnx
      // or to use int8.onnx
      mv model.int8.onnx telespeech.onnx
  }
  TeleSpeech := ModelDir + 'telespeech.onnx';

  {
    Please refer to
    https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
    to download an icefall offline transducer model. Note that you need to rename the
    model files to transducer-encoder.onnx, transducer-decoder.onnx, and
    transducer-joiner.onnx
  }
  TransducerEncoder := ModelDir + 'transducer-encoder.onnx';
  TransducerDecoder := ModelDir + 'transducer-decoder.onnx';
  TransducerJoiner := ModelDir + 'transducer-joiner.onnx';

  {
    Please visit
    https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
    to download a NeMo transducer model.
  }
  NeMoTransducerEncoder := ModelDir + 'nemo-transducer-encoder.onnx';
  NeMoTransducerDecoder := ModelDir + 'nemo-transducer-decoder.onnx';
  NeMoTransducerJoiner := ModelDir + 'nemo-transducer-joiner.onnx';

  if not FileExists(VadFilename) then
  begin
    ShowMessage(VadFilename + ' does not exist! Please download it from' +
      sLineBreak + 'https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models'
    );
    Exit;
  end;

  { An earlier click may have created the detector and then stopped at one of
    the checks below; release that instance instead of overwriting it. }
  FreeAndNil(Vad);
  Vad := CreateVad(VadFilename);

  if not FileExists(Tokens) then
  begin
    ShowMessage(Tokens + ' not found. Please download a non-streaming ASR model first!');
    Exit;
  end;

  { Use the first model whose files are all present. }
  if FileExists(WhisperEncoder) and FileExists(WhisperDecoder) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerWhisper(Tokens, WhisperEncoder, WhisperDecoder);
    Msg := 'Whisper';
  end
  else if FileExists(SenseVoice) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerSenseVoice(Tokens, SenseVoice);
    Msg := 'SenseVoice';
  end
  else if FileExists(Paraformer) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerParaformer(Tokens, Paraformer);
    Msg := 'Paraformer';
  end
  else if FileExists(TeleSpeech) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerTeleSpeech(Tokens, TeleSpeech);
    Msg := 'TeleSpeech';
  end
  else if FileExists(TransducerEncoder) and FileExists(TransducerDecoder) and FileExists(TransducerJoiner) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerTransducer(Tokens,
      TransducerEncoder, TransducerDecoder, TransducerJoiner, 'transducer');
    Msg := 'Zipformer transducer';
  end
  else if FileExists(NeMoTransducerEncoder) and FileExists(NeMoTransducerDecoder) and FileExists(NeMoTransducerJoiner) then
  begin
    OfflineRecognizer := CreateOfflineRecognizerTransducer(Tokens,
      NeMoTransducerEncoder, NeMoTransducerDecoder, NeMoTransducerJoiner, 'nemo_transducer');
    Msg := 'NeMo transducer';
  end
  else
  begin
    ShowMessage('Please download at least one non-streaming speech recognition model first.');
    Exit;
  end;

  MessageDlg('Congrat! The ' + Msg + ' model is initialized successfully!', mtInformation, [mbOk], 0);

  { Initialization is a one-time step: unlock file selection, lock Init. }
  FileNameEdt.Enabled := True;
  SelectFileBtn.Enabled := True;
  InitBtn.Enabled := False;
end;
end.