Add Pascal API for Moonshine models (#1482)
This commit is contained in:
8
.github/workflows/pascal.yaml
vendored
8
.github/workflows/pascal.yaml
vendored
@@ -165,6 +165,10 @@ jobs:
|
||||
cd ./pascal-api-examples
|
||||
|
||||
pushd vad-with-non-streaming-asr
|
||||
time ./run-vad-with-moonshine.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
time ./run-vad-with-whisper.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
@@ -220,6 +224,10 @@ jobs:
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
./run-moonshine.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
./run-whisper.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
@@ -7,3 +7,4 @@ paraformer
|
||||
paraformer_itn
|
||||
sense_voice
|
||||
telespeech_ctc
|
||||
moonshine
|
||||
|
||||
80
pascal-api-examples/non-streaming-asr/moonshine.pas
Normal file
80
pascal-api-examples/non-streaming-asr/moonshine.pas
Normal file
@@ -0,0 +1,80 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming Moonshine model
|
||||
to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program moonshine;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
DateUtils,
|
||||
SysUtils;
|
||||
|
||||
var
  Wave: TSherpaOnnxWave;
  WaveFilename: AnsiString;

  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  Start: TDateTime;
  Stop: TDateTime;

  Elapsed: Single;
  Duration: Single;
  RealTimeFactor: Single;
begin
  Initialize(Config);

  { The four ONNX files that make up the Moonshine model. }
  Config.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx';
  Config.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx';
  Config.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx';
  Config.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx';

  Config.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt';
  Config.ModelConfig.Provider := 'cpu';
  Config.ModelConfig.NumThreads := 1;
  Config.ModelConfig.Debug := False;

  WaveFilename := './sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav';

  Wave := SherpaOnnxReadWave(WaveFilename);

  { Duration and RealTimeFactor below divide by the sample rate and by the
    duration, so an empty wave or a zero sample rate must be rejected here.
    NOTE(review): presumably this is what SherpaOnnxReadWave yields when the
    file cannot be read - confirm against its implementation. }
  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
  begin
    WriteLn(Format('Failed to read any audio samples from %s', [WaveFilename]));
    Halt(1);
  end;

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  try
    Stream := Recognizer.CreateStream();
    try
      { Only feeding and decoding are timed; model loading is excluded. }
      Start := Now;

      Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);

      RecognitionResult := Recognizer.GetResult(Stream);

      Stop := Now;

      Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
      Duration := Length(Wave.Samples) / Wave.SampleRate;
      RealTimeFactor := Elapsed / Duration;

      WriteLn(RecognitionResult.ToString);
      WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
      WriteLn(Format('Elapsed %.3f s', [Elapsed]));
      WriteLn(Format('Wave duration %.3f s', [Duration]));
      WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
    finally
      { Free resources to avoid memory leaks, even if decoding raised.

        Note: You don't strictly need this for a simple script like this one.
        However, you have to do it in your own large/complex project.
      }
      FreeAndNil(Stream);
    end;
  finally
    FreeAndNil(Recognizer);
  end;
end.
|
||||
42
pascal-api-examples/non-streaming-asr/run-moonshine.sh
Executable file
42
pascal-api-examples/non-streaming-asr/run-moonshine.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the Moonshine tiny (int8) model if it is missing, compiles moonshine.pas
# with Free Pascal and runs the resulting binary.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# moonshine.pas and the model directory are referenced relative to this
# script's directory, so run from there regardless of the caller's cwd.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build only when none of the platform-specific C API libraries is installed.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./moonshine.pas

# Append the previous value only when it is non-empty: a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./moonshine
|
||||
@@ -1,3 +1,4 @@
|
||||
!run-*.sh
|
||||
vad_with_whisper
|
||||
vad_with_sense_voice
|
||||
vad_with_moonshine
|
||||
|
||||
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh
Executable file
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-moonshine.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library if it is not installed yet, downloads
# the silero VAD model, a test wave file and the Moonshine tiny (int8) model
# if they are missing, compiles vad_with_moonshine.pas with Free Pascal and
# runs the resulting binary.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The Pascal source and all downloaded files are referenced relative to this
# script's directory, so run from there regardless of the caller's cwd.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build only when none of the platform-specific C API libraries is installed.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_moonshine.pas

# Append the previous value only when it is non-empty: a trailing ':' would
# add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_moonshine
|
||||
@@ -0,0 +1,139 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming Moonshine model
|
||||
with silero VAD to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program vad_with_moonshine;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
SysUtils;
|
||||
|
||||
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  { Please don't change these unless you know the details }
  VadSampleRate = 16000;
  VadWindowSize = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  { Builds a silero-VAD based voice activity detector that reads its model
    from ./silero_vad.onnx and runs on the CPU with one thread. }
  Initialize(VadConfig);

  { General settings }
  VadConfig.SampleRate := VadSampleRate;
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  { silero VAD specific settings }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;
  VadConfig.SileroVad.WindowSize := VadWindowSize;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
|
||||
|
||||
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  { Builds a non-streaming recognizer for the Moonshine tiny (int8) model
    stored in ./sherpa-onnx-moonshine-tiny-en-int8, running on the CPU with
    one thread. }
  Initialize(RecognizerConfig);

  { Runtime settings }
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.Debug := False;
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-moonshine-tiny-en-int8/tokens.txt';

  { Moonshine model files }
  RecognizerConfig.ModelConfig.Moonshine.Preprocessor := './sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx';
  RecognizerConfig.ModelConfig.Moonshine.Encoder := './sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx';
  RecognizerConfig.ModelConfig.Moonshine.UncachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx';
  RecognizerConfig.ModelConfig.Moonshine.CachedDecoder := './sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx';

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
|
||||
|
||||
{ Pops every speech segment currently queued in ADetector, decodes it with
  ARecognizer and prints one line per segment in the form
  "<start> -- <end> <text>", with times in seconds.

  ASampleRate is the sample rate of the audio that was fed to the detector;
  it is used both for decoding and for converting sample counts to seconds. }
procedure DecodeQueuedSegments(ADetector: TSherpaOnnxVoiceActivityDetector;
  ARecognizer: TSherpaOnnxOfflineRecognizer; ASampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not ADetector.IsEmpty do
  begin
    SpeechSegment := ADetector.Front();
    ADetector.Pop();

    Stream := ARecognizer.CreateStream();
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, ASampleRate);
      ARecognizer.Decode(Stream);
      RecognitionResult := ARecognizer.GetResult(Stream);

      Start := SpeechSegment.Start / ASampleRate;
      Duration := Length(SpeechSegment.Samples) / ASampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    finally
      { Each segment gets its own stream; release it even if decoding raised. }
      FreeAndNil(Stream);
    end;
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Recognizer := nil;
  Vad := CreateVad();
  try
    Recognizer := CreateOfflineRecognizer();

    Wave := SherpaOnnxReadWave('./Obama.wav');
    if Wave.SampleRate <> Vad.Config.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]));

      { The finally block below still releases Vad and Recognizer. }
      Exit;
    end;

    { Feed the wave to the VAD one full window at a time and decode the
      segments it has completed so far after every window. }
    WindowSize := Vad.Config.SileroVad.WindowSize;
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Offset += WindowSize;

      DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
    end;

    { Flush the detector, then decode whatever it queued as a result. }
    Vad.Flush;
    DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
end.
|
||||
@@ -250,6 +250,14 @@ type
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
|
||||
end;
|
||||
|
||||
{ File paths of the four ONNX models that make up a non-streaming
  Moonshine model. }
TSherpaOnnxOfflineMoonshineModelConfig = record
  Preprocessor,
  Encoder,
  UncachedDecoder,
  CachedDecoder: AnsiString;
  { Returns a printable description listing all four paths. }
  function ToString: AnsiString;
end;
|
||||
|
||||
TSherpaOnnxOfflineTdnnModelConfig = record
|
||||
Model: AnsiString;
|
||||
function ToString: AnsiString;
|
||||
@@ -285,6 +293,7 @@ type
|
||||
BpeVocab: AnsiString;
|
||||
TeleSpeechCtc: AnsiString;
|
||||
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
|
||||
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
@@ -617,6 +626,12 @@ type
|
||||
Task: PAnsiChar;
|
||||
TailPaddings: cint32;
|
||||
end;
|
||||
{ PAnsiChar counterpart of TSherpaOnnxOfflineMoonshineModelConfig.
  NOTE(review): presumably mirrors the C API struct of the same name, in
  which case the field order must not change - confirm against the C header. }
SherpaOnnxOfflineMoonshineModelConfig = record
  Preprocessor,
  Encoder,
  UncachedDecoder,
  CachedDecoder: PAnsiChar;
end;
|
||||
SherpaOnnxOfflineTdnnModelConfig = record
|
||||
Model: PAnsiChar;
|
||||
end;
|
||||
@@ -644,6 +659,7 @@ type
|
||||
BpeVocab: PAnsiChar;
|
||||
TeleSpeechCtc: PAnsiChar;
|
||||
SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig;
|
||||
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
|
||||
end;
|
||||
|
||||
SherpaOnnxOfflineRecognizerConfig = record
|
||||
@@ -1312,6 +1328,16 @@ begin
|
||||
[Self.Encoder, Self.Decoder, Self.Language, Self.Task, Self.TailPaddings]);
|
||||
end;
|
||||
|
||||
{ Returns a printable description of this config that lists the four
  model file paths. }
function TSherpaOnnxOfflineMoonshineModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxOfflineMoonshineModelConfig(' +
    'Preprocessor := %s, ' +
    'Encoder := %s, ' +
    'UncachedDecoder := %s, ' +
    'CachedDecoder := %s)';
begin
  Result := Format(Layout,
    [Preprocessor, Encoder, UncachedDecoder, CachedDecoder]);
end;
|
||||
|
||||
function TSherpaOnnxOfflineTdnnModelConfig.ToString: AnsiString;
|
||||
begin
|
||||
Result := Format('TSherpaOnnxOfflineTdnnModelConfig(Model := %s)',
|
||||
@@ -1353,13 +1379,14 @@ begin
|
||||
'ModelingUnit := %s, ' +
|
||||
'BpeVocab := %s, ' +
|
||||
'TeleSpeechCtc := %s, ' +
|
||||
'SenseVoice := %s' +
|
||||
'SenseVoice := %s, ' +
|
||||
'Moonshine := %s' +
|
||||
')',
|
||||
[Self.Transducer.ToString, Self.Paraformer.ToString,
|
||||
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
|
||||
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
|
||||
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
|
||||
Self.TeleSpeechCtc, Self.SenseVoice.ToString
|
||||
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString
|
||||
]);
|
||||
end;
|
||||
|
||||
@@ -1407,7 +1434,6 @@ begin
|
||||
|
||||
C.ModelConfig.Tdnn.Model := PAnsiChar(Config.ModelConfig.Tdnn.Model);
|
||||
|
||||
|
||||
C.ModelConfig.Tokens := PAnsiChar(Config.ModelConfig.Tokens);
|
||||
C.ModelConfig.NumThreads := Config.ModelConfig.NumThreads;
|
||||
C.ModelConfig.Debug := Ord(Config.ModelConfig.Debug);
|
||||
@@ -1421,6 +1447,11 @@ begin
|
||||
C.ModelConfig.SenseVoice.Language := PAnsiChar(Config.ModelConfig.SenseVoice.Language);
|
||||
C.ModelConfig.SenseVoice.UseItn := Ord(Config.ModelConfig.SenseVoice.UseItn);
|
||||
|
||||
C.ModelConfig.Moonshine.Preprocessor := PAnsiChar(Config.ModelConfig.Moonshine.Preprocessor);
|
||||
C.ModelConfig.Moonshine.Encoder := PAnsiChar(Config.ModelConfig.Moonshine.Encoder);
|
||||
C.ModelConfig.Moonshine.UncachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.UncachedDecoder);
|
||||
C.ModelConfig.Moonshine.CachedDecoder := PAnsiChar(Config.ModelConfig.Moonshine.CachedDecoder);
|
||||
|
||||
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
|
||||
C.LMConfig.Scale := Config.LMConfig.Scale;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user