Pascal API for VAD (#1249)
This commit is contained in:
42
.github/workflows/pascal.yaml
vendored
42
.github/workflows/pascal.yaml
vendored
@@ -116,12 +116,54 @@ jobs:
|
|||||||
cp -v install/lib/*.dll ../pascal-api-examples/read-wav
|
cp -v install/lib/*.dll ../pascal-api-examples/read-wav
|
||||||
cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
|
cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr
|
||||||
cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
|
cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr
|
||||||
|
cp -v install/lib/*.dll ../pascal-api-examples/vad
|
||||||
|
cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr
|
||||||
|
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
|
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
|
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr
|
||||||
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
|
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr
|
||||||
|
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad
|
||||||
|
cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Run Pascal test (VAD + non-streaming ASR)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
|
||||||
|
|
||||||
|
cd ./pascal-api-examples
|
||||||
|
|
||||||
|
pushd vad-with-non-streaming-asr
|
||||||
|
time ./run-vad-with-whisper.sh
|
||||||
|
rm -rf sherpa-onnx-*
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
time ./run-vad-with-sense-voice.sh
|
||||||
|
rm -rf sherpa-onnx-*
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
ls -lh
|
||||||
|
|
||||||
|
popd
|
||||||
|
|
||||||
|
- name: Run Pascal test (VAD test)
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
|
||||||
|
|
||||||
|
cd ./pascal-api-examples
|
||||||
|
|
||||||
|
pushd vad
|
||||||
|
./run-circular-buffer.sh
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
time ./run-remove-silence.sh
|
||||||
|
echo "---"
|
||||||
|
|
||||||
|
ls -lh
|
||||||
|
|
||||||
|
popd
|
||||||
|
|
||||||
- name: Run Pascal test (Read wav test)
|
- name: Run Pascal test (Read wav test)
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
|||||||
|[read-wav](./read-wav)|It shows how to read a wave file.|
|
|[read-wav](./read-wav)|It shows how to read a wave file.|
|
||||||
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|
||||||
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
||||||
|
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
||||||
|
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
|
Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
|
||||||
Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
|
Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
|
||||||
Config.ModelConfig.Provider := 'cpu';
|
Config.ModelConfig.Provider := 'cpu';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
|
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
|
||||||
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
|
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
|
||||||
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
|
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
||||||
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
||||||
Config.ModelConfig.Provider := 'cpu';
|
Config.ModelConfig.Provider := 'cpu';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
||||||
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
||||||
Config.ModelConfig.Provider := 'cpu';
|
Config.ModelConfig.Provider := 'cpu';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
|
Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
|
||||||
Config.ModelConfig.SenseVoice.Language := 'auto';
|
Config.ModelConfig.SenseVoice.Language := 'auto';
|
||||||
Config.ModelConfig.SenseVoice.UseItn := False;
|
Config.ModelConfig.SenseVoice.UseItn := False;
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
|
Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
|
||||||
Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
|
Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
|
||||||
Config.ModelConfig.Provider := 'cpu';
|
Config.ModelConfig.Provider := 'cpu';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
|
Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
|
||||||
Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
|
Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
|
||||||
Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
|
Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ var
|
|||||||
Duration: Single;
|
Duration: Single;
|
||||||
RealTimeFactor: Single;
|
RealTimeFactor: Single;
|
||||||
begin
|
begin
|
||||||
|
Initialize(Config);
|
||||||
|
|
||||||
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
|
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
|
||||||
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
|
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
|
||||||
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
|
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
|
||||||
|
|||||||
3
pascal-api-examples/vad-with-non-streaming-asr/.gitignore
vendored
Normal file
3
pascal-api-examples/vad-with-non-streaming-asr/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
!run-*.sh
|
||||||
|
vad_with_whisper
|
||||||
|
vad_with_sense_voice
|
||||||
12
pascal-api-examples/vad-with-non-streaming-asr/README.md
Normal file
12
pascal-api-examples/vad-with-non-streaming-asr/README.md
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Introduction
|
||||||
|
|
||||||
|
|
||||||
|
This directory contains examples of how to use the VAD (voice activity detection)
|
||||||
|
with non-streaming speech recognition models.
|
||||||
|
|
||||||
|
|File| Description|
|
||||||
|
|---------|------------|
|
||||||
|
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
|
||||||
|
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
|
||||||
|
|
||||||
|
Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
|
||||||
48
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
Executable file
48
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
Executable file
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared libraries if they are not installed yet,
# downloads the silero VAD model, a test wave file and the SenseVoice model,
# then compiles and runs vad_with_sense_voice.pas with the Free Pascal compiler.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# All relative paths below (models, wave file, Pascal source, produced binary)
# are relative to this script's directory, so work from there no matter where
# the script was invoked from.
cd "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_sense_voice.pas

# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_sense_voice
|
||||||
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
Executable file
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
Executable file
@@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared libraries if they are not installed yet,
# downloads the silero VAD model, a test wave file and the Whisper tiny.en
# model, then compiles and runs vad_with_whisper.pas with the Free Pascal
# compiler.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
LIB_DIR="$SHERPA_ONNX_DIR/build/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# All relative paths below (models, wave file, Pascal source, produced binary)
# are relative to this script's directory, so work from there no matter where
# the script was invoked from.
cd "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./Obama.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_whisper.pas

# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_whisper
|
||||||
@@ -0,0 +1,137 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
|
||||||
|
{
|
||||||
|
This file shows how to use a non-streaming SenseVoice model
|
||||||
|
with silero VAD to decode files.
|
||||||
|
|
||||||
|
You can download the model files from
|
||||||
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||||
|
}
|
||||||
|
|
||||||
|
{ Named after this example (SenseVoice + silero VAD), not the Whisper one. }
program vad_with_sense_voice;
|
||||||
|
|
||||||
|
{$mode objfpc}
|
||||||
|
|
||||||
|
uses
|
||||||
|
sherpa_onnx,
|
||||||
|
SysUtils;
|
||||||
|
|
||||||
|
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  {Please don't change these two values unless you know the details}
  VadSampleRate = 16000;
  VadWindowSize = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  {
    Builds a voice activity detector that uses the silero VAD model found at
    ./silero_vad.onnx and runs on the CPU with a single thread.
  }
  Initialize(VadConfig);

  {General settings}
  VadConfig.SampleRate := VadSampleRate;
  VadConfig.Provider := 'cpu';
  VadConfig.NumThreads := 1;
  VadConfig.Debug := True;

  {silero VAD specific settings}
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := VadWindowSize;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
|
||||||
|
|
||||||
|
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  {
    Builds a non-streaming recognizer that uses the int8 SenseVoice model
    from ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 and runs on
    the CPU with a single thread.
  }
  Initialize(RecognizerConfig);

  {Settings shared by all model types}
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';

  {SenseVoice specific settings}
  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
|
||||||
|
|
||||||
|
{
  Decodes every speech segment currently queued in Vad with Recognizer and
  prints one line per segment: start time, end time and the recognized text.
  Times are the segment's sample positions divided by SampleRate.
  Each segment is removed from the queue before it is decoded.
}
procedure DecodeQueuedSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    {A fresh stream is created for, and released after, every segment}
    Stream := Recognizer.CreateStream();

    Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
    Recognizer.Decode(Stream);
    RecognitionResult := Recognizer.GetResult(Stream);

    Start := SpeechSegment.Start / SampleRate;
    Duration := Length(SpeechSegment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    FreeAndNil(Stream);
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> Vad.Config.SampleRate then
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [Vad.Config.SampleRate, Wave.SampleRate]))
  else
  begin
    {Feed the wave to the VAD one full window at a time; a trailing partial
     window is not fed.}
    WindowSize := Vad.Config.SileroVad.WindowSize;
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);

      DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
    end;

    {Flush the VAD, then decode whatever segments it still holds}
    Vad.Flush;
    DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
  end;

  {Reached on both paths, so nothing is leaked on a sample rate mismatch}
  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
|
||||||
@@ -0,0 +1,136 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
|
||||||
|
{
|
||||||
|
This file shows how to use a non-streaming Whisper model
|
||||||
|
with silero VAD to decode files.
|
||||||
|
|
||||||
|
You can download the model files from
|
||||||
|
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||||
|
}
|
||||||
|
|
||||||
|
program vad_with_whisper;
|
||||||
|
|
||||||
|
{$mode objfpc}
|
||||||
|
|
||||||
|
uses
|
||||||
|
sherpa_onnx,
|
||||||
|
SysUtils;
|
||||||
|
|
||||||
|
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  {Please don't change these two values unless you know the details}
  VadSampleRate = 16000;
  VadWindowSize = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  {
    Builds a voice activity detector that uses the silero VAD model found at
    ./silero_vad.onnx and runs on the CPU with a single thread.
  }
  Initialize(VadConfig);

  {General settings}
  VadConfig.SampleRate := VadSampleRate;
  VadConfig.Provider := 'cpu';
  VadConfig.NumThreads := 1;
  VadConfig.Debug := True;

  {silero VAD specific settings}
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := VadWindowSize;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
|
||||||
|
|
||||||
|
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  {
    Builds a non-streaming recognizer that uses the int8 Whisper tiny.en
    encoder/decoder from ./sherpa-onnx-whisper-tiny.en and runs on the CPU
    with a single thread.
  }
  Initialize(RecognizerConfig);

  {Settings shared by all model types}
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';

  {Whisper specific settings}
  RecognizerConfig.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
  RecognizerConfig.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
|
||||||
|
|
||||||
|
{
  Decodes every speech segment currently queued in Vad with Recognizer and
  prints one line per segment: start time, end time and the recognized text.
  Times are the segment's sample positions divided by SampleRate.
  Each segment is removed from the queue before it is decoded.
}
procedure DecodeQueuedSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    {A fresh stream is created for, and released after, every segment}
    Stream := Recognizer.CreateStream();

    Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
    Recognizer.Decode(Stream);
    RecognitionResult := Recognizer.GetResult(Stream);

    Start := SpeechSegment.Start / SampleRate;
    Duration := Length(SpeechSegment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    FreeAndNil(Stream);
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./Obama.wav');
  if Wave.SampleRate <> Vad.Config.SampleRate then
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [Vad.Config.SampleRate, Wave.SampleRate]))
  else
  begin
    {Feed the wave to the VAD one full window at a time; a trailing partial
     window is not fed.}
    WindowSize := Vad.Config.SileroVad.WindowSize;
    Offset := 0;
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);

      DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
    end;

    {Flush the VAD, then decode whatever segments it still holds}
    Vad.Flush;
    DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
  end;

  {Reached on both paths, so nothing is leaked on a sample rate mismatch}
  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
|
||||||
3
pascal-api-examples/vad/.gitignore
vendored
Normal file
3
pascal-api-examples/vad/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
!run*.sh
|
||||||
|
circular_buffer
|
||||||
|
remove_silence
|
||||||
11
pascal-api-examples/vad/README.md
Normal file
11
pascal-api-examples/vad/README.md
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Introduction
|
||||||
|
|
||||||
|
|
||||||
|
This directory contains examples of how to use the VAD (voice activity detection)
|
||||||
|
APIs.
|
||||||
|
|
||||||
|
|File| Description|
|
||||||
|
|---------|------------|
|
||||||
|
|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.|
|
||||||
|
|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.|
|
||||||
|
|
||||||
106
pascal-api-examples/vad/circular_buffer.pas
Normal file
106
pascal-api-examples/vad/circular_buffer.pas
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
program circular_buffer;
{
This file shows how to use the CircularBuffer API of sherpa-onnx

It walks through Push/Get/Pop/Reset and checks Size and Head after each step
with assertions, so a run that prints nothing has passed.
}

{$mode objfpc}
{$ASSERTIONS ON}

uses
  sherpa_onnx;

var
  Buffer: TSherpaOnnxCircularBuffer;
  Samples: TSherpaOnnxSamplesArray;
begin
  {The initial capacity is 5. It will be resized automatically if needed.}
  Buffer := TSherpaOnnxCircularBuffer.Create(5);
  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 0);
  Buffer.Push([0, 10, 20]);

  {Push() changes Size. Head is not changed.}
  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 0);

  Samples := Buffer.Get(0, 1);
  Assert(Length(Samples) = 1);
  Assert(Samples[0] = 0);

  { Get() does not change Size or Head}
  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 0);

  Samples := Buffer.Get(0, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 0);
  Assert(Samples[1] = 10);

  { The buffer will be resized since its initial capacity is 5 but we have
    pushed 7 elements into it.

    No data is lost during the resize.
  }
  Buffer.Push([30, 40, 50, 60]);

  Assert(Buffer.Size = 7); {There are now 7 elements}
  Assert(Buffer.Head = 0);

  {Remove the first 4 elements}
  Buffer.Pop(4);

  Assert(Buffer.Size = 3); {There are only 3 elements left}
  Assert(Buffer.Head = 4);

  Samples := Buffer.Get(Buffer.Head, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 40);
  Assert(Samples[1] = 50);

  Buffer.Pop(1);

  Assert(Buffer.Size = 2); {There are only 2 elements left}
  Assert(Buffer.Head = 5);

  Samples := Buffer.Get(Buffer.Head, 2);
  Assert(Length(Samples) = 2);
  Assert(Samples[0] = 50);
  Assert(Samples[1] = 60);

  Buffer.Pop(2);
  Assert(Buffer.Size = 0); {There are no elements left}
  Assert(Buffer.Head = 7);

  Buffer.Push([100, 200, 300, 400, 500]);
  Assert(Buffer.Size = 5);
  Assert(Buffer.Head = 7);

  Buffer.Pop(4);
  Assert(Buffer.Size = 1);

  {Head can be larger than the Capacity!
   This is what circular means. It points to Buffer.Head mod Capacity.
  }
  Assert(Buffer.Head = 11);
  Buffer.Push([600, 700]);

  Assert(Buffer.Size = 3);
  Assert(Buffer.Head = 11);

  Samples := Buffer.Get(Buffer.Head, 3);
  Assert(Length(Samples) = 3);
  Assert(Samples[0] = 500);
  Assert(Samples[1] = 600);
  Assert(Samples[2] = 700);

  Buffer.Pop(3);
  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 14);

  {Reset() empties the buffer and moves Head back to 0}
  Buffer.Reset();

  Assert(Buffer.Size = 0);
  Assert(Buffer.Head = 0);

  {Release the buffer; SysUtils is not used here, so call Free directly.
   NOTE(review): assumes TSherpaOnnxCircularBuffer is a class - confirm in
   sherpa_onnx.pas.}
  Buffer.Free;
end.
|
||||||
|
|
||||||
115
pascal-api-examples/vad/remove_silence.pas
Normal file
115
pascal-api-examples/vad/remove_silence.pas
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||||
|
{
|
||||||
|
This file shows how to use the VAD API from sherpa-onnx
|
||||||
|
to remove silences from a wave file.
|
||||||
|
}
|
||||||
|
program main;

{$mode delphi}

uses
  sherpa_onnx,
  SysUtils;

var
  Wave: TSherpaOnnxWave;

  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;

  SampleRate: Integer;

  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{
  Moves every speech segment currently queued in Vad to the end of
  AllSpeechSegment and prints the start and end time of each one.
  Times are the segment's sample positions divided by SampleRate.
}
procedure CollectQueuedSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    Segment := Vad.Front();
    Vad.Pop();

    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    Start := Segment.Start / SampleRate;
    Duration := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  end;
end;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));

    {Nothing has been allocated yet, so leaving here leaks nothing}
    Exit;
  end;

  WindowSize := 512; {Please don't change it unless you know the details}
  Initialize(Config);

  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.25;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads:= 1;
  Config.Debug:= True;
  Config.Provider:= 'cpu';
  Config.SampleRate := SampleRate;

  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);

  AllSpeechSegment := nil;
  AllSamples := nil;

  {Feed the wave to the VAD one full window at a time; a trailing partial
   window is not fed.}
  Offset := 0;
  while Offset + WindowSize <= Length(Wave.Samples) do
  begin
    Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
    Inc(Offset, WindowSize);

    CollectQueuedSegments;
  end;

  {Flush the VAD, then collect whatever segments it still holds}
  Vad.Flush;
  CollectQueuedSegments;

  {First pass: total number of speech samples, to size AllSamples once}
  N := 0;
  for SpeechSegment in AllSpeechSegment do
    Inc(N, Length(SpeechSegment.Samples));

  SetLength(AllSamples, N);

  {Second pass: concatenate the samples of all segments in order}
  N := 0;
  for SpeechSegment in AllSpeechSegment do
  begin
    for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
    begin
      AllSamples[N] := SpeechSegment.Samples[I];
      Inc(N);
    end;
  end;

  SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
  WriteLn('Saved to ./lei-jun-test-no-silence.wav');

  FreeAndNil(Vad);
end.
|
||||||
34
pascal-api-examples/vad/run-circular-buffer.sh
Executable file
34
pascal-api-examples/vad/run-circular-buffer.sh
Executable file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library (only if it is not installed yet),
# compiles ./circular_buffer.pas with the Free Pascal compiler, and runs the
# resulting executable.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a checkout path containing spaces still works.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The Pascal source and the produced binary are addressed relative to this
# script, so make the script's own directory the working directory.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build and install the C API only when none of the platform-specific shared
# libraries (macOS / Linux / Windows) is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./circular_buffer.pas

# Let the dynamic loader find the shared library at run time (Linux / macOS).
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./circular_buffer
|
||||||
42
pascal-api-examples/vad/run-remove-silence.sh
Executable file
42
pascal-api-examples/vad/run-remove-silence.sh
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared library (only if it is not installed yet),
# downloads the silero VAD model and a test wave file when they are missing,
# compiles ./remove_silence.pas with the Free Pascal compiler, and runs it.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a checkout path containing spaces still works.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The Pascal source, the downloaded files and the produced binary are all
# addressed relative to this script, so run from the script's own directory.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build and install the C API only when none of the platform-specific shared
# libraries (macOS / Linux / Windows) is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# error page under the model / wave file name.
if [[ ! -f ./silero_vad.onnx ]]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

# -Fu: unit search path (sherpa_onnx.pas); -Fl: library search path.
fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./remove_silence.pas

# Let the dynamic loader find the shared library at run time (Linux / macOS).
export LD_LIBRARY_PATH="$LIB_DIR:$LD_LIBRARY_PATH"
export DYLD_LIBRARY_PATH="$LIB_DIR:$DYLD_LIBRARY_PATH"

./remove_silence
|
||||||
@@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) {
|
|||||||
"capacity to: %d",
|
"capacity to: %d",
|
||||||
n, size, n + size, capacity, new_capacity);
|
n, size, n + size, capacity, new_capacity);
|
||||||
Resize(new_capacity);
|
Resize(new_capacity);
|
||||||
|
|
||||||
|
capacity = new_capacity;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t start = tail_ % capacity;
|
int32_t start = tail_ % capacity;
|
||||||
|
|||||||
@@ -2,9 +2,11 @@
|
|||||||
|
|
||||||
unit sherpa_onnx;
|
unit sherpa_onnx;
|
||||||
|
|
||||||
{$mode objfpc}
|
{$IFDEF FPC}
|
||||||
|
{$mode objfpc}
|
||||||
|
{$modeSwitch advancedRecords} { to support records with methods }
|
||||||
|
{$ENDIF}
|
||||||
|
|
||||||
{$modeSwitch advancedRecords} { to support records with methods }
|
|
||||||
(* {$LongStrings ON} *)
|
(* {$LongStrings ON} *)
|
||||||
|
|
||||||
interface
|
interface
|
||||||
@@ -45,18 +47,21 @@ type
|
|||||||
ModelingUnit: AnsiString;
|
ModelingUnit: AnsiString;
|
||||||
BpeVocab: AnsiString;
|
BpeVocab: AnsiString;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxFeatureConfig = record
|
TSherpaOnnxFeatureConfig = record
|
||||||
SampleRate: Integer;
|
SampleRate: Integer;
|
||||||
FeatureDim: Integer;
|
FeatureDim: Integer;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOnlineCtcFstDecoderConfig = record
|
TSherpaOnnxOnlineCtcFstDecoderConfig = record
|
||||||
Graph: AnsiString;
|
Graph: AnsiString;
|
||||||
MaxActive: Integer;
|
MaxActive: Integer;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOnlineRecognizerConfig = record
|
TSherpaOnnxOnlineRecognizerConfig = record
|
||||||
@@ -75,6 +80,7 @@ type
|
|||||||
RuleFars: AnsiString;
|
RuleFars: AnsiString;
|
||||||
BlankPenalty: Single;
|
BlankPenalty: Single;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOnlineRecognizerResult = record
|
TSherpaOnnxOnlineRecognizerResult = record
|
||||||
@@ -97,6 +103,7 @@ type
|
|||||||
TSherpaOnnxOnlineRecognizer = class
|
TSherpaOnnxOnlineRecognizer = class
|
||||||
private
|
private
|
||||||
Handle: Pointer;
|
Handle: Pointer;
|
||||||
|
_Config: TSherpaOnnxOnlineRecognizerConfig;
|
||||||
public
|
public
|
||||||
constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
|
constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig);
|
||||||
destructor Destroy; override;
|
destructor Destroy; override;
|
||||||
@@ -108,6 +115,7 @@ type
|
|||||||
procedure Reset(Stream: TSherpaOnnxOnlineStream);
|
procedure Reset(Stream: TSherpaOnnxOnlineStream);
|
||||||
function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
|
function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean;
|
||||||
function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
|
function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult;
|
||||||
|
property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOfflineTransducerModelConfig = record
|
TSherpaOnnxOfflineTransducerModelConfig = record
|
||||||
@@ -134,6 +142,7 @@ type
|
|||||||
Task: AnsiString;
|
Task: AnsiString;
|
||||||
TailPaddings: Integer;
|
TailPaddings: Integer;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOfflineTdnnModelConfig = record
|
TSherpaOnnxOfflineTdnnModelConfig = record
|
||||||
@@ -145,12 +154,14 @@ type
|
|||||||
Model: AnsiString;
|
Model: AnsiString;
|
||||||
Scale: Single;
|
Scale: Single;
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
TSherpaOnnxOfflineSenseVoiceModelConfig = record
|
TSherpaOnnxOfflineSenseVoiceModelConfig = record
|
||||||
Model: AnsiString;
|
Model: AnsiString;
|
||||||
Language: AnsiString;
|
Language: AnsiString;
|
||||||
UseItn: Boolean;
|
UseItn: Boolean;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
@@ -169,6 +180,7 @@ type
|
|||||||
BpeVocab: AnsiString;
|
BpeVocab: AnsiString;
|
||||||
TeleSpeechCtc: AnsiString;
|
TeleSpeechCtc: AnsiString;
|
||||||
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
|
SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
@@ -183,6 +195,7 @@ type
|
|||||||
RuleFsts: AnsiString;
|
RuleFsts: AnsiString;
|
||||||
RuleFars: AnsiString;
|
RuleFars: AnsiString;
|
||||||
BlankPenalty: Single;
|
BlankPenalty: Single;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
|
||||||
function ToString: AnsiString;
|
function ToString: AnsiString;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
@@ -205,18 +218,83 @@ type
|
|||||||
TSherpaOnnxOfflineRecognizer = class
|
TSherpaOnnxOfflineRecognizer = class
|
||||||
private
|
private
|
||||||
Handle: Pointer;
|
Handle: Pointer;
|
||||||
|
_Config: TSherpaOnnxOfflineRecognizerConfig;
|
||||||
public
|
public
|
||||||
constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
|
constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig);
|
||||||
destructor Destroy; override;
|
destructor Destroy; override;
|
||||||
function CreateStream: TSherpaOnnxOfflineStream;
|
function CreateStream: TSherpaOnnxOfflineStream;
|
||||||
procedure Decode(Stream: TSherpaOnnxOfflineStream);
|
procedure Decode(Stream: TSherpaOnnxOfflineStream);
|
||||||
function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
|
function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult;
|
||||||
|
property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{ It supports reading a single channel wave with 16-bit encoded samples.
|
TSherpaOnnxSileroVadModelConfig = record
|
||||||
Samples are normalized to the range [-1, 1].
|
Model: AnsiString;
|
||||||
}
|
Threshold: Single;
|
||||||
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
|
MinSilenceDuration: Single;
|
||||||
|
MinSpeechDuration: Single;
|
||||||
|
WindowSize: Integer;
|
||||||
|
function ToString: AnsiString;
|
||||||
|
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
|
||||||
|
end;
|
||||||
|
|
||||||
|
{ Configuration for TSherpaOnnxVoiceActivityDetector.

  The Initialize operator of this record sets SampleRate to 16000,
  NumThreads to 1, Provider to 'cpu' and Debug to False. }
TSherpaOnnxVadModelConfig = record
  { Settings of the silero VAD model. }
  SileroVad: TSherpaOnnxSileroVadModelConfig;
  SampleRate: Integer;
  NumThreads: Integer;
  Provider: AnsiString;
  Debug: Boolean;

  { Returns a human readable description of all fields. }
  function ToString: AnsiString;

  class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
end;
|
||||||
|
|
||||||
|
{ Named dynamic array of samples; it is the return type of
  TSherpaOnnxCircularBuffer.Get. }
TSherpaOnnxSamplesArray = array of Single;
|
||||||
|
|
||||||
|
{ Object wrapper around the circular buffer of the sherpa-onnx C API.
  Every method forwards to the matching SherpaOnnxCircularBuffer* function. }
TSherpaOnnxCircularBuffer = class
private
  { Opaque handle returned by SherpaOnnxCreateCircularBuffer. }
  Handle: Pointer;
public
  constructor Create(Capacity: Integer);
  destructor Destroy; override;

  { Appends all elements of Samples to the buffer. }
  procedure Push(Samples: array of Single);

  { Returns a Pascal-owned copy of N samples starting at StartIndex. }
  function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;

  { Forwards N to SherpaOnnxCircularBufferPop. }
  procedure Pop(N: Integer);

  procedure Reset;
  function Size: Integer;
  function Head: Integer;
end;
|
||||||
|
|
||||||
|
{ A speech segment returned by TSherpaOnnxVoiceActivityDetector.Front. }
TSherpaOnnxSpeechSegment = record
  { Samples of the segment. Declared with the named array type so that the
    field is type-identical to the result of TSherpaOnnxCircularBuffer.Get
    and to variables of type TSherpaOnnxSamplesArray. }
  Samples: TSherpaOnnxSamplesArray;

  { Start position of the segment as reported by the C API.
    NOTE(review): the remove_silence example divides this by the sample
    rate, so it looks like a sample index rather than a time - confirm. }
  Start: Integer;
end;
|
||||||
|
|
||||||
|
{ Object wrapper around the voice activity detector of the sherpa-onnx C API.
  Every method forwards to the matching SherpaOnnxVoiceActivityDetector*
  function. }
TSherpaOnnxVoiceActivityDetector = class
private
  { Opaque handle returned by SherpaOnnxCreateVoiceActivityDetector. }
  Handle: Pointer;

  { Copy of the configuration passed to Create. }
  _Config: TSherpaOnnxVadModelConfig;
public
  constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
  destructor Destroy; override;

  { Feeds all elements of Samples to the detector. }
  procedure AcceptWaveform(Samples: array of Single); overload;

  { Feeds the N elements of Samples that start at index Offset. }
  procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload;

  function IsEmpty: Boolean;
  function IsDetected: Boolean;
  procedure Pop;
  procedure Clear;

  { Returns a Pascal-owned copy of the segment reported by the C API.
    It does not call Pop. }
  function Front: TSherpaOnnxSpeechSegment;

  procedure Reset;
  procedure Flush;

  property Config: TSherpaOnnxVadModelConfig Read _Config;
end;
|
||||||
|
|
||||||
|
{ It supports reading a single channel wave with 16-bit encoded samples.
|
||||||
|
Samples are normalized to the range [-1, 1].
|
||||||
|
}
|
||||||
|
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
|
||||||
|
|
||||||
|
{ Writes Samples to the wave file Filename using the given sample rate.
  Returns True when the underlying C function reports 1. }
function SherpaOnnxWriteWave(
  Filename: AnsiString;
  Samples: array of Single;
  SampleRate: Integer): Boolean;
|
||||||
|
|
||||||
implementation
|
implementation
|
||||||
|
|
||||||
@@ -294,15 +372,15 @@ type
|
|||||||
DecodingMethod: PAnsiChar;
|
DecodingMethod: PAnsiChar;
|
||||||
MaxActivePaths: cint32;
|
MaxActivePaths: cint32;
|
||||||
EnableEndpoint: cint32;
|
EnableEndpoint: cint32;
|
||||||
Rule1MinTrailingSilence: Single;
|
Rule1MinTrailingSilence: cfloat;
|
||||||
Rule2MinTrailingSilence: Single;
|
Rule2MinTrailingSilence: cfloat;
|
||||||
Rule3MinUtteranceLength: Single;
|
Rule3MinUtteranceLength: cfloat;
|
||||||
HotwordsFile: PAnsiChar;
|
HotwordsFile: PAnsiChar;
|
||||||
HotwordsScore: Single;
|
HotwordsScore: cfloat;
|
||||||
CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
|
CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig;
|
||||||
RuleFsts: PAnsiChar;
|
RuleFsts: PAnsiChar;
|
||||||
RuleFars: PAnsiChar;
|
RuleFars: PAnsiChar;
|
||||||
BlankPenalty: Single;
|
BlankPenalty: cfloat;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
|
PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig;
|
||||||
@@ -330,7 +408,7 @@ type
|
|||||||
end;
|
end;
|
||||||
SherpaOnnxOfflineLMConfig = record
|
SherpaOnnxOfflineLMConfig = record
|
||||||
Model: PAnsiChar;
|
Model: PAnsiChar;
|
||||||
Scale: Single;
|
Scale: cfloat;
|
||||||
end;
|
end;
|
||||||
SherpaOnnxOfflineSenseVoiceModelConfig = record
|
SherpaOnnxOfflineSenseVoiceModelConfig = record
|
||||||
Model: PAnsiChar;
|
Model: PAnsiChar;
|
||||||
@@ -361,14 +439,100 @@ type
|
|||||||
DecodingMethod: PAnsiChar;
|
DecodingMethod: PAnsiChar;
|
||||||
MaxActivePaths: cint32;
|
MaxActivePaths: cint32;
|
||||||
HotwordsFile: PAnsiChar;
|
HotwordsFile: PAnsiChar;
|
||||||
HotwordsScore: Single;
|
HotwordsScore: cfloat;
|
||||||
RuleFsts: PAnsiChar;
|
RuleFsts: PAnsiChar;
|
||||||
RuleFars: PAnsiChar;
|
RuleFars: PAnsiChar;
|
||||||
BlankPenalty: Single;
|
BlankPenalty: cfloat;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
|
PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig;
|
||||||
|
|
||||||
|
{ C-side counterpart of TSherpaOnnxSileroVadModelConfig: strings become
  PAnsiChar and numbers use the ctypes aliases. }
SherpaOnnxSileroVadModelConfig = record
  Model: PAnsiChar;
  Threshold: cfloat;
  MinSilenceDuration: cfloat;
  MinSpeechDuration: cfloat;
  WindowSize: cint32;
end;

{ C-side counterpart of TSherpaOnnxVadModelConfig. A pointer to this record
  is what SherpaOnnxCreateVoiceActivityDetector receives. }
SherpaOnnxVadModelConfig = record
  SileroVad: SherpaOnnxSileroVadModelConfig;
  SampleRate: cint32;
  NumThreads: cint32;
  Provider: PAnsiChar;
  Debug: cint32;  { Boolean passed as an integer via Ord() }
end;

PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig;
|
||||||
|
|
||||||
|
{ Speech segment as returned by SherpaOnnxVoiceActivityDetectorFront.
  Samples points to N values; the segment is released with
  SherpaOnnxDestroySpeechSegment. }
SherpaOnnxSpeechSegment = record
  Start: cint32;
  Samples: pcfloat;
  N: cint32;
end;

PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment;
|
||||||
|
|
||||||
|
{ Bindings for the voice activity detector functions exported by the
  sherpa-onnx C API library. "Vad" is always the opaque handle returned by
  SherpaOnnxCreateVoiceActivityDetector. }

function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
  BufferSizeInSeconds: cfloat): Pointer; cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
  Samples: pcfloat; N: cint32); cdecl;
  external SherpaOnnxLibName;

function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
  external SherpaOnnxLibName;

function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

{ The returned segment must be released with SherpaOnnxDestroySpeechSegment. }
function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
  external SherpaOnnxLibName;

{ Reset and Flush are called with the detector handle, not with a speech
  segment, so the parameter is the same untyped handle used by every other
  detector function. }
procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;

procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
  external SherpaOnnxLibName;
|
||||||
|
|
||||||
|
{ Bindings for the circular buffer functions exported by the sherpa-onnx
  C API library. "Buffer" is always the opaque handle returned by
  SherpaOnnxCreateCircularBuffer. }

function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer;
  cdecl; external SherpaOnnxLibName;

procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer);
  cdecl; external SherpaOnnxLibName;

procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32);
  cdecl; external SherpaOnnxLibName;

{ The returned pointer is released with SherpaOnnxCircularBufferFree. }
function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat;
  cdecl; external SherpaOnnxLibName;

procedure SherpaOnnxCircularBufferFree(P: pcfloat);
  cdecl; external SherpaOnnxLibName;

procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32);
  cdecl; external SherpaOnnxLibName;

function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32;
  cdecl; external SherpaOnnxLibName;

function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32;
  cdecl; external SherpaOnnxLibName;

procedure SherpaOnnxCircularBufferReset(Buffer: Pointer);
  cdecl; external SherpaOnnxLibName;
|
||||||
|
|
||||||
function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
|
function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
|
||||||
external SherpaOnnxLibName;
|
external SherpaOnnxLibName;
|
||||||
|
|
||||||
@@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
|
|||||||
function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
|
function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
|
||||||
external SherpaOnnxLibName name 'SherpaOnnxReadWave';
|
external SherpaOnnxLibName name 'SherpaOnnxReadWave';
|
||||||
|
|
||||||
|
{ Binding for the exported C function 'SherpaOnnxWriteWave'. It is imported
  under a different Pascal name because SherpaOnnxWriteWave is the name of
  the public Pascal function of this unit. }
function SherpaOnnxWriteWaveWrapper(
  Samples: pcfloat;
  N: cint32;
  SampleRate: cint32;
  Filename: PAnsiChar): cint32;
  cdecl; external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
|
||||||
|
|
||||||
procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
|
procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
|
||||||
external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
|
external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
|
||||||
|
|
||||||
|
{ Writes Samples to the wave file Filename with the given sample rate.
  Returns True when the C function returns 1. }
function SherpaOnnxWriteWave(Filename: AnsiString;
  Samples: array of Single; SampleRate: Integer): Boolean;
var
  Status: cint32;
begin
  Status := SherpaOnnxWriteWaveWrapper(
    pcfloat(Samples),
    Length(Samples),
    SampleRate,
    PAnsiChar(Filename));

  Result := (Status = 1);
end;
|
||||||
|
|
||||||
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
|
function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
|
||||||
var
|
var
|
||||||
PFilename: PAnsiChar;
|
PFilename: PAnsiChar;
|
||||||
@@ -611,6 +786,7 @@ begin
|
|||||||
C.BlankPenalty := Config.BlankPenalty;
|
C.BlankPenalty := Config.BlankPenalty;
|
||||||
|
|
||||||
Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
|
Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
|
||||||
|
Self._Config := Config;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
destructor TSherpaOnnxOnlineRecognizer.Destroy;
|
destructor TSherpaOnnxOnlineRecognizer.Destroy;
|
||||||
@@ -877,6 +1053,7 @@ begin
|
|||||||
C.BlankPenalty := Config.BlankPenalty;
|
C.BlankPenalty := Config.BlankPenalty;
|
||||||
|
|
||||||
Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
|
Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
|
||||||
|
Self._Config := Config;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
destructor TSherpaOnnxOfflineRecognizer.Destroy;
|
destructor TSherpaOnnxOfflineRecognizer.Destroy;
|
||||||
@@ -984,5 +1161,255 @@ begin
|
|||||||
[Self.Text, TokensStr, TimestampStr]);
|
[Self.Text, TokensStr, TimestampStr]);
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
{ Returns a one-line, human readable description of every field. }
function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxSileroVadModelConfig(' +
    'Model := %s, ' +
    'Threshold := %.2f, ' +
    'MinSilenceDuration := %.2f, ' +
    'MinSpeechDuration := %.2f, ' +
    'WindowSize := %d' +
    ')';
begin
  Result := Format(Layout, [
    Self.Model,
    Self.Threshold,
    Self.MinSilenceDuration,
    Self.MinSpeechDuration,
    Self.WindowSize]);
end;
|
||||||
|
|
||||||
|
{ Management operator: gives a freshly created record its default values.
  Model is not assigned here. }
class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
begin
  Dest.WindowSize := 512;
  Dest.MinSpeechDuration := 0.25;
  Dest.MinSilenceDuration := 0.5;
  Dest.Threshold := 0.5;
end;
|
||||||
|
|
||||||
|
{ Returns a one-line, human readable description of every field; the nested
  silero VAD settings are rendered with their own ToString. }
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
const
  Layout =
    'TSherpaOnnxVadModelConfig(' +
    'SileroVad := %s, ' +
    'SampleRate := %d, ' +
    'NumThreads := %d, ' +
    'Provider := %s, ' +
    'Debug := %s' +
    ')';
begin
  Result := Format(Layout, [
    Self.SileroVad.ToString,
    Self.SampleRate,
    Self.NumThreads,
    Self.Provider,
    Self.Debug.ToString]);
end;
|
||||||
|
|
||||||
|
{ Management operator: gives a freshly created record its default values.
  SileroVad is not assigned here. }
class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
begin
  Dest.Debug := False;
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.SampleRate := 16000;
end;
|
||||||
|
|
||||||
|
{ Management operator: defaults to a 16000 sample rate and 80-dimensional
  features. }
class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
begin
  Dest.FeatureDim := 80;
  Dest.SampleRate := 16000;
end;
|
||||||
|
|
||||||
|
{ Management operator: sets the default MaxActive value. Graph is not
  assigned here. }
class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
const
  DefaultMaxActive = 3000;
begin
  Dest.MaxActive := DefaultMaxActive;
end;
|
||||||
|
|
||||||
|
{ Management operator: default decoding and endpointing settings of the
  streaming recognizer.
  NOTE(review): unlike the offline recognizer config, no default is assigned
  to a MaxActivePaths field here - confirm whether that is intended. }
class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
begin
  { Decoding }
  Dest.DecodingMethod := 'greedy_search';
  Dest.HotwordsScore := 1.5;
  Dest.BlankPenalty := 0;

  { Endpointing }
  Dest.EnableEndpoint := False;
  Dest.Rule1MinTrailingSilence := 2.4;
  Dest.Rule2MinTrailingSilence := 1.2;
  Dest.Rule3MinUtteranceLength := 20;
end;
|
||||||
|
|
||||||
|
{ Management operator: one thread, the 'cpu' provider, debug output off. }
class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
begin
  Dest.Debug := False;
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
end;
|
||||||
|
|
||||||
|
{ Management operator: Task defaults to 'transcribe' and TailPaddings
  to -1. }
class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
begin
  Dest.TailPaddings := -1;
  Dest.Task := 'transcribe';
end;
|
||||||
|
|
||||||
|
{ Management operator: sets the default Scale. Model is not assigned here. }
class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
const
  DefaultScale = 1.0;
begin
  Dest.Scale := DefaultScale;
end;
|
||||||
|
|
||||||
|
{ Management operator: UseItn is switched on by default. Model and Language
  are not assigned here. }
class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
const
  DefaultUseItn = True;
begin
  Dest.UseItn := DefaultUseItn;
end;
|
||||||
|
|
||||||
|
{ Management operator: one thread, the 'cpu' provider, debug output off. }
class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
begin
  Dest.Provider := 'cpu';
  Dest.NumThreads := 1;
  Dest.Debug := False;
end;
|
||||||
|
|
||||||
|
{ Management operator: default decoding settings of the non-streaming
  recognizer. }
class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
begin
  Dest.BlankPenalty := 0;
  Dest.HotwordsScore := 1.5;
  Dest.MaxActivePaths := 4;
  Dest.DecodingMethod := 'greedy_search';
end;
|
||||||
|
|
||||||
|
{ Creates the underlying C circular buffer with the requested capacity and
  stores its handle. }
constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
var
  NewHandle: Pointer;
begin
  NewHandle := SherpaOnnxCreateCircularBuffer(Capacity);
  Handle := NewHandle;
end;
|
||||||
|
|
||||||
|
{ Releases the underlying C circular buffer. }
destructor TSherpaOnnxCircularBuffer.Destroy;
begin
  { Handle is nil when the constructor did not get as far as creating the
    C object (for example because it raised); nothing to release then. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyCircularBuffer(Self.Handle);
    Self.Handle := nil;
  end;

  { Always chain to the ancestor destructor. }
  inherited Destroy;
end;
|
||||||
|
|
||||||
|
{ Appends every element of Samples to the buffer. }
procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
var
  Count: Integer;
begin
  Count := Length(Samples);
  SherpaOnnxCircularBufferPush(Handle, pcfloat(Samples), Count);
end;
|
||||||
|
|
||||||
|
{ Returns a Pascal-owned copy of N samples starting at StartIndex.

  The memory handed out by the C API is always released, even when
  allocating the result array raises an exception. An empty array is
  returned when the C API returns nil or when N is not positive. }
function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
var
  P: pcfloat;
begin
  Result := nil;

  P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);
  if P = nil then
    Exit;

  try
    if N > 0 then
    begin
      SetLength(Result, N);
      { cfloat and Single have the same size, so one block copy replaces the
        element-by-element loop. }
      Move(P^, Result[0], N * SizeOf(Single));
    end;
  finally
    SherpaOnnxCircularBufferFree(P);
  end;
end;
|
||||||
|
|
||||||
|
{ Forwards N to SherpaOnnxCircularBufferPop for this buffer. }
procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
begin
  SherpaOnnxCircularBufferPop(Handle, N);
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxCircularBufferReset for this buffer. }
procedure TSherpaOnnxCircularBuffer.Reset;
begin
  SherpaOnnxCircularBufferReset(Handle);
end;
|
||||||
|
|
||||||
|
{ Returns the value reported by SherpaOnnxCircularBufferSize. }
function TSherpaOnnxCircularBuffer.Size: Integer;
begin
  Exit(SherpaOnnxCircularBufferSize(Handle));
end;
|
||||||
|
|
||||||
|
{ Returns the value reported by SherpaOnnxCircularBufferHead. }
function TSherpaOnnxCircularBuffer.Head: Integer;
begin
  Exit(SherpaOnnxCircularBufferHead(Handle));
end;
|
||||||
|
|
||||||
|
{ Creates the underlying C voice activity detector.

  Config is copied into the Config property and translated into the C
  record layout. BufferSizeInSeconds is forwarded unchanged to the C API. }
constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
var
  C: SherpaOnnxVadModelConfig;
begin
  Self._Config := Config;

  { The C record has no managed fields, so Initialize() would leave it
    untouched. Zero it explicitly so that no field or padding byte is ever
    handed to the C library uninitialized. }
  FillChar(C, SizeOf(C), 0);

  { The PAnsiChar values point into the strings of Config, which stay alive
    for the duration of this call. }
  C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
  C.SileroVad.Threshold := Config.SileroVad.Threshold;
  C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
  C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
  C.SileroVad.WindowSize := Config.SileroVad.WindowSize;

  C.SampleRate := Config.SampleRate;
  C.NumThreads := Config.NumThreads;
  C.Provider := PAnsiChar(Config.Provider);
  C.Debug := Ord(Config.Debug);

  Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@C, BufferSizeInSeconds);
end;
|
||||||
|
|
||||||
|
{ Releases the underlying C voice activity detector. }
destructor TSherpaOnnxVoiceActivityDetector.Destroy;
begin
  { Handle is nil when the constructor did not get as far as creating the
    C object (for example because it raised); nothing to release then. }
  if Self.Handle <> nil then
  begin
    SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
    Self.Handle := nil;
  end;

  { Always chain to the ancestor destructor. }
  inherited Destroy;
end;
|
||||||
|
|
||||||
|
{ Feeds every element of Samples to the detector. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
var
  Count: Integer;
begin
  Count := Length(Samples);
  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Handle, pcfloat(Samples), Count);
end;
|
||||||
|
|
||||||
|
{ Feeds the N elements of Samples that start at index Offset to the detector.

  When the requested range does not lie inside Samples, a message is printed
  and nothing is passed to the detector.

  NOTE(review): Samples is a by-value open array; if the compiler copies it
  on every call, invoking this once per window on a long recording copies the
  whole recording each time. Consider a const parameter - confirm. }
procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
begin
  { Reject negative values as well as ranges that run past the end. The upper
    bound is written as a subtraction so that Offset + N cannot overflow. }
  if (Offset < 0) or (N < 0) or (Offset > Length(Samples)) or
     (N > Length(Samples) - Offset) then
  begin
    WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
      [Length(Samples), Offset, N]
    ));
    Exit;
  end;

  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
    pcfloat(Samples) + Offset, N);
end;
|
||||||
|
|
||||||
|
{ True when SherpaOnnxVoiceActivityDetectorEmpty returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
begin
  Exit(SherpaOnnxVoiceActivityDetectorEmpty(Handle) = 1);
end;
|
||||||
|
|
||||||
|
{ True when SherpaOnnxVoiceActivityDetectorDetected returns 1. }
function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
begin
  Exit(SherpaOnnxVoiceActivityDetectorDetected(Handle) = 1);
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxVoiceActivityDetectorPop for this detector. }
procedure TSherpaOnnxVoiceActivityDetector.Pop;
begin
  SherpaOnnxVoiceActivityDetectorPop(Handle);
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxVoiceActivityDetectorClear for this detector. }
procedure TSherpaOnnxVoiceActivityDetector.Clear;
begin
  SherpaOnnxVoiceActivityDetectorClear(Handle);
end;
|
||||||
|
|
||||||
|
{ Returns a Pascal-owned copy of the segment reported by the C API.

  The C segment is always released, even when allocating the copy raises an
  exception. A segment with Start = 0 and no samples is returned when the
  C API returns nil. This method does not call Pop. }
function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
var
  P: PSherpaOnnxSpeechSegment;
begin
  Result.Start := 0;
  Result.Samples := nil;

  P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
  if P = nil then
    Exit;

  try
    Result.Start := P^.Start;

    if (P^.N > 0) and (P^.Samples <> nil) then
    begin
      SetLength(Result.Samples, P^.N);
      { cfloat and Single have the same size, so one block copy replaces the
        element-by-element loop. }
      Move(P^.Samples^, Result.Samples[0], P^.N * SizeOf(Single));
    end;
  finally
    SherpaOnnxDestroySpeechSegment(P);
  end;
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxVoiceActivityDetectorReset for this detector. }
procedure TSherpaOnnxVoiceActivityDetector.Reset;
begin
  SherpaOnnxVoiceActivityDetectorReset(Handle);
end;
|
||||||
|
|
||||||
|
{ Forwards to SherpaOnnxVoiceActivityDetectorFlush for this detector. }
procedure TSherpaOnnxVoiceActivityDetector.Flush;
begin
  SherpaOnnxVoiceActivityDetectorFlush(Handle);
end;
|
||||||
|
|
||||||
end.
|
end.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user