Pascal API for VAD (#1249)
This commit is contained in:
@@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx).
|
||||
|[read-wav](./read-wav)|It shows how to read a wave file.|
|
||||
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|
||||
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|
||||
|[vad](./vad)| It shows how to use the voice activity detection API.|
|
||||
|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.|
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx';
|
||||
Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt';
|
||||
Config.ModelConfig.Provider := 'cpu';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx';
|
||||
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx';
|
||||
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
||||
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
||||
Config.ModelConfig.Provider := 'cpu';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx';
|
||||
Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt';
|
||||
Config.ModelConfig.Provider := 'cpu';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
|
||||
Config.ModelConfig.SenseVoice.Language := 'auto';
|
||||
Config.ModelConfig.SenseVoice.UseItn := False;
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx';
|
||||
Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt';
|
||||
Config.ModelConfig.Provider := 'cpu';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
|
||||
Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';
|
||||
Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
|
||||
|
||||
@@ -33,6 +33,8 @@ var
|
||||
Duration: Single;
|
||||
RealTimeFactor: Single;
|
||||
begin
|
||||
Initialize(Config);
|
||||
|
||||
Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx';
|
||||
Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx';
|
||||
Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx';
|
||||
|
||||
3
pascal-api-examples/vad-with-non-streaming-asr/.gitignore
vendored
Normal file
3
pascal-api-examples/vad-with-non-streaming-asr/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
!run-*.sh
|
||||
vad_with_whisper
|
||||
vad_with_sense_voice
|
||||
12
pascal-api-examples/vad-with-non-streaming-asr/README.md
Normal file
12
pascal-api-examples/vad-with-non-streaming-asr/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Introduction
|
||||
|
||||
|
||||
This directory contains examples for how to use the VAD (voice activity detection)
|
||||
with non-streaming speech recognition models.
|
||||
|
||||
|Script| Description|
|
||||
|---------|------------|
|
||||
|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.|
|
||||
|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.|
|
||||
|
||||
Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
|
||||
48
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
Executable file
48
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads silero_vad.onnx, lei-jun-test.wav and the SenseVoice model if
# they are missing, then compiles and runs vad_with_sense_voice.pas.
#
# The script may be invoked from any directory: every path is resolved
# relative to the location of this script.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model files, the .pas source and the produced binary all live next to
# this script, so make the relative paths below independent of the caller's cwd.
cd -- "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [[ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
  rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_sense_voice.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add an empty entry to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_sense_voice
|
||||
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
Executable file
49
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh
Executable file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads silero_vad.onnx, Obama.wav and the Whisper tiny.en model if they
# are missing, then compiles and runs vad_with_whisper.pas.
#
# The script may be invoked from any directory: every path is resolved
# relative to the location of this script.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model files, the .pas source and the produced binary all live next to
# this script, so make the relative paths below independent of the caller's cwd.
cd -- "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [[ ! -f ./Obama.wav ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav
fi

if [[ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2

  tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
  rm sherpa-onnx-whisper-tiny.en.tar.bz2
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_whisper.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add an empty entry to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_whisper
|
||||
@@ -0,0 +1,137 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming SenseVoice model
|
||||
with silero VAD to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program vad_with_sense_voice;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
SysUtils;
|
||||
|
||||
{ Creates the silero VAD detector used by this example.

  The model is read from ./silero_vad.onnx; the detector is configured for
  the 'cpu' provider with one thread and with debug output enabled. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  {Please don't change these two values unless you know the details}
  ExpectedSampleRate = 16000;
  SamplesPerWindow = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  { Settings shared by the whole detector }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  { Settings specific to the silero VAD model }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;

  { NOTE(review): the second argument is presumably the detector's buffer
    size in seconds - confirm against sherpa_onnx.pas }
  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
|
||||
|
||||
{ Creates the non-streaming recognizer backed by the SenseVoice int8 model.

  Model and token files are expected under
  ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/ }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Runtime settings }
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;

  { Model files and SenseVoice options }
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt';
  RecognizerConfig.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx';
  RecognizerConfig.ModelConfig.SenseVoice.Language := 'auto';
  RecognizerConfig.ModelConfig.SenseVoice.UseItn := False;

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
|
||||
|
||||
var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;

{ Decodes every speech segment currently queued in Vad.

  Each segment is removed from the queue, recognized with Recognizer and
  printed as 'start -- end text', where start and end are in seconds
  (sample index divided by the sample rate of Wave). }
procedure DecodeQueuedSegments;
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);
      RecognitionResult := Recognizer.GetResult(Stream);

      Start := SpeechSegment.Start / Wave.SampleRate;
      Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    finally
      { Release the stream even if decoding raised }
      FreeAndNil(Stream);
    end;
  end;
end;

begin
  Vad := nil;
  Recognizer := nil;
  try
    Vad := CreateVad();
    Recognizer := CreateOfflineRecognizer();

    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
    if Wave.SampleRate <> Vad.Config.SampleRate then
      { Nothing to decode; the finally block below still frees Vad/Recognizer }
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]))
    else
    begin
      WindowSize := Vad.Config.SileroVad.WindowSize;
      Offset := 0;

      { Feed the wave one full window at a time; a trailing chunk shorter
        than WindowSize is not fed to the detector. }
      while Offset + WindowSize <= Length(Wave.Samples) do
      begin
        Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
        Inc(Offset, WindowSize);

        DecodeQueuedSegments;
      end;

      { Flush the detector and decode whatever segments that produces }
      Vad.Flush;
      DecodeQueuedSegments;
    end;
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
end.
|
||||
@@ -0,0 +1,136 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming Whisper model
|
||||
with silero VAD to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program vad_with_whisper;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
SysUtils;
|
||||
|
||||
{ Creates the silero VAD detector used by this example.

  The model is read from ./silero_vad.onnx; the detector is configured for
  the 'cpu' provider with one thread and with debug output enabled. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  {Please don't change these two values unless you know the details}
  ExpectedSampleRate = 16000;
  SamplesPerWindow = 512;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  { Settings shared by the whole detector }
  VadConfig.SampleRate := ExpectedSampleRate;
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  { Settings specific to the silero VAD model }
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := SamplesPerWindow;
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;

  { NOTE(review): the second argument is presumably the detector's buffer
    size in seconds - confirm against sherpa_onnx.pas }
  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
|
||||
|
||||
{ Creates the non-streaming recognizer backed by the Whisper tiny.en
  int8 encoder/decoder.

  Model and token files are expected under ./sherpa-onnx-whisper-tiny.en/ }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  { Runtime settings }
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Debug := False;

  { Model files }
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt';
  RecognizerConfig.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx';
  RecognizerConfig.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx';

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
|
||||
|
||||
var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;

{ Decodes every speech segment currently queued in Vad.

  Each segment is removed from the queue, recognized with Recognizer and
  printed as 'start -- end text', where start and end are in seconds
  (sample index divided by the sample rate of Wave). }
procedure DecodeQueuedSegments;
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    try
      Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
      Recognizer.Decode(Stream);
      RecognitionResult := Recognizer.GetResult(Stream);

      Start := SpeechSegment.Start / Wave.SampleRate;
      Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
      WriteLn(Format('%.3f -- %.3f %s',
        [Start, Start + Duration, RecognitionResult.Text]));
    finally
      { Release the stream even if decoding raised }
      FreeAndNil(Stream);
    end;
  end;
end;

begin
  Vad := nil;
  Recognizer := nil;
  try
    Vad := CreateVad();
    Recognizer := CreateOfflineRecognizer();

    Wave := SherpaOnnxReadWave('./Obama.wav');
    if Wave.SampleRate <> Vad.Config.SampleRate then
      { Nothing to decode; the finally block below still frees Vad/Recognizer }
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]))
    else
    begin
      WindowSize := Vad.Config.SileroVad.WindowSize;
      Offset := 0;

      { Feed the wave one full window at a time; a trailing chunk shorter
        than WindowSize is not fed to the detector. }
      while Offset + WindowSize <= Length(Wave.Samples) do
      begin
        Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
        Inc(Offset, WindowSize);

        DecodeQueuedSegments;
      end;

      { Flush the detector and decode whatever segments that produces }
      Vad.Flush;
      DecodeQueuedSegments;
    end;
  finally
    FreeAndNil(Recognizer);
    FreeAndNil(Vad);
  end;
end.
|
||||
3
pascal-api-examples/vad/.gitignore
vendored
Normal file
3
pascal-api-examples/vad/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
!run*.sh
|
||||
circular_buffer
|
||||
remove_silence
|
||||
11
pascal-api-examples/vad/README.md
Normal file
11
pascal-api-examples/vad/README.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# Introduction
|
||||
|
||||
|
||||
This directory contains examples for how to use the VAD (voice activity detection)
|
||||
APIs.
|
||||
|
||||
|Script| Description|
|
||||
|---------|------------|
|
||||
|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.|
|
||||
|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.|
|
||||
|
||||
106
pascal-api-examples/vad/circular_buffer.pas
Normal file
106
pascal-api-examples/vad/circular_buffer.pas
Normal file
@@ -0,0 +1,106 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
program circular_buffer;
|
||||
{
|
||||
This file shows how to use the CircularBuffer API of sherpa-onnx
|
||||
}
|
||||
|
||||
{$mode objfpc}
|
||||
{$ASSERTIONS ON}
|
||||
|
||||
uses
|
||||
sherpa_onnx;
|
||||
|
||||
var
  Buffer: TSherpaOnnxCircularBuffer;
  Samples: TSherpaOnnxSamplesArray;
begin
  {The initial capacity is 5. It will be resized automatically if needed.}
  Buffer := TSherpaOnnxCircularBuffer.Create(5);
  try
    Assert(Buffer.Size = 0);
    Assert(Buffer.Head = 0);
    Buffer.Push([0, 10, 20]);

    {Push() changes Size. Head is not changed.}
    Assert(Buffer.Size = 3);
    Assert(Buffer.Head = 0);

    Samples := Buffer.Get(0, 1);
    Assert(Length(Samples) = 1);
    Assert(Samples[0] = 0);

    { Get() does not change Size or Head}
    Assert(Buffer.Size = 3);
    Assert(Buffer.Head = 0);

    Samples := Buffer.Get(0, 2);
    Assert(Length(Samples) = 2);
    Assert(Samples[0] = 0);
    Assert(Samples[1] = 10);

    { The buffer will be resized since its initial capacity is 5 but we have
      pushed 7 elements into it.

      No data is lost during the resize.
    }
    Buffer.Push([30, 40, 50, 60]);

    Assert(Buffer.Size = 7); {There are now 7 elements}
    Assert(Buffer.Head = 0);

    {Remove the first 4 elements. Pop() advances Head and shrinks Size.}
    Buffer.Pop(4);

    Assert(Buffer.Size = 3); {There are only 3 elements left}
    Assert(Buffer.Head = 4);

    {Get() is addressed by absolute position, so reading starts at Head}
    Samples := Buffer.Get(Buffer.Head, 2);
    Assert(Length(Samples) = 2);
    Assert(Samples[0] = 40);
    Assert(Samples[1] = 50);

    Buffer.Pop(1);

    Assert(Buffer.Size = 2); {There are only 2 elements left}
    Assert(Buffer.Head = 5);

    Samples := Buffer.Get(Buffer.Head, 2);
    Assert(Length(Samples) = 2);
    Assert(Samples[0] = 50);
    Assert(Samples[1] = 60);

    Buffer.Pop(2);
    Assert(Buffer.Size = 0); {There are no elements left}
    Assert(Buffer.Head = 7);

    Buffer.Push([100, 200, 300, 400, 500]);
    Assert(Buffer.Size = 5);
    Assert(Buffer.Head = 7);

    Buffer.Pop(4);
    Assert(Buffer.Size = 1);

    {Head can be larger than the Capacity!
     Head keeps growing as elements are popped (until Reset is called).
     NOTE(review): the physical slot is presumably Buffer.Head mod Capacity
     (the original comment said "Buffer.Head / Capacity") - confirm against
     the underlying implementation.
    }
    Assert(Buffer.Head = 11);
    Buffer.Push([600, 700]);

    Assert(Buffer.Size = 3);
    Assert(Buffer.Head = 11);

    Samples := Buffer.Get(Buffer.Head, 3);
    Assert(Length(Samples) = 3);
    Assert(Samples[0] = 500);
    Assert(Samples[1] = 600);
    Assert(Samples[2] = 700);

    Buffer.Pop(3);
    Assert(Buffer.Size = 0);
    Assert(Buffer.Head = 14);

    {Reset() empties the buffer and moves Head back to 0}
    Buffer.Reset();

    Assert(Buffer.Size = 0);
    Assert(Buffer.Head = 0);
  finally
    {Release the buffer; SysUtils is not in the uses clause, so call Free
     directly instead of FreeAndNil.}
    Buffer.Free;
  end;
end.
|
||||
|
||||
115
pascal-api-examples/vad/remove_silence.pas
Normal file
115
pascal-api-examples/vad/remove_silence.pas
Normal file
@@ -0,0 +1,115 @@
|
||||
{ Copyright (c) 2024 Xiaomi Corporation }
|
||||
{
|
||||
This file shows how to use the VAD API from sherpa-onnx
|
||||
to remove silences from a wave file.
|
||||
}
|
||||
program main;
|
||||
|
||||
{$mode delphi}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
SysUtils;
|
||||
|
||||
var
  Wave: TSherpaOnnxWave;

  Config: TSherpaOnnxVadModelConfig;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
  SpeechSegment: TSherpaOnnxSpeechSegment;

  SampleRate: Integer;

  AllSpeechSegment: array of TSherpaOnnxSpeechSegment;
  AllSamples: array of Single;
  N: Integer;
  I: Integer;

{ Moves every speech segment currently queued in Vad to the end of
  AllSpeechSegment and prints its 'start -- end' time range in seconds
  (sample index divided by SampleRate). }
procedure CollectQueuedSegments;
var
  Segment: TSherpaOnnxSpeechSegment;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    Segment := Vad.Front();
    Vad.Pop();

    SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1);
    AllSpeechSegment[High(AllSpeechSegment)] := Segment;

    Start := Segment.Start / SampleRate;
    Duration := Length(Segment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration]));
  end;
end;

begin
  SampleRate := 16000; {Please don't change it unless you know the details}

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> SampleRate then
  begin
    WriteLn(Format('Expected sample rate: %d. Given: %d',
      [SampleRate, Wave.SampleRate]));

    { The detector has not been created yet, so there is nothing to free }
    Exit;
  end;

  WindowSize := 512; {Please don't change it unless you know the details}
  Initialize(Config);

  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.MinSpeechDuration := 0.25;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.WindowSize := WindowSize;
  Config.NumThreads:= 1;
  Config.Debug:= True;
  Config.Provider:= 'cpu';
  Config.SampleRate := SampleRate;

  Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20);
  try
    AllSpeechSegment := nil;
    AllSamples := nil;
    Offset := 0;

    { Feed the wave one full window at a time; a trailing chunk shorter than
      WindowSize is not fed to the detector. }
    while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Inc(Offset, WindowSize);

      CollectQueuedSegments;
    end;

    { Flush the detector and collect whatever segments that produces }
    Vad.Flush;
    CollectQueuedSegments;

    { First pass: total number of speech samples, to size the output once }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
      Inc(N, Length(SpeechSegment.Samples));

    SetLength(AllSamples, N);

    { Second pass: concatenate the segments in detection order }
    N := 0;
    for SpeechSegment in AllSpeechSegment do
    begin
      for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do
      begin
        AllSamples[N] := SpeechSegment.Samples[I];
        Inc(N);
      end;
    end;

    SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate);
    WriteLn('Saved to ./lei-jun-test-no-silence.wav');
  finally
    { Release the detector even if something above raised }
    FreeAndNil(Vad);
  end;
end.
|
||||
34
pascal-api-examples/vad/run-circular-buffer.sh
Executable file
34
pascal-api-examples/vad/run-circular-buffer.sh
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# then compiles and runs circular_buffer.pas.
#
# The script may be invoked from any directory: every path is resolved
# relative to the location of this script.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The .pas source and the produced binary live next to this script, so make
# the relative paths below independent of the caller's cwd.
cd -- "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./circular_buffer.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add an empty entry to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./circular_buffer
|
||||
42
pascal-api-examples/vad/run-remove-silence.sh
Executable file
42
pascal-api-examples/vad/run-remove-silence.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads silero_vad.onnx and lei-jun-test.wav if they are missing, then
# compiles and runs remove_silence.pas.
#
# The script may be invoked from any directory: every path is resolved
# relative to the location of this script.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# The model file, the test wave, the .pas source and the produced binary all
# live next to this script, so make the relative paths below independent of
# the caller's cwd.
cd -- "$SCRIPT_DIR"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

fpc \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./remove_silence.pas

# Append the previous value only when it is non-empty; a trailing ':' would
# add an empty entry to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./remove_silence
|
||||
Reference in New Issue
Block a user