diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index d44f0f4e..807fa8ca 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -149,6 +149,11 @@ jobs: cd ./pascal-api-examples pushd non-streaming-asr + + ./run-dolphin-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + ./run-zipformer-transducer.sh rm -rf sherpa-onnx-* echo "---" @@ -253,7 +258,13 @@ jobs: cd ./pascal-api-examples + pushd vad-with-non-streaming-asr + + time ./run-vad-with-dolphin-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + time ./run-vad-with-moonshine.sh rm -rf sherpa-onnx-* echo "---" diff --git a/README.md b/README.md index c7e67216..b36d54c2 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ This repository supports running the following functions **locally** on the following platforms and operating systems: - - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64) + - x86, ``x86_64``, 32-bit ARM, 64-bit ARM (arm64, aarch64), RISC-V (riscv64), **RK NPU** - Linux, macOS, Windows, openKylin - Android, WearOS - iOS diff --git a/pascal-api-examples/non-streaming-asr/README.md b/pascal-api-examples/non-streaming-asr/README.md index f8d35c3a..6a066e24 100644 --- a/pascal-api-examples/non-streaming-asr/README.md +++ b/pascal-api-examples/non-streaming-asr/README.md @@ -5,6 +5,7 @@ APIs with non-streaming models for speech recognition. 
|File|Description| |----|-----------| +|[run-dolphin-ctc.sh](./run-dolphin-ctc.sh)|Use a non-streaming [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model for speech recognition| |[run-nemo-ctc.sh](./run-nemo-ctc.sh)|Use a non-streaming NeMo CTC model for speech recognition| |[run-nemo-transducer.sh](./run-nemo-transducer.sh)|Use a non-streaming NeMo transducer model for speech recognition| |[run-paraformer-itn.sh](./run-paraformer-itn.sh)|Use a non-streaming Paraformer model for speech recognition with inverse text normalization for numbers|
diff --git a/pascal-api-examples/non-streaming-asr/dolphin_ctc.pas b/pascal-api-examples/non-streaming-asr/dolphin_ctc.pas
new file mode 100644
index 00000000..99511602
--- /dev/null
+++ b/pascal-api-examples/non-streaming-asr/dolphin_ctc.pas
@@ -0,0 +1,84 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Dolphin CTC model
+to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program dolphin_ctc;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  DateUtils,
+  SysUtils;
+
+var
+  Wave: TSherpaOnnxWave;
+  WaveFilename: AnsiString;
+
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: TDateTime;
+  Stop: TDateTime;
+
+  Elapsed: Single;
+  Duration: Single;
+  RealTimeFactor: Single;
+begin
+  Initialize(Config);
+
+  Config.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  WaveFilename := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';
+
+  Wave := SherpaOnnxReadWave(WaveFilename);
+
+  {Stop with a non-zero exit code when there is no usable audio; otherwise
+   the Duration and RealTimeFactor computations below divide by zero.}
+  if (Length(Wave.Samples) = 0) or (Wave.SampleRate <= 0) then
+  begin
+    WriteLn(Format('Failed to read a non-empty wave from %s', [WaveFilename]));
+    Halt(1);
+  end;
+
+  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
+  Stream := Recognizer.CreateStream();
+  Start := Now;
+
+  Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
+  Recognizer.Decode(Stream);
+
+  RecognitionResult := Recognizer.GetResult(Stream);
+
+  Stop := Now;
+
+  Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
+  Duration := Length(Wave.Samples) / Wave.SampleRate;
+  RealTimeFactor := Elapsed / Duration;
+
+  WriteLn(RecognitionResult.ToString);
+  WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
+  WriteLn(Format('Elapsed %.3f s', [Elapsed]));
+  WriteLn(Format('Wave duration %.3f s', [Duration]));
+  WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
+
+  {Free resources to avoid memory leak.
+
+  Note: You don't need to invoke them for this simple script.
+  However, you have to invoke them in your own large/complex project.
+  }
+  FreeAndNil(Stream);
+  FreeAndNil(Recognizer);
+end.
diff --git a/pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh b/pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh
new file mode 100755
index 00000000..5b6040bb
--- /dev/null
+++ b/pascal-api-examples/non-streaming-asr/run-dolphin-ctc.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Build the sherpa-onnx C API if no installed library is found, download the
+# Dolphin CTC model if it is missing, then compile and run dolphin_ctc.pas.
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
+
+# The model directory and dolphin_ctc.pas are referenced relative to this
+# script's directory, so switch there to make the script runnable from anywhere.
+cd "$SCRIPT_DIR"
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f "$SHERPA_ONNX_DIR/build/install/lib/libsherpa-onnx-c-api.dylib" && ! -f "$SHERPA_ONNX_DIR/build/install/lib/libsherpa-onnx-c-api.so" && ! -f "$SHERPA_ONNX_DIR/build/install/lib/sherpa-onnx-c-api.dll" ]]; then
+  mkdir -p "$SHERPA_ONNX_DIR/build"
+  pushd "$SHERPA_ONNX_DIR/build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  ls -lh lib
+  popd
+fi
+
+if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
+  ./dolphin_ctc.pas
+
+export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
+export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"
+
+./dolphin_ctc
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/README.md b/pascal-api-examples/vad-with-non-streaming-asr/README.md
index 220a55d6..4e53dee0 100644
--- a/pascal-api-examples/vad-with-non-streaming-asr/README.md
+++ b/pascal-api-examples/vad-with-non-streaming-asr/README.md
@@ -6,7 +6,10 @@ with non-streaming speech recognition models.
|Directory| Description| |---------|------------| -|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.| -|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.| +|[run-vad-with-dolphin-ctc.sh](./run-vad-with-dolphin-ctc.sh)|It shows how to use the VAD + [Dolphin](https://github.com/DataoceanAI/Dolphin) for speech recognition.| +|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + [Whisper](https://github.com/openai/whisper) for speech recognition.| +|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) for speech recognition.| +|[run-vad-with-moonshine.sh](./run-vad-with-moonshine.sh)|It shows how to use the VAD + [Moonshine](https://github.com/usefulsensors/moonshine) for speech recognition.| + Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models.
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-dolphin-ctc.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-dolphin-ctc.sh
new file mode 100755
index 00000000..63d76061
--- /dev/null
+++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-dolphin-ctc.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Build the sherpa-onnx C API if no installed library is found, download the
+# VAD model, test wave and Dolphin CTC model, then run vad_with_dolphin.pas.
+
+set -ex
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)
+
+# The downloaded files and vad_with_dolphin.pas are referenced relative to this
+# script's directory, so switch there to make the script runnable from anywhere.
+cd "$SCRIPT_DIR"
+
+echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
+
+if [[ ! -f "$SHERPA_ONNX_DIR/build/install/lib/libsherpa-onnx-c-api.dylib" && ! -f "$SHERPA_ONNX_DIR/build/install/lib/libsherpa-onnx-c-api.so" && ! -f "$SHERPA_ONNX_DIR/build/install/lib/sherpa-onnx-c-api.dll" ]]; then
+  mkdir -p "$SHERPA_ONNX_DIR/build"
+  pushd "$SHERPA_ONNX_DIR/build"
+  cmake \
+    -DCMAKE_INSTALL_PREFIX=./install \
+    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
+    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
+    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
+    -DBUILD_SHARED_LIBS=ON \
+    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
+    ..
+
+  cmake --build . --target install --config Release
+  popd
+fi
+
+if [[ ! -f ./silero_vad.onnx ]]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+fi
+
+if [ ! -f ./lei-jun-test.wav ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
+fi
+
+if [ ! -f ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+  tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+  rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
+fi
+
+fpc \
+  -dSHERPA_ONNX_USE_SHARED_LIBS \
+  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
+  -Fl"$SHERPA_ONNX_DIR/build/install/lib" \
+  ./vad_with_dolphin.pas
+
+export LD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH"
+export DYLD_LIBRARY_PATH="$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH"
+
+./vad_with_dolphin
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas
new file mode 100644
index 00000000..7ea2e4e6
--- /dev/null
+++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_dolphin.pas
@@ -0,0 +1,135 @@
+{ Copyright (c) 2025 Xiaomi Corporation }
+
+{
+This file shows how to use a non-streaming Dolphin model
+with silero VAD to decode files.
+
+You can download the model files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+}
+
+program vad_with_dolphin;
+
+{$mode objfpc}
+
+uses
+  sherpa_onnx,
+  SysUtils;
+
+{Create a voice activity detector configured to load ./silero_vad.onnx.}
+function CreateVad(): TSherpaOnnxVoiceActivityDetector;
+var
+  Config: TSherpaOnnxVadModelConfig;
+
+  SampleRate: Integer;
+  WindowSize: Integer;
+begin
+  Initialize(Config);
+
+  SampleRate := 16000; {Please don't change it unless you know the details}
+  WindowSize := 512; {Please don't change it unless you know the details}
+
+  Config.SileroVad.Model := './silero_vad.onnx';
+  Config.SileroVad.MinSpeechDuration := 0.5;
+  Config.SileroVad.MinSilenceDuration := 0.5;
+  Config.SileroVad.Threshold := 0.5;
+  Config.SileroVad.WindowSize := WindowSize;
+  Config.NumThreads:= 1;
+  Config.Debug:= True;
+  Config.Provider:= 'cpu';
+  Config.SampleRate := SampleRate;
+
+  Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
+end;
+
+{Create a non-streaming recognizer configured with the Dolphin CTC model.}
+function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
+var
+  Config: TSherpaOnnxOfflineRecognizerConfig;
+begin
+  Initialize(Config);
+
+  Config.ModelConfig.Dolphin.Model := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/model.int8.onnx';
+  Config.ModelConfig.Tokens := './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/tokens.txt';
+  Config.ModelConfig.Provider := 'cpu';
+  Config.ModelConfig.NumThreads := 1;
+  Config.ModelConfig.Debug := False;
+
+  Result := TSherpaOnnxOfflineRecognizer.Create(Config);
+end;
+
+{Decode every speech segment currently queued in Vad and print one
+ "start -- end text" line per segment, with times in seconds.
+ SampleRate is the sample rate of the audio that was fed to Vad.}
+procedure DecodeQueuedSegments(Vad: TSherpaOnnxVoiceActivityDetector;
+  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
+var
+  SpeechSegment: TSherpaOnnxSpeechSegment;
+  Stream: TSherpaOnnxOfflineStream;
+  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
+
+  Start: Single;
+  Duration: Single;
+begin
+  while not Vad.IsEmpty do
+  begin
+    SpeechSegment := Vad.Front();
+    Vad.Pop();
+    Stream := Recognizer.CreateStream();
+    try
+      Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
+      Recognizer.Decode(Stream);
+      RecognitionResult := Recognizer.GetResult(Stream);
+
+      Start := SpeechSegment.Start / SampleRate;
+      Duration := Length(SpeechSegment.Samples) / SampleRate;
+      WriteLn(Format('%.3f -- %.3f %s',
+        [Start, Start + Duration, RecognitionResult.Text]));
+    finally
+      FreeAndNil(Stream);
+    end;
+  end;
+end;
+
+var
+  Wave: TSherpaOnnxWave;
+
+  Recognizer: TSherpaOnnxOfflineRecognizer;
+  Vad: TSherpaOnnxVoiceActivityDetector;
+
+  Offset: Integer;
+  WindowSize: Integer;
+begin
+  Recognizer := nil;
+  Vad := CreateVad();
+  try
+    Recognizer := CreateOfflineRecognizer();
+
+    Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
+    if Wave.SampleRate <> Vad.Config.SampleRate then
+    begin
+      WriteLn(Format('Expected sample rate: %d. Given: %d',
+        [Vad.Config.SampleRate, Wave.SampleRate]));
+
+      {The finally block below still releases Vad and Recognizer}
+      Exit;
+    end;
+
+    WindowSize := Vad.Config.SileroVad.WindowSize;
+    Offset := 0;
+    while Offset + WindowSize <= Length(Wave.Samples) do
+    begin
+      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
+      Inc(Offset, WindowSize);
+
+      DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
+    end;
+
+    {Decode whatever segments are queued once the VAD has been flushed}
+    Vad.Flush;
+    DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
+  finally
+    FreeAndNil(Recognizer);
+    FreeAndNil(Vad);
+  end;
+end.
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas index fff484db..ec2e53c4 100644 --- a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas +++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas @@ -8,7 +8,7 @@ You can download the model files from https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models } -program vad_with_whisper; +program vad_with_sense_voice; {$mode objfpc} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 580c9e80..eb370b78 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1969,7 +1969,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( return p->impl->GetOutputSamplingRate(); } -void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { +void SherpaOnnxLinearResamplerReset(const SherpaOnnxLinearResampler *p) { p->impl->Reset(); } diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index e2ad34e8..db6f7992 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -270,6 +270,11 @@ type function ToString: AnsiString; end; + TSherpaOnnxOfflineDolphinModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + TSherpaOnnxOfflineWhisperModelConfig = record Encoder: AnsiString; Decoder: AnsiString; @@ -331,6 +336,7 @@ type SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig; Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; + Dolphin: TSherpaOnnxOfflineDolphinModelConfig; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); function ToString: AnsiString; end; @@ -694,6 +700,9 @@ type SherpaOnnxOfflineNemoEncDecCtcModelConfig = record Model: PAnsiChar; end; + SherpaOnnxOfflineDolphinModelConfig = record + Model: 
PAnsiChar; + end; SherpaOnnxOfflineWhisperModelConfig = record Encoder: PAnsiChar; Decoder: PAnsiChar; @@ -740,6 +749,7 @@ type SenseVoice: SherpaOnnxOfflineSenseVoiceModelConfig; Moonshine: SherpaOnnxOfflineMoonshineModelConfig; FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; + Dolphin: SherpaOnnxOfflineDolphinModelConfig; end; SherpaOnnxOfflineRecognizerConfig = record @@ -1461,6 +1471,12 @@ begin [Self.Model]); end; +function TSherpaOnnxOfflineDolphinModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineDolphinModelConfig(Model := %s)', + [Self.Model]); +end; + function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; begin Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + @@ -1534,14 +1550,15 @@ begin 'TeleSpeechCtc := %s, ' + 'SenseVoice := %s, ' + 'Moonshine := %s, ' + - 'FireRedAsr := %s' + + 'FireRedAsr := %s, ' + + 'Dolphin := %s' + ')', [Self.Transducer.ToString, Self.Paraformer.ToString, Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, Self.ModelType, Self.ModelingUnit, Self.BpeVocab, Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, - Self.FireRedAsr.ToString + Self.FireRedAsr.ToString, Self.Dolphin.ToString ]); end; @@ -1610,6 +1627,8 @@ begin C.ModelConfig.FireRedAsr.Encoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Encoder); C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder); + C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); + C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); C.LMConfig.Scale := Config.LMConfig.Scale;