Support non-streaming zipformer CTC ASR models (#2340)

This PR adds support for non-streaming Zipformer CTC ASR models across 
multiple language bindings, WebAssembly, examples, and CI workflows.

- Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

Model doc is available at
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
This commit is contained in:
Fangjun Kuang
2025-07-04 15:57:07 +08:00
committed by GitHub
parent ef16455cb5
commit 3bf986d08d
71 changed files with 2121 additions and 68 deletions

View File

@@ -9,3 +9,4 @@ sense_voice
telespeech_ctc
moonshine
dolphin_ctc
zipformer_ctc

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
set -ex
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
mkdir -p ../../build
pushd ../../build
cmake \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
..
cmake --build . --target install --config Release
ls -lh lib
popd
fi
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
fpc \
-dSHERPA_ONNX_USE_SHARED_LIBS \
-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
-Fl$SHERPA_ONNX_DIR/build/install/lib \
./zipformer_ctc.pas
export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
./zipformer_ctc

View File

@@ -0,0 +1,76 @@
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use a non-streaming Zipformer CTC model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program zipformer_ctc;
{$mode objfpc}
uses
sherpa_onnx,
DateUtils,
SysUtils;
var
Wave: TSherpaOnnxWave;
WaveFilename: AnsiString;
Config: TSherpaOnnxOfflineRecognizerConfig;
Recognizer: TSherpaOnnxOfflineRecognizer;
Stream: TSherpaOnnxOfflineStream;
RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
Start: TDateTime;
Stop: TDateTime;
Elapsed: Single;
Duration: Single;
RealTimeFactor: Single;
begin
Initialize(Config);
Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Config.ModelConfig.NumThreads := 1;
Config.ModelConfig.Debug := False;
WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
Wave := SherpaOnnxReadWave(WaveFilename);
Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
Stream := Recognizer.CreateStream();
Start := Now;
Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);
Stop := Now;
Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
Duration := Length(Wave.Samples) / Wave.SampleRate;
RealTimeFactor := Elapsed / Duration;
WriteLn(RecognitionResult.ToString);
WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
WriteLn(Format('Elapsed %.3f s', [Elapsed]));
WriteLn(Format('Wave duration %.3f s', [Duration]));
WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
{Free resources to avoid memory leak.
Note: You don't need to invoke them for this simple script.
However, you have to invoke them in your own large/complex project.
}
FreeAndNil(Stream);
FreeAndNil(Recognizer);
end.

View File

@@ -2,3 +2,5 @@
vad_with_whisper
vad_with_sense_voice
vad_with_moonshine
vad_with_zipformer_ctc
vad_with_dolphin

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -ex
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd)
echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"
if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then
mkdir -p ../../build
pushd ../../build
cmake \
-DCMAKE_INSTALL_PREFIX=./install \
-DSHERPA_ONNX_ENABLE_PYTHON=OFF \
-DSHERPA_ONNX_ENABLE_TESTS=OFF \
-DSHERPA_ONNX_ENABLE_CHECK=OFF \
-DBUILD_SHARED_LIBS=ON \
-DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
..
cmake --build . --target install --config Release
popd
fi
if [[ ! -f ./silero_vad.onnx ]]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi
if [ ! -f ./lei-jun-test.wav ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
fpc \
-dSHERPA_ONNX_USE_SHARED_LIBS \
-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \
-Fl$SHERPA_ONNX_DIR/build/install/lib \
./vad_with_zipformer_ctc.pas
export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH
export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH
./vad_with_zipformer_ctc

View File

@@ -0,0 +1,135 @@
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use a non-streaming Zipformer CTC model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program vad_with_zipformer_ctc;
{$mode objfpc}
uses
sherpa_onnx,
SysUtils;
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
var
Config: TSherpaOnnxVadModelConfig;
SampleRate: Integer;
WindowSize: Integer;
begin
Initialize(Config);
SampleRate := 16000; {Please don't change it unless you know the details}
WindowSize := 512; {Please don't change it unless you know the details}
Config.SileroVad.Model := './silero_vad.onnx';
Config.SileroVad.MinSpeechDuration := 0.5;
Config.SileroVad.MinSilenceDuration := 0.5;
Config.SileroVad.Threshold := 0.5;
Config.SileroVad.WindowSize := WindowSize;
Config.NumThreads:= 1;
Config.Debug:= True;
Config.Provider:= 'cpu';
Config.SampleRate := SampleRate;
Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
end;
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
Config: TSherpaOnnxOfflineRecognizerConfig;
begin
Initialize(Config);
Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Config.ModelConfig.NumThreads := 1;
Config.ModelConfig.Debug := False;
Result := TSherpaOnnxOfflineRecognizer.Create(Config);
end;
var
Wave: TSherpaOnnxWave;
Recognizer: TSherpaOnnxOfflineRecognizer;
Vad: TSherpaOnnxVoiceActivityDetector;
Offset: Integer;
WindowSize: Integer;
SpeechSegment: TSherpaOnnxSpeechSegment;
Start: Single;
Duration: Single;
Stream: TSherpaOnnxOfflineStream;
RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
begin
Vad := CreateVad();
Recognizer := CreateOfflineRecognizer();
Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
if Wave.SampleRate <> Vad.Config.SampleRate then
begin
WriteLn(Format('Expected sample rate: %d. Given: %d',
[Vad.Config.SampleRate, Wave.SampleRate]));
Exit;
end;
WindowSize := Vad.Config.SileroVad.WindowSize;
Offset := 0;
while Offset + WindowSize <= Length(Wave.Samples) do
begin
Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
Offset += WindowSize;
while not Vad.IsEmpty do
begin
SpeechSegment := Vad.Front();
Vad.Pop();
Stream := Recognizer.CreateStream();
Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);
Start := SpeechSegment.Start / Wave.SampleRate;
Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
WriteLn(Format('%.3f -- %.3f %s',
[Start, Start + Duration, RecognitionResult.Text]));
FreeAndNil(Stream);
end;
end;
Vad.Flush;
while not Vad.IsEmpty do
begin
SpeechSegment := Vad.Front();
Vad.Pop();
Stream := Recognizer.CreateStream();
Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);
Start := SpeechSegment.Start / Wave.SampleRate;
Duration := Length(SpeechSegment.Samples) / Wave.SampleRate;
WriteLn(Format('%.3f -- %.3f %s',
[Start, Start + Duration, RecognitionResult.Text]));
FreeAndNil(Stream);
end;
FreeAndNil(Recognizer);
FreeAndNil(Vad);
end.