Support non-streaming zipformer CTC ASR models (#2340)

This PR adds support for non-streaming Zipformer CTC ASR models across 
multiple language bindings, WebAssembly, examples, and CI workflows.

- Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs
- Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js
- Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models

Model doc is available at
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
This commit is contained in:
Fangjun Kuang
2025-07-04 15:57:07 +08:00
committed by GitHub
parent ef16455cb5
commit 3bf986d08d
71 changed files with 2121 additions and 68 deletions

View File

@@ -6,6 +6,10 @@ cd dart-api-examples
pushd non-streaming-asr pushd non-streaming-asr
echo '----------Zipformer CTC----------'
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo '----------SenseVoice----------' echo '----------SenseVoice----------'
./run-sense-voice-with-hr.sh ./run-sense-voice-with-hr.sh
./run-sense-voice.sh ./run-sense-voice.sh
@@ -114,6 +118,10 @@ popd
pushd vad-with-non-streaming-asr pushd vad-with-non-streaming-asr
echo '----------Zipformer CTC----------'
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo '----------Dolphin CTC----------' echo '----------Dolphin CTC----------'
./run-dolphin-ctc.sh ./run-dolphin-ctc.sh
rm -rf sherpa-onnx-* rm -rf sherpa-onnx-*

View File

@@ -6,43 +6,11 @@ cd ./version-test
./run.sh ./run.sh
ls -lh ls -lh
cd ../speech-enhancement-gtcrn
./run.sh
ls -lh
cd ../kokoro-tts
./run-kokoro.sh
ls -lh
cd ../offline-tts
./run-matcha-zh.sh
ls -lh *.wav
./run-matcha-en.sh
ls -lh *.wav
./run-aishell3.sh
ls -lh *.wav
./run-piper.sh
ls -lh *.wav
./run-hf-fanchen.sh
ls -lh *.wav
ls -lh
pushd ../..
mkdir tts
cp -v dotnet-examples/kokoro-tts/*.wav ./tts
cp -v dotnet-examples/offline-tts/*.wav ./tts
popd
cd ../offline-speaker-diarization
./run.sh
rm -rfv *.onnx
rm -fv *.wav
rm -rfv sherpa-onnx-pyannote-*
cd ../offline-decode-files cd ../offline-decode-files
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
./run-dolphin-ctc.sh ./run-dolphin-ctc.sh
rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
@@ -82,6 +50,41 @@ rm -rf sherpa-onnx-*
./run-tdnn-yesno.sh ./run-tdnn-yesno.sh
rm -rf sherpa-onnx-* rm -rf sherpa-onnx-*
cd ../speech-enhancement-gtcrn
./run.sh
ls -lh
cd ../kokoro-tts
./run-kokoro.sh
ls -lh
cd ../offline-tts
./run-matcha-zh.sh
ls -lh *.wav
./run-matcha-en.sh
ls -lh *.wav
./run-aishell3.sh
ls -lh *.wav
./run-piper.sh
ls -lh *.wav
./run-hf-fanchen.sh
ls -lh *.wav
ls -lh
pushd ../..
mkdir tts
cp -v dotnet-examples/kokoro-tts/*.wav ./tts
cp -v dotnet-examples/offline-tts/*.wav ./tts
popd
cd ../offline-speaker-diarization
./run.sh
rm -rfv *.onnx
rm -fv *.wav
rm -rfv sherpa-onnx-pyannote-*
cd ../keyword-spotting-from-files cd ../keyword-spotting-from-files
./run.sh ./run.sh
@@ -115,5 +118,3 @@ rm -rf sherpa-onnx-*
cd ../spoken-language-identification cd ../spoken-language-identification
./run.sh ./run.sh
rm -rf sherpa-onnx-* rm -rf sherpa-onnx-*

View File

@@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()")
platform=$(node -p "require('os').platform()") platform=$(node -p "require('os').platform()")
node_version=$(node -p "process.versions.node.split('.')[0]") node_version=$(node -p "process.versions.node.split('.')[0]")
echo "----------non-streaming ASR Zipformer CTC----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
node ./test_asr_non_streaming_zipformer_ctc.js
rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
echo "----------non-streaming ASR NeMo parakeet tdt----------" echo "----------non-streaming ASR NeMo parakeet tdt----------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2

View File

@@ -9,6 +9,15 @@ git status
ls -lh ls -lh
ls -lh node_modules ls -lh node_modules
# asr with offline zipformer ctc
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
node ./test-offline-zipformer-ctc.js
rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
# asr with offline dolphin ctc # asr with offline dolphin ctc
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2

View File

@@ -9,6 +9,9 @@ ls -lh
./run-test-version.sh ./run-test-version.sh
./run-zipformer-ctc-asr.sh
rm -rf sherpa-onnx-zipformer-*
./run-decode-file-sense-voice-with-hr.sh ./run-decode-file-sense-voice-with-hr.sh
rm -rf sherpa-onnx-sense-voice-* rm -rf sherpa-onnx-sense-voice-*
rm -rf dict lexicon.txt replace.fst test-hr.wav rm -rf dict lexicon.txt replace.fst test-hr.wav

View File

@@ -89,6 +89,7 @@ jobs:
make -j4 install make -j4 install
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
rm -rf install/lib/pkgconfig rm -rf install/lib/pkgconfig
rm -fv install/lib/cargs.h rm -fv install/lib/cargs.h
@@ -135,6 +136,7 @@ jobs:
make -j4 install make -j4 install
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
rm -rf install/lib/pkgconfig rm -rf install/lib/pkgconfig
rm -fv install/lib/cargs.h rm -fv install/lib/cargs.h

View File

@@ -90,6 +90,7 @@ jobs:
make install make install
cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
cp bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
ls -lh install/lib ls -lh install/lib

View File

@@ -37,7 +37,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest, macos-latest, macos-13, windows-latest] os: [ubuntu-latest, macos-latest, macos-13, windows-latest, ubuntu-22.04-arm]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -56,7 +56,7 @@ jobs:
key: ${{ matrix.os }} key: ${{ matrix.os }}
- name: Install Free pascal compiler (ubuntu) - name: Install Free pascal compiler (ubuntu)
if: matrix.os == 'ubuntu-latest' if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-22.04-arm'
shell: bash shell: bash
run: | run: |
sudo apt-get update sudo apt-get update
@@ -156,6 +156,10 @@ jobs:
pushd non-streaming-asr pushd non-streaming-asr
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo "---"
./run-dolphin-ctc.sh ./run-dolphin-ctc.sh
rm -rf sherpa-onnx-* rm -rf sherpa-onnx-*
echo "---" echo "---"
@@ -264,9 +268,12 @@ jobs:
cd ./pascal-api-examples cd ./pascal-api-examples
pushd vad-with-non-streaming-asr pushd vad-with-non-streaming-asr
time ./run-vad-with-zipformer-ctc.sh
rm -rf sherpa-onnx-*
echo "---"
time ./run-vad-with-dolphin-ctc.sh time ./run-vad-with-dolphin-ctc.sh
rm -rf sherpa-onnx-* rm -rf sherpa-onnx-*
echo "---" echo "---"

View File

@@ -165,6 +165,9 @@ jobs:
run: | run: |
cd ./java-api-examples cd ./java-api-examples
./run-non-streaming-decode-file-zipformer-ctc.sh
rm -rf sherpa-onnx-zipformer-ctc-*
./run-non-streaming-decode-file-dolphin-ctc.sh ./run-non-streaming-decode-file-dolphin-ctc.sh
rm -rf sherpa-onnx-dolphin-* rm -rf sherpa-onnx-dolphin-*

View File

@@ -184,6 +184,10 @@ jobs:
go build go build
ls -lh ls -lh
echo "Test Zipformer CTC"
./run-zipformer-ctc.sh
rm -rf sherpa-onnx-zipformer-*
echo "Test SenseVoice ctc" echo "Test SenseVoice ctc"
./run-sense-voice-small-with-hr.sh ./run-sense-voice-small-with-hr.sh
./run-sense-voice-small.sh ./run-sense-voice-small.sh

View File

@@ -19,12 +19,36 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
python-version: ["3.8"] python-version: ["3.10"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Zipformer CTC (non-streaming)
shell: bash
run: |
git lfs install
names=(
sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
sherpa-onnx-zipformer-ctc-zh-2025-07-03
sherpa-onnx-zipformer-ctc-zh-fp16-2025-07-03
)
for name in ${names[@]}; do
git clone https://huggingface.co/csukuangfj/$name
pushd $name
git lfs pull
rm -rf .git
rm -rfv .gitattributes
ls -lh
popd
tar cjfv $name.tar.bz2 $name
rm -rf $name
ls -lh *.tar.bz2
done
- name: Vietnamese (zipformer) - name: Vietnamese (zipformer)
if: false
shell: bash shell: bash
run: | run: |
rm -rf models rm -rf models
@@ -76,6 +100,7 @@ jobs:
mv models/* . mv models/* .
- name: Publish to huggingface (Vietnamese zipformer) - name: Publish to huggingface (Vietnamese zipformer)
if: false
env: env:
HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3 uses: nick-fields/retry@v3

View File

@@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below:
|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| |Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]|
|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| |Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]|
|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| |Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]|
|VAD + speech recognition (Chinese) with [Zipformer CTC](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|[Click me][wasm-hf-vad-asr-zh-zipformer-ctc-07-03]| [地址][wasm-ms-vad-asr-zh-zipformer-ctc-07-03]|
|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| |VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]|
|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| |VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]|
|VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]| |VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]|
@@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below:
|----------------------------------------|------------------------------------|-----------------------------------| |----------------------------------------|------------------------------------|-----------------------------------|
| Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]| | Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]|
| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] | | Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] |
| Simulated-streaming speech recognition | [Address][apk-simula-streaming-asr]| [点此][apk-simula-streaming-asr-cn]|
| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | | Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] |
| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | | Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] |
| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | | VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] |
@@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them.
|Name | Supported Languages| Description| |Name | Supported Languages| Description|
|-----|-----|----| |-----|-----|----|
|[sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english)| English | It is converted from <https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2>|
|[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)| |[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)|
|[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)| |[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)|
|[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|Chinese| A Zipformer CTC model|
|[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)| |[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)|
|[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)| |[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)|
|[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)| |[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)|
@@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en [wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en
[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en [wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en
[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice [SenseVoice]: https://github.com/FunAudioLLM/SenseVoice
[wasm-hf-vad-asr-zh-zipformer-ctc-07-03]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc
[wasm-ms-vad-asr-zh-zipformer-ctc-07-03]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc/summary
[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice [wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice
[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice [wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice
[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny [wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny
@@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
[wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech [wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech
[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech [wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech [wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf [reazonspeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer [wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer [wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 [gigaspeech2]: https://github.com/speechcolab/gigaspeech2
[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer [wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer
[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer [wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer
[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR [telespeech-asr]: https://github.com/tele-ai/telespeech-asr
[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech [wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech [wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer [wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
[Dolphin]: https://github.com/DataoceanAI/Dolphin [dolphin]: https://github.com/dataoceanai/dolphin
[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc [wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc [wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
@@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
[apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html [apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html
[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html
[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html
[apk-simula-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr.html
[apk-simula-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr-cn.html
[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html
[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html [apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html
[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html [apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html

View File

@@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
sherpa-onnx-cxx-api sherpa-onnx-cxx-api
portaudio_static portaudio_static
) )
add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
)
target_link_libraries(zipformer-ctc-simulate-streaming-microphone-cxx-api
sherpa-onnx-cxx-api
portaudio_static
)
endif() endif()
if(SHERPA_ONNX_HAS_ALSA) if(SHERPA_ONNX_HAS_ALSA)
@@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA)
portaudio_static portaudio_static
) )
add_executable(zipformer-ctc-simulate-streaming-alsa-cxx-api
./zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
)
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api
sherpa-onnx-cxx-api
portaudio_static
)
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
else() else()
target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound) target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api asound)
endif() endif()
endif() endif()

View File

@@ -0,0 +1,240 @@
// cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
// Copyright (c) 2025 Xiaomi Corporation
//
// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API
// for simulated streaming speech recognition from a microphone (via ALSA).
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
//
// clang-format on
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <iostream>
#include <mutex> // NOLINT
#include <queue>
#include <thread> // NOLINT
#include <vector>
#include "sherpa-display.h" // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/alsa.h"
std::queue<std::vector<float>> samples_queue;
std::condition_variable condition_variable;
std::mutex mutex;
bool stop = false;
// Signal handler; main() installs it for SIGINT (Ctrl + C).
//
// It raises the global `stop` flag and then wakes one waiter on the global
// condition variable so that a thread blocked in wait() can observe the flag.
//
// NOTE(review): `stop` is a plain bool that is also read by other threads, and
// notify_one()/fprintf() are not on the list of async-signal-safe functions.
// This is tolerated for a demo program - confirm before reusing elsewhere.
static void Handler(int32_t /*sig*/) {
stop = true;
condition_variable.notify_one();
fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n");
}
// Body of the recording thread.
//
// Repeatedly reads 0.1 seconds worth of samples (computed from the actual
// sample rate reported by `alsa`) and appends each chunk to the global
// `samples_queue`, waking one waiter after every push. The loop ends once
// the global `stop` flag is observed to be true.
//
// @param alsa  Opened ALSA capture device to read from. Not owned.
static void RecordCallback(sherpa_onnx::Alsa *alsa) {
  const int32_t samples_per_read = 0.1 * alsa->GetActualSampleRate();

  for (;;) {
    if (stop) {
      break;
    }

    std::vector<float> chunk_samples = alsa->Read(samples_per_read);

    // The queue is shared with the main thread, so guard the push.
    std::lock_guard<std::mutex> guard(mutex);
    samples_queue.push(std::move(chunk_samples));
    condition_variable.notify_one();
  }
}
// Builds the silero-based voice activity detector used to segment the
// microphone audio.
//
// The model is loaded from "./silero_vad.onnx" and expects 16 kHz input.
// If the detector cannot be created, an error is printed and the process
// exits with status -1.
//
// @return A valid VoiceActivityDetector.
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  namespace sx = sherpa_onnx::cxx;

  sx::VadModelConfig vad_config;
  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.5;
  silero.min_silence_duration = 0.1;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 8;

  // Second argument is passed through unchanged from the original example
  // (presumably the internal buffer size in seconds - TODO confirm).
  sx::VoiceActivityDetector detector =
      sx::VoiceActivityDetector::Create(vad_config, 20);

  if (!detector.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    exit(-1);
  }

  return detector;
}
// Builds the non-streaming recognizer backed by the int8 Zipformer CTC model.
//
// Model and token files are read from the directory
// "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03". Progress is reported on
// stdout. If the recognizer cannot be created, an error is printed and the
// process exits with status -1.
//
// @return A valid OfflineRecognizer.
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  namespace sx = sherpa_onnx::cxx;

  sx::OfflineRecognizerConfig recognizer_config;

  auto &model_config = recognizer_config.model_config;
  model_config.num_threads = 2;
  model_config.debug = false;
  model_config.tokens =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  model_config.zipformer_ctc.model =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";

  std::cout << "Loading model\n";
  sx::OfflineRecognizer asr = sx::OfflineRecognizer::Create(recognizer_config);

  if (!asr.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }

  std::cout << "Loading model done\n";
  return asr;
}
// Entry point of the ALSA simulated-streaming example.
//
// Usage: zipformer-ctc-simulate-streaming-alsa-cxx-api device_name
//
// A background thread (RecordCallback) pushes microphone chunks into
// `samples_queue`. This function drains the queue into `buffer`, feeds the
// buffer to the VAD in fixed-size windows, and
//   - while speech is in progress, re-decodes the whole buffer roughly every
//     0.2 seconds to show a partial result;
//   - whenever the VAD emits a finished segment, decodes that segment and
//     finalizes the current sentence on the display.
//
// @param argc  Must be 2.
// @param argv  argv[1] is the ALSA capture device name, e.g. "plughw:3,0".
// @return 0 on normal termination (Ctrl + C), -1 on wrong usage. The process
//         exits with -1 if the device does not deliver the expected sample
//         rate.
int32_t main(int32_t argc, const char *argv[]) {
  const char *kUsageMessage = R"usage(
Usage:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
./zipformer-ctc-simulate-streaming-alsa-cxx-api device_name
The device name specifies which microphone to use in case there are several
on your system. You can use
arecord -l
to find all available microphones on your computer. For instance, if it outputs
**** List of CAPTURE Hardware Devices ****
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
Subdevices: 1/1
Subdevice #0: subdevice #0
and if you want to select card 3 and device 0 on that card, please use:
plughw:3,0
as the device_name.
)usage";

  if (argc != 2) {
    fprintf(stderr, "%s\n", kUsageMessage);
    return -1;
  }

  signal(SIGINT, Handler);

  using namespace sherpa_onnx::cxx;  // NOLINT

  auto vad = CreateVad();
  auto recognizer = CreateOfflineRecognizer();

  int32_t expected_sample_rate = 16000;

  std::string device_name = argv[1];
  sherpa_onnx::Alsa alsa(device_name.c_str());
  fprintf(stderr, "Use recording device: %s\n", device_name.c_str());

  if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
    fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
            expected_sample_rate);
    exit(-1);
  }

  int32_t window_size = 512;  // samples, please don't change

  // Index into `buffer` of the first sample not yet given to the VAD.
  int32_t offset = 0;
  std::vector<float> buffer;

  bool speech_started = false;

  // Time of the last partial decode (or of the speech onset).
  auto started_time = std::chrono::steady_clock::now();

  SherpaDisplay display;

  std::thread record_thread(RecordCallback, &alsa);

  std::cout << "Started! Please speak\n";

  while (!stop) {
    {
      std::unique_lock<std::mutex> lock(mutex);
      while (samples_queue.empty() && !stop) {
        condition_variable.wait(lock);
      }

      // The wait above also ends when Ctrl + C sets `stop`. In that case the
      // queue may still be empty, and calling front()/pop() on an empty
      // std::queue is undefined behavior, so leave the loop instead.
      if (samples_queue.empty()) {
        break;
      }

      const auto &s = samples_queue.front();
      buffer.insert(buffer.end(), s.begin(), s.end());

      samples_queue.pop();
    }

    // Use a signed size so that comparisons and arithmetic with `offset` and
    // `window_size` do not mix signed and unsigned operands.
    const int32_t num_buffered = static_cast<int32_t>(buffer.size());

    for (; offset + window_size < num_buffered; offset += window_size) {
      vad.AcceptWaveform(buffer.data() + offset, window_size);
      if (!speech_started && vad.IsDetected()) {
        speech_started = true;
        started_time = std::chrono::steady_clock::now();
      }
    }

    if (!speech_started) {
      // No speech yet: keep only the most recent 10 windows so the buffer
      // does not grow without bound, and shift `offset` accordingly.
      const int32_t num_to_keep = 10 * window_size;
      if (num_buffered > num_to_keep) {
        offset -= num_buffered - num_to_keep;
        buffer.erase(buffer.begin(), buffer.end() - num_to_keep);
      }
    }

    auto current_time = std::chrono::steady_clock::now();
    const float elapsed_seconds =
        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
                                                              started_time)
            .count() /
        1000.;

    if (speech_started && elapsed_seconds > 0.2) {
      // Partial result: decode everything buffered so far.
      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.Display();

      started_time = std::chrono::steady_clock::now();
    }

    while (!vad.IsEmpty()) {
      // Final result: the VAD has produced a complete speech segment.
      auto segment = vad.Front();

      vad.Pop();

      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(expected_sample_rate, segment.samples.data(),
                            segment.samples.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);

      display.UpdateText(result.text);
      display.FinalizeCurrentSentence();
      display.Display();

      // Start over for the next utterance.
      buffer.clear();
      offset = 0;
      speech_started = false;
    }
  }

  record_thread.join();

  return 0;
}

View File

@@ -0,0 +1,237 @@
// cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
// Copyright (c) 2025 Xiaomi Corporation
//
// This file demonstrates how to use a non-streaming Zipformer CTC model and
// silero VAD with sherpa-onnx's C++ API to simulate streaming speech
// recognition from a microphone.
//
// clang-format off
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
//
// clang-format on
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <iostream>
#include <mutex> // NOLINT
#include <queue>
#include <vector>
#include "portaudio.h" // NOLINT
#include "sherpa-display.h" // NOLINT
#include "sherpa-onnx/c-api/cxx-api.h"
#include "sherpa-onnx/csrc/microphone.h"
// Audio chunks pushed by RecordCallback() and popped by the loop in main().
// Both sides hold `mutex` while touching it.
std::queue<std::vector<float>> samples_queue;
// Notified by RecordCallback() after a chunk is queued and by Handler() when
// Ctrl + C is pressed; main() waits on it while the queue is empty.
std::condition_variable condition_variable;
// Locked by RecordCallback() and main() around their accesses to samples_queue.
std::mutex mutex;
// Set to true by Handler(); read by main() and RecordCallback() to shut down.
bool stop = false;
// SIGINT handler installed by main(): raises the global `stop` flag, wakes up
// the thread waiting on `condition_variable`, and tells the user we are
// exiting. The signal number is not used.
static void Handler(int32_t /*sig*/) {
  stop = true;

  // main() may be blocked waiting for audio; wake it so it can observe `stop`.
  condition_variable.notify_one();

  fputs("\nCaught Ctrl + C. Exiting...\n", stderr);
}
// Audio callback handed to mic.OpenDevice() in main().
//
// Interprets `input_buffer` as `frames_per_buffer` float samples, copies them
// into a new chunk at the back of `samples_queue` (under `mutex`) and notifies
// `condition_variable`.
//
// Returns paComplete once `stop` has been set, paContinue otherwise. All other
// parameters are ignored.
static int32_t RecordCallback(const void *input_buffer,
                              void * /*output_buffer*/,
                              unsigned long frames_per_buffer,  // NOLINT
                              const PaStreamCallbackTimeInfo * /*time_info*/,
                              PaStreamCallbackFlags /*status_flags*/,
                              void * /*user_data*/) {
  const float *first = reinterpret_cast<const float *>(input_buffer);
  const float *last = first + frames_per_buffer;

  std::lock_guard<std::mutex> guard(mutex);
  samples_queue.emplace(first, last);
  condition_variable.notify_one();

  if (stop) {
    return paComplete;
  }
  return paContinue;
}
// Builds the silero VAD used by main() from ./silero_vad.onnx.
//
// Prints an error and terminates the process with exit(-1) if the detector
// cannot be created; otherwise returns the detector.
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  VadModelConfig vad_config;
  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.5;
  silero.min_silence_duration = 0.1;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 8;

  // NOTE(review): the second argument (20) is presumably the VAD buffer size
  // in seconds -- confirm against the C++ API header.
  VoiceActivityDetector detector = VoiceActivityDetector::Create(vad_config, 20);
  if (!detector.Get()) {
    std::cerr << "Failed to create VAD. Please check your config\n";
    exit(-1);
  }

  return detector;
}
// Builds the non-streaming recognizer from the Zipformer CTC model files in
// ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.
//
// Prints progress to stdout. Prints an error and terminates the process with
// exit(-1) if the recognizer cannot be created.
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  using namespace sherpa_onnx::cxx;  // NOLINT

  OfflineRecognizerConfig recognizer_config;
  auto &model_config = recognizer_config.model_config;
  model_config.zipformer_ctc.model =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
  model_config.tokens =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  model_config.num_threads = 2;
  model_config.debug = false;

  std::cout << "Loading model\n";
  OfflineRecognizer result = OfflineRecognizer::Create(recognizer_config);
  if (!result.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return result;
}
// Simulates streaming recognition with a non-streaming Zipformer CTC model.
//
// Microphone audio is fed to the VAD in fixed-size windows. Once speech is
// detected, everything buffered so far is re-decoded roughly every 0.2 seconds
// to refresh the displayed text; each segment emitted by the VAD is decoded
// once more to finalize the sentence.
//
// Environment variables read:
//   SHERPA_ONNX_MIC_DEVICE       index of the input device to open
//   SHERPA_ONNX_MIC_SAMPLE_RATE  sample rate used to open the microphone;
//                                audio is resampled to 16000 Hz if it differs
//
// Returns 0 after Ctrl + C, or -1 if no audio device is found or the
// microphone cannot be opened.
int32_t main() {
  signal(SIGINT, Handler);

  using namespace sherpa_onnx::cxx;  // NOLINT

  auto vad = CreateVad();
  auto recognizer = CreateOfflineRecognizer();

  sherpa_onnx::Microphone mic;

  PaDeviceIndex num_devices = Pa_GetDeviceCount();
  if (num_devices == 0) {
    std::cerr << "  If you are using Linux, please try "
                 "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n";
    return -1;
  }

  int32_t device_index = Pa_GetDefaultInputDevice();
  const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
  if (pDeviceIndex) {
    fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
    device_index = atoi(pDeviceIndex);
  }
  mic.PrintDevices(device_index);

  float mic_sample_rate = 16000;
  const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
  if (sample_rate_str) {
    // Parse first, then report, so the message shows the rate actually used
    // instead of the 16000 default.
    mic_sample_rate = atof(sample_rate_str);
    fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
  }

  // Sample rate of the audio handed to the VAD and to the recognizer.
  float sample_rate = 16000;

  // Stays empty (resampler.Get() is null) unless the microphone rate differs.
  LinearResampler resampler;
  if (mic_sample_rate != sample_rate) {
    float min_freq = std::min(mic_sample_rate, sample_rate);
    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
    int32_t lowpass_filter_width = 6;

    resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
                                        lowpass_cutoff, lowpass_filter_width);
  }

  if (!mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
                      nullptr)) {
    std::cerr << "Failed to open microphone device\n";
    return -1;
  }

  int32_t window_size = 512;  // samples, please don't change

  // Index into `buffer` of the first sample not yet given to the VAD.
  int32_t offset = 0;
  std::vector<float> buffer;
  bool speech_started = false;

  // Time of the last (re-)decode, or of speech onset if none happened yet.
  auto started_time = std::chrono::steady_clock::now();

  SherpaDisplay display;

  std::cout << "Started! Please speak\n";

  while (!stop) {
    {
      std::unique_lock<std::mutex> lock(mutex);
      while (samples_queue.empty() && !stop) {
        condition_variable.wait(lock);
      }

      // The wait above also ends when Handler() sets `stop`. In that case the
      // queue may still be empty, and calling front() on an empty std::queue
      // is undefined behavior, so leave the main loop instead.
      if (samples_queue.empty()) {
        break;
      }

      const auto &s = samples_queue.front();
      if (!resampler.Get()) {
        buffer.insert(buffer.end(), s.begin(), s.end());
      } else {
        auto resampled = resampler.Resample(s.data(), s.size(), false);
        buffer.insert(buffer.end(), resampled.begin(), resampled.end());
      }

      samples_queue.pop();
    }

    // Feed every complete, not yet processed window to the VAD.
    for (; offset + window_size < buffer.size(); offset += window_size) {
      vad.AcceptWaveform(buffer.data() + offset, window_size);
      if (!speech_started && vad.IsDetected()) {
        speech_started = true;
        started_time = std::chrono::steady_clock::now();
      }
    }

    if (!speech_started) {
      // No speech yet: keep only the last 10 windows so the buffer does not
      // grow without bound, and shift `offset` by the number of dropped
      // samples so it still refers to the same position.
      if (buffer.size() > 10 * window_size) {
        offset -= buffer.size() - 10 * window_size;
        buffer = {buffer.end() - 10 * window_size, buffer.end()};
      }
    }

    auto current_time = std::chrono::steady_clock::now();
    const float elapsed_seconds =
        std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
                                                              started_time)
            .count() /
        1000.;

    // Interim result: re-decode the whole buffer at most every 0.2 seconds.
    if (speech_started && elapsed_seconds > 0.2) {
      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.Display();

      started_time = std::chrono::steady_clock::now();
    }

    // Final result: decode each segment the VAD has emitted, then start over
    // with an empty buffer.
    while (!vad.IsEmpty()) {
      auto segment = vad.Front();
      vad.Pop();

      OfflineStream stream = recognizer.CreateStream();
      stream.AcceptWaveform(sample_rate, segment.samples.data(),
                            segment.samples.size());

      recognizer.Decode(&stream);

      OfflineRecognizerResult result = recognizer.GetResult(&stream);
      display.UpdateText(result.text);
      display.FinalizeCurrentSentence();
      display.Display();

      buffer.clear();
      offset = 0;
      speech_started = false;
    }
  }

  return 0;
}

View File

@@ -0,0 +1,52 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Decodes one wave file with a non-streaming Zipformer CTC model and prints
/// the recognized text.
///
/// Requires `--model`, `--tokens` and `--input-wav`. If any of them is
/// missing, the usage text is printed and the process exits with status 1.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final argParser = ArgParser();
  argParser.addOption('model', help: 'Path to the Zipformer CTC model');
  argParser.addOption('tokens', help: 'Path to tokens.txt');
  argParser.addOption('input-wav', help: 'Path to input.wav to transcribe');

  final parsed = argParser.parse(arguments);

  const requiredOptions = ['model', 'tokens', 'input-wav'];
  if (requiredOptions.any((name) => parsed[name] == null)) {
    print(argParser.usage);
    exit(1);
  }

  final modelPath = parsed['model'] as String;
  final tokensPath = parsed['tokens'] as String;
  final wavPath = parsed['input-wav'] as String;

  final recognizer = sherpa_onnx.OfflineRecognizer(
    sherpa_onnx.OfflineRecognizerConfig(
      model: sherpa_onnx.OfflineModelConfig(
        zipformerCtc:
            sherpa_onnx.OfflineZipformerCtcModelConfig(model: modelPath),
        tokens: tokensPath,
        debug: true,
        numThreads: 1,
      ),
    ),
  );

  final wave = sherpa_onnx.readWave(wavPath);

  final stream = recognizer.createStream();
  stream.acceptWaveform(samples: wave.samples, sampleRate: wave.sampleRate);
  recognizer.decode(stream);
  print(recognizer.getResult(stream).text);

  // Release the native objects explicitly.
  stream.free();
  recognizer.free();
}

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Runs the Dart example ./bin/zipformer-ctc.dart on a test wave shipped with
# the pre-trained Zipformer CTC model.
# -e: stop at the first failing command; -x: print each command before running.
set -ex
# Fetch the Dart package dependencies.
dart pub get
# Download and unpack the model only if its tokens.txt is not present yet;
# the archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
# Decode test_wavs/0.wav from the model directory.
dart run \
./bin/zipformer-ctc.dart \
--model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
--tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
--input-wav ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav

View File

@@ -0,0 +1,118 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';
import 'dart:typed_data';
import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
import './init.dart';
/// Splits a 16 kHz wave file into speech segments with silero VAD and decodes
/// each segment with a non-streaming Zipformer CTC model.
///
/// Requires `--silero-vad`, `--model`, `--tokens` and `--input-wav`. If any of
/// them is missing, the usage text is printed and the process exits with
/// status 1. The process also exits with status 1 if the wave file is not
/// sampled at 16000 Hz.
///
/// For every segment one line is printed: `start -- end : text`, where start
/// and end are in seconds.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the Zipformer CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    maxSpeechDuration: 5.0,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create offline recognizer
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    zipformerCtc: zipformerCtc,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Decodes and prints every segment currently queued in the VAD, removing
  // each one from the queue once it has been printed. Shared by the main loop
  // and by the final drain after flush().
  void decodeQueuedSegments() {
    while (!vad.isEmpty()) {
      // Fetch the segment once and reuse it for both samples and start.
      final segment = vad.front();
      final samples = segment.samples;
      final startTime = segment.start.toDouble() / waveData.sampleRate;
      final endTime =
          startTime + samples.length.toDouble() / waveData.sampleRate;

      final stream = recognizer.createStream();
      stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate);
      recognizer.decode(stream);

      final result = recognizer.getResult(stream);
      stream.free();

      print(
          '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

      vad.pop();
    }
  }

  final windowSize = vadConfig.sileroVad.windowSize;
  final numSamples = waveData.samples.length;

  // Integer division: only complete windows are handed to the VAD.
  final numIter = numSamples ~/ windowSize;

  for (int i = 0; i != numIter; ++i) {
    final start = i * windowSize;
    vad.acceptWaveform(
        Float32List.sublistView(waveData.samples, start, start + windowSize));
    decodeQueuedSegments();
  }

  // Flush the VAD, then decode whatever segments are queued afterwards.
  vad.flush();
  decodeQueuedSegments();

  vad.free();
  recognizer.free();
}

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Runs the Dart VAD + non-streaming Zipformer CTC example
# (./bin/zipformer-ctc.dart) on lei-jun-test.wav.
# -e: stop at the first failing command; -x: print each command before running.
set -ex
# Fetch the Dart package dependencies.
dart pub get
# Download and unpack the model only if its tokens.txt is not present yet;
# the archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
# Download the wave file to transcribe if it is missing.
if [ ! -f ./lei-jun-test.wav ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi
# Download the silero VAD model if it is missing.
if [[ ! -f ./silero_vad.onnx ]]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi
# Segment the wave file with the VAD and decode every segment.
dart run \
./bin/zipformer-ctc.dart \
--silero-vad ./silero_vad.onnx \
--model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
--tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
--input-wav ./lei-jun-test.wav

View File

@@ -75,6 +75,9 @@ class OfflineDecodeFiles
[Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
public string NeMoCtc { get; set; } = string.Empty; public string NeMoCtc { get; set; } = string.Empty;
// Value of the optional --zipformer-ctc command line option: path to the
// Zipformer CTC model file. Defaults to the empty string.
[Option("zipformer-ctc", Required = false, HelpText = "Path to model.onnx. Used only for Zipformer CTC models")]
public string ZipformerCtc { get; set; } = string.Empty;
[Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")] [Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")]
public string DolphinModel { get; set; } = string.Empty; public string DolphinModel { get; set; } = string.Empty;
@@ -240,6 +243,10 @@ to download pre-trained Tdnn models.
{ {
config.ModelConfig.Dolphin.Model = options.DolphinModel; config.ModelConfig.Dolphin.Model = options.DolphinModel;
} }
else if (!string.IsNullOrEmpty(options.ZipformerCtc))
{
config.ModelConfig.ZipformerCtc.Model = options.ZipformerCtc;
}
else if (!string.IsNullOrEmpty(options.TeleSpeechCtc)) else if (!string.IsNullOrEmpty(options.TeleSpeechCtc))
{ {
config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;

View File

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Runs the .NET offline decoding example with a Zipformer CTC model on the
# three test waves shipped with the model.
# -e: stop at the first failing command; -x: print each command before running.
set -ex
# Download and unpack the model only if its tokens.txt is not present yet;
# the archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
# Decode 0.wav, 1.wav and 8k.wav with a single thread.
dotnet run \
--tokens=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
--zipformer-ctc=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
--num-threads=1 \
--files ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav \
./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/1.wav \
./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/8k.wav

View File

@@ -104,6 +104,27 @@ class OfflineDolphinModelConfig {
final String model; final String model;
} }
/// Configuration of a non-streaming Zipformer CTC model.
class OfflineZipformerCtcModelConfig {
  /// Path to the model file. Defaults to the empty string.
  final String model;

  const OfflineZipformerCtcModelConfig({this.model = ''});

  /// Builds a config from [json]; a missing or null `model` entry becomes ''.
  factory OfflineZipformerCtcModelConfig.fromJson(Map<String, dynamic> json) =>
      OfflineZipformerCtcModelConfig(model: (json['model'] as String?) ?? '');

  /// Returns a map with the single key `model`.
  Map<String, dynamic> toJson() {
    return <String, dynamic>{'model': model};
  }

  @override
  String toString() => 'OfflineZipformerCtcModelConfig(model: $model)';
}
class OfflineWhisperModelConfig { class OfflineWhisperModelConfig {
const OfflineWhisperModelConfig( const OfflineWhisperModelConfig(
{this.encoder = '', {this.encoder = '',
@@ -288,6 +309,7 @@ class OfflineModelConfig {
this.moonshine = const OfflineMoonshineModelConfig(), this.moonshine = const OfflineMoonshineModelConfig(),
this.fireRedAsr = const OfflineFireRedAsrModelConfig(), this.fireRedAsr = const OfflineFireRedAsrModelConfig(),
this.dolphin = const OfflineDolphinModelConfig(), this.dolphin = const OfflineDolphinModelConfig(),
this.zipformerCtc = const OfflineZipformerCtcModelConfig(),
required this.tokens, required this.tokens,
this.numThreads = 1, this.numThreads = 1,
this.debug = true, this.debug = true,
@@ -336,6 +358,10 @@ class OfflineModelConfig {
? OfflineDolphinModelConfig.fromJson( ? OfflineDolphinModelConfig.fromJson(
json['dolphin'] as Map<String, dynamic>) json['dolphin'] as Map<String, dynamic>)
: const OfflineDolphinModelConfig(), : const OfflineDolphinModelConfig(),
zipformerCtc: json['zipformerCtc'] != null
? OfflineZipformerCtcModelConfig.fromJson(
json['zipformerCtc'] as Map<String, dynamic>)
: const OfflineZipformerCtcModelConfig(),
tokens: json['tokens'] as String, tokens: json['tokens'] as String,
numThreads: json['numThreads'] as int? ?? 1, numThreads: json['numThreads'] as int? ?? 1,
debug: json['debug'] as bool? ?? true, debug: json['debug'] as bool? ?? true,
@@ -349,7 +375,7 @@ class OfflineModelConfig {
@override @override
String toString() { String toString() {
return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
} }
Map<String, dynamic> toJson() => { Map<String, dynamic> toJson() => {
@@ -362,6 +388,7 @@ class OfflineModelConfig {
'moonshine': moonshine.toJson(), 'moonshine': moonshine.toJson(),
'fireRedAsr': fireRedAsr.toJson(), 'fireRedAsr': fireRedAsr.toJson(),
'dolphin': dolphin.toJson(), 'dolphin': dolphin.toJson(),
'zipformerCtc': zipformerCtc.toJson(),
'tokens': tokens, 'tokens': tokens,
'numThreads': numThreads, 'numThreads': numThreads,
'debug': debug, 'debug': debug,
@@ -381,6 +408,7 @@ class OfflineModelConfig {
final OfflineMoonshineModelConfig moonshine; final OfflineMoonshineModelConfig moonshine;
final OfflineFireRedAsrModelConfig fireRedAsr; final OfflineFireRedAsrModelConfig fireRedAsr;
final OfflineDolphinModelConfig dolphin; final OfflineDolphinModelConfig dolphin;
final OfflineZipformerCtcModelConfig zipformerCtc;
final String tokens; final String tokens;
final int numThreads; final int numThreads;
@@ -578,6 +606,8 @@ class OfflineRecognizer {
config.model.fireRedAsr.decoder.toNativeUtf8(); config.model.fireRedAsr.decoder.toNativeUtf8();
c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8(); c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8();
c.ref.model.zipformerCtc.model =
config.model.zipformerCtc.model.toNativeUtf8();
c.ref.model.tokens = config.model.tokens.toNativeUtf8(); c.ref.model.tokens = config.model.tokens.toNativeUtf8();
@@ -623,6 +653,7 @@ class OfflineRecognizer {
calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.modelType);
calloc.free(c.ref.model.provider); calloc.free(c.ref.model.provider);
calloc.free(c.ref.model.tokens); calloc.free(c.ref.model.tokens);
calloc.free(c.ref.model.zipformerCtc.model);
calloc.free(c.ref.model.dolphin.model); calloc.free(c.ref.model.dolphin.model);
calloc.free(c.ref.model.fireRedAsr.decoder); calloc.free(c.ref.model.fireRedAsr.decoder);
calloc.free(c.ref.model.fireRedAsr.encoder); calloc.free(c.ref.model.fireRedAsr.encoder);

View File

@@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct {
external Pointer<Utf8> model; external Pointer<Utf8> model;
} }
/// FFI struct for the native Zipformer CTC model config.
// NOTE(review): presumably mirrors the C struct of the same name in the
// sherpa-onnx C API -- confirm the field layout against c-api.h.
final class SherpaOnnxOfflineZipformerCtcModelConfig extends Struct {
/// Pointer to a UTF-8 string holding the model path.
external Pointer<Utf8> model;
}
final class SherpaOnnxOfflineWhisperModelConfig extends Struct { final class SherpaOnnxOfflineWhisperModelConfig extends Struct {
external Pointer<Utf8> encoder; external Pointer<Utf8> encoder;
external Pointer<Utf8> decoder; external Pointer<Utf8> decoder;
@@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct {
external SherpaOnnxOfflineMoonshineModelConfig moonshine; external SherpaOnnxOfflineMoonshineModelConfig moonshine;
external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr;
external SherpaOnnxOfflineDolphinModelConfig dolphin; external SherpaOnnxOfflineDolphinModelConfig dolphin;
external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc;
} }
final class SherpaOnnxOfflineRecognizerConfig extends Struct { final class SherpaOnnxOfflineRecognizerConfig extends Struct {

View File

@@ -28,6 +28,8 @@ func main() {
flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model") flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model")
flag.StringVar(&config.ModelConfig.ZipformerCtc.Model, "zipformer-ctc", "", "Path to the Zipformer CTC model")
flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model") flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model")
flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model") flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model")

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Builds the Go non-streaming-decode-files example and runs it with a
# Zipformer CTC model on a test wave shipped with the model.
# -e: stop at the first failing command; -x: print each command before running.
set -ex
# Download and unpack the model only if its tokens.txt is not present yet;
# the archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
# Resolve module dependencies and build the example binary.
go mod tidy
go build
# Decode test_wavs/0.wav.
./non-streaming-decode-files \
--zipformer-ctc ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
--tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
--debug 0 \
./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav

View File

@@ -15,6 +15,7 @@ export { Samples,
OfflineTdnnModelConfig, OfflineTdnnModelConfig,
OfflineSenseVoiceModelConfig, OfflineSenseVoiceModelConfig,
OfflineMoonshineModelConfig, OfflineMoonshineModelConfig,
OfflineZipformerCtcModelConfig,
OfflineModelConfig, OfflineModelConfig,
OfflineLMConfig, OfflineLMConfig,
OfflineRecognizerConfig, OfflineRecognizerConfig,

View File

@@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig(
return c; return c;
} }
static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig( static SherpaOnnxOfflineZipformerCtcModelConfig
GetOfflineZipformerCtcModelConfig(Napi::Object obj) {
SherpaOnnxOfflineZipformerCtcModelConfig c;
memset(&c, 0, sizeof(c));
if (!obj.Has("zipformerCtc") || !obj.Get("zipformerCtc").IsObject()) {
return c;
}
Napi::Object o = obj.Get("zipformerCtc").As<Napi::Object>();
SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
return c;
}
static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinModelConfig(
Napi::Object obj) { Napi::Object obj) {
SherpaOnnxOfflineDolphinModelConfig c; SherpaOnnxOfflineDolphinModelConfig c;
memset(&c, 0, sizeof(c)); memset(&c, 0, sizeof(c));
@@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
c.sense_voice = GetOfflineSenseVoiceModelConfig(o); c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
c.moonshine = GetOfflineMoonshineModelConfig(o); c.moonshine = GetOfflineMoonshineModelConfig(o);
c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o);
c.dolphin = GetOfflineDolphinfig(o); c.dolphin = GetOfflineDolphinModelConfig(o);
c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o);
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model);
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);

View File

@@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig {
public model: string = ''; public model: string = '';
} }
// Configuration of a non-streaming Zipformer CTC model.
export class OfflineZipformerCtcModelConfig {
// Path to the model file. Defaults to the empty string.
public model: string = '';
}
export class OfflineWhisperModelConfig { export class OfflineWhisperModelConfig {
public encoder: string = ''; public encoder: string = '';
public decoder: string = ''; public decoder: string = '';
@@ -97,6 +101,7 @@ export class OfflineModelConfig {
public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig();
public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig();
public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig();
public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig();
} }
export class OfflineLMConfig { export class OfflineLMConfig {

View File

@@ -0,0 +1,50 @@
// Copyright 2025 Xiaomi Corporation
// This file shows how to use an offline Zipformer CTC model,
// i.e., non-streaming Zipformer CTC model,
// to decode files.
import com.k2fsa.sherpa.onnx.*;
/** Decodes one wave file with a non-streaming Zipformer CTC model and prints the text. */
public class NonStreamingDecodeFileZipformerCtc {
  /**
   * Reads the test wave, decodes it with greedy search and prints the file name and the
   * recognized text to stdout. Command line arguments are ignored.
   */
  public static void main(String[] args) {
    // please refer to
    // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
    // to download model files
    final String modelPath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
    final String tokensPath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
    final String waveFilename = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav";

    // The wave file is read before the recognizer is created.
    final WaveReader wave = new WaveReader(waveFilename);

    final OfflineModelConfig modelConfig =
        OfflineModelConfig.builder()
            .setZipformerCtc(OfflineZipformerCtcModelConfig.builder().setModel(modelPath).build())
            .setTokens(tokensPath)
            .setNumThreads(1)
            .setDebug(true)
            .build();

    final OfflineRecognizer recognizer =
        new OfflineRecognizer(
            OfflineRecognizerConfig.builder()
                .setOfflineModelConfig(modelConfig)
                .setDecodingMethod("greedy_search")
                .build());

    final OfflineStream stream = recognizer.createStream();
    stream.acceptWaveform(wave.getSamples(), wave.getSampleRate());
    recognizer.decode(stream);

    final String recognized = recognizer.getResult(stream).getText();
    System.out.printf("filename:%s\nresult:%s\n", waveFilename, recognized);

    // Release the stream before the recognizer that created it.
    stream.release();
    recognizer.release();
  }
}

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
#
# Builds the JNI library and the Java API jar if they are missing, downloads
# the pre-trained Zipformer CTC model if it is missing, and then runs
# NonStreamingDecodeFileZipformerCtc.java.
#
# -e: stop at the first failing command; -x: print each command before running.
set -ex

# Build the JNI shared library unless a .dylib or a .so build already exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar if it does not exist yet.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# Download and unpack the model only if its tokens.txt is not present yet;
# the archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

# $PWD is quoted so that a checkout path containing spaces is passed to java
# as a single -Djava.library.path argument instead of being word-split.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingDecodeFileZipformerCtc.java

View File

@@ -253,6 +253,13 @@ function testOfflineAsr() {
rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2
fi fi
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi
out_filename=test_offline_asr.jar out_filename=test_offline_asr.jar
kotlinc-jvm -include-runtime -d $out_filename \ kotlinc-jvm -include-runtime -d $out_filename \
test_offline_asr.kt \ test_offline_asr.kt \

View File

@@ -1,7 +1,7 @@
package com.k2fsa.sherpa.onnx package com.k2fsa.sherpa.onnx
fun main() { fun main() {
val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25) val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25, 31)
for (type in types) { for (type in types) {
test(type) test(type)
} }
@@ -19,6 +19,7 @@ fun test(type: Int) {
21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav" 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav"
24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav" 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav"
25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav"
31 -> "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
else -> null else -> null
} }

View File

@@ -123,6 +123,7 @@ The following tables list the examples in this folder.
|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
|[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search|
|[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search|
|[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search| |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
@@ -137,6 +138,7 @@ The following tables list the examples in this folder.
|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
|[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)|
|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|[./test_vad_asr_non_streaming_zipformer_ctc_microphone.js](./test_vad_asr_non_streaming_zipformer_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer CTC model with greedy search|
|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
@@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js
``` ```
### Non-streaming speech recognition with Zipformer CTC models
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
node ./test_asr_non_streaming_zipformer_ctc.js
# To run VAD + non-streaming ASR with Zipformer CTC using a microphone
npm install naudiodon2
node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js
```
### Non-streaming speech recognition with NeMo CTC models ### Non-streaming speech recognition with NeMo CTC models
```bash ```bash

View File

@@ -0,0 +1,46 @@
// Copyright (c) 2025 Xiaomi Corporation
const sherpa_onnx = require('sherpa-onnx-node');

// Decodes a single wave file with a non-streaming Zipformer CTC model and
// reports the recognition result together with the real-time factor (RTF).
//
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'zipformerCtc': {
      'model': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
    },
    'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename =
    './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started');

// The timed region covers stream creation, reading the wave file, and
// decoding; constructing the recognizer above is deliberately excluded.
const start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

recognizer.decode(stream);
// Declared with `const`: the previous bare assignment created an implicit
// global and would throw a ReferenceError in strict mode.
const result = recognizer.getResult(stream);
const stop = Date.now();
console.log('Done');

// RTF = processing time / audio duration.
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);

View File

@@ -0,0 +1,109 @@
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
//
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());
const sherpa_onnx = require('sherpa-onnx-node');
// Builds the non-streaming recognizer that uses the Zipformer CTC model.
function createRecognizer() {
  // Please download test files from
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  const featConfig = {
    sampleRate: 16000,
    featureDim: 80,
  };

  const modelConfig = {
    zipformerCtc: {
      model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
    },
    tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
    debug: 1,
  };

  return new sherpa_onnx.OfflineRecognizer({featConfig, modelConfig});
}
// Builds the silero-VAD based voice activity detector.
function createVad() {
  // please download silero_vad.onnx from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  const sileroVad = {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  };

  const vadConfig = {
    sileroVad,
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  // Second constructor argument: the detector's buffer size, in seconds.
  const bufferSizeInSeconds = 60;
  return new sherpa_onnx.Vad(vadConfig, bufferSizeInSeconds);
}
const recognizer = createRecognizer();
const vad = createVad();

// Microphone samples are staged in a circular buffer so that they can be
// handed to the VAD in chunks of exactly `windowSize` samples.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// Running counter of recognized (non-empty) segments; used in the log line
// and in the name of the saved wave file.
let index = 0;

ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));

  // Feed the VAD one window at a time; leftovers stay in the buffer until
  // the next callback delivers more samples.
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Decode every speech segment the VAD has produced so far.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    const stream = recognizer.createStream();
    stream.acceptWaveform({
      samples: segment.samples,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    recognizer.decode(stream);

    const r = recognizer.getResult(stream);
    if (r.text.length > 0) {
      const text = r.text.toLowerCase().trim();
      console.log(`${index}: ${text}`);

      // Save the segment as "<index>-<text>-<HH:MM:SS>.wav".
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
              .split(' ')[0]}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});

      index += 1;
    }
  }
});

ai.start();
console.log('Started! Please speak');

View File

@@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
node ./test-offline-dolphin-ctc.js node ./test-offline-dolphin-ctc.js
``` ```
## ./test-offline-zipformer-ctc.js
[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates
how to decode a file with a Zipformer CTC model. In the code we use
[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese).
You can use the following command to run it:
```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
node ./test-offline-zipformer-ctc.js
```
## ./test-offline-nemo-ctc.js ## ./test-offline-nemo-ctc.js
[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates

View File

@@ -0,0 +1,35 @@
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
//
const fs = require('fs');
const {Readable} = require('stream');
const wav = require('wav');
const sherpa_onnx = require('sherpa-onnx');
// Creates an offline (non-streaming) recognizer for the Zipformer CTC model.
function createOfflineRecognizer() {
  const modelConfig = {
    zipformerCtc: {
      model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
    },
    tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
  };

  return sherpa_onnx.createOfflineRecognizer({modelConfig});
}
// Decode one test wave file and print the recognized text.
const waveFilename =
    './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';

const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();

const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);
recognizer.decode(stream);

const {text} = recognizer.getResult(stream);
console.log(text);

// Release the stream first, then the recognizer that created it.
stream.free();
recognizer.free();

View File

@@ -9,3 +9,4 @@ sense_voice
telespeech_ctc telespeech_ctc
moonshine moonshine
dolphin_ctc dolphin_ctc
zipformer_ctc

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API (if it is not installed yet), downloads the
# Zipformer CTC test model, compiles zipformer_ctc.pas with fpc and runs it.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Every relative path below (model directory, .pas source, produced binary)
# is relative to the directory of this script, so run from there no matter
# where the script was invoked from.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build and install the shared C API only when none of the platform-specific
# library files exists. The check uses the same directory that fpc and the
# dynamic loader are pointed at further down.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./zipformer_ctc.pas

# ${VAR:+:$VAR} appends the previous value only when it is non-empty, which
# avoids a trailing ':' (an implicit "current directory" search entry).
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./zipformer_ctc

View File

@@ -0,0 +1,76 @@
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use a non-streaming Zipformer CTC model
to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program zipformer_ctc;
{$mode objfpc}
uses
sherpa_onnx,
DateUtils,
SysUtils;
var
{Audio read from WaveFilename}
Wave: TSherpaOnnxWave;
WaveFilename: AnsiString;
Config: TSherpaOnnxOfflineRecognizerConfig;
Recognizer: TSherpaOnnxOfflineRecognizer;
Stream: TSherpaOnnxOfflineStream;
RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
{Wall-clock timestamps taken right before and right after decoding}
Start: TDateTime;
Stop: TDateTime;
{Decoding time in seconds}
Elapsed: Single;
{Length of the audio in seconds}
Duration: Single;
{Elapsed / Duration}
RealTimeFactor: Single;
begin
Initialize(Config);
{Only the Zipformer CTC model and the tokens file are configured here}
Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
Config.ModelConfig.Provider := 'cpu';
Config.ModelConfig.NumThreads := 1;
Config.ModelConfig.Debug := False;
WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
Wave := SherpaOnnxReadWave(WaveFilename);
Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
Stream := Recognizer.CreateStream();
{The timed region starts here: reading the wave file and creating the
recognizer and the stream are not included in Elapsed.}
Start := Now;
Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
Recognizer.Decode(Stream);
RecognitionResult := Recognizer.GetResult(Stream);
Stop := Now;
{Milliseconds -> seconds}
Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
{Number of samples / samples per second}
Duration := Length(Wave.Samples) / Wave.SampleRate;
RealTimeFactor := Elapsed / Duration;
WriteLn(RecognitionResult.ToString);
WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
WriteLn(Format('Elapsed %.3f s', [Elapsed]));
WriteLn(Format('Wave duration %.3f s', [Duration]));
WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));
{Free resources to avoid memory leak.
Note: You don't need to invoke them for this simple script.
However, you have to invoke them in your own large/complex project.
}
FreeAndNil(Stream);
FreeAndNil(Recognizer);
end.

View File

@@ -2,3 +2,5 @@
vad_with_whisper vad_with_whisper
vad_with_sense_voice vad_with_sense_voice
vad_with_moonshine vad_with_moonshine
vad_with_zipformer_ctc
vad_with_dolphin

View File

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API (if it is not installed yet), downloads the
# silero VAD model, a test wave file and the Zipformer CTC test model, then
# compiles vad_with_zipformer_ctc.pas with fpc and runs it.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Every relative path below (downloaded files, .pas source, produced binary)
# is relative to the directory of this script, so run from there no matter
# where the script was invoked from.
cd "$SCRIPT_DIR"

BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

# Build and install the shared C API only when none of the platform-specific
# library files exists. The check uses the same directory that fpc and the
# dynamic loader are pointed at further down.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_zipformer_ctc.pas

# ${VAR:+:$VAR} appends the previous value only when it is non-empty, which
# avoids a trailing ':' (an implicit "current directory" search entry).
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_zipformer_ctc

View File

@@ -0,0 +1,135 @@
{ Copyright (c) 2025 Xiaomi Corporation }
{
This file shows how to use a non-streaming Zipformer CTC model
with silero VAD to decode files.
You can download the model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
}
program vad_with_zipformer_ctc;
{$mode objfpc}
uses
sherpa_onnx,
SysUtils;
{Creates the silero-VAD based voice activity detector used by this example.}
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
var
  VadConfig: TSherpaOnnxVadModelConfig;
begin
  Initialize(VadConfig);

  {General settings}
  VadConfig.SampleRate := 16000; {Please don't change it unless you know the details}
  VadConfig.NumThreads := 1;
  VadConfig.Provider := 'cpu';
  VadConfig.Debug := True;

  {silero VAD settings}
  VadConfig.SileroVad.Model := './silero_vad.onnx';
  VadConfig.SileroVad.WindowSize := 512; {Please don't change it unless you know the details}
  VadConfig.SileroVad.Threshold := 0.5;
  VadConfig.SileroVad.MinSpeechDuration := 0.5;
  VadConfig.SileroVad.MinSilenceDuration := 0.5;

  Result := TSherpaOnnxVoiceActivityDetector.Create(VadConfig, 30);
end;
{Creates a non-streaming recognizer that uses the Zipformer CTC model.}
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  RecognizerConfig: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(RecognizerConfig);

  {Runtime settings}
  RecognizerConfig.ModelConfig.NumThreads := 1;
  RecognizerConfig.ModelConfig.Provider := 'cpu';
  RecognizerConfig.ModelConfig.Debug := False;

  {Model files}
  RecognizerConfig.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
  RecognizerConfig.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';

  Result := TSherpaOnnxOfflineRecognizer.Create(RecognizerConfig);
end;
{Decodes every speech segment currently queued in AVad with ARecognizer and
 prints one line per segment in the form "start -- end text", where start
 and end are in seconds.

 AVad:        detector whose queued segments are consumed (popped).
 ARecognizer: recognizer used to create one stream per segment.
 ASampleRate: sample rate of the audio that was fed to AVad; used both for
              decoding and for converting sample counts to seconds.}
procedure DecodeQueuedSegments(AVad: TSherpaOnnxVoiceActivityDetector;
  ARecognizer: TSherpaOnnxOfflineRecognizer; ASampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not AVad.IsEmpty do
  begin
    SpeechSegment := AVad.Front();
    AVad.Pop();

    Stream := ARecognizer.CreateStream();
    Stream.AcceptWaveform(SpeechSegment.Samples, ASampleRate);
    ARecognizer.Decode(Stream);
    RecognitionResult := ARecognizer.GetResult(Stream);

    {Convert sample offsets/counts to seconds}
    Start := SpeechSegment.Start / ASampleRate;
    Duration := Length(SpeechSegment.Samples) / ASampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    {A new stream is created for each segment, so free it here}
    FreeAndNil(Stream);
  end;
end;

var
  Wave: TSherpaOnnxWave;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;
  Offset: Integer;
  WindowSize: Integer;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> Vad.Config.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]));

      {Release what has been created so far before leaving early}
      FreeAndNil(Recognizer);
      FreeAndNil(Vad);
      Exit;
    end;

  WindowSize := Vad.Config.SileroVad.WindowSize;
  Offset := 0;

  {Feed the wave to the VAD one full window at a time and decode segments as
   soon as they become available. A trailing partial window is not fed.}
  while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Offset += WindowSize;

      DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);
    end;

  {Flush the detector, then decode whatever segments it still holds}
  Vad.Flush;
  DecodeQueuedSegments(Vad, Recognizer, Wave.SampleRate);

  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
This file shows how to use a non-streaming zipformer CTC model from icefall
to decode files.
Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
from pathlib import Path
import sherpa_onnx
import soundfile as sf
def create_recognizer():
    """Create a non-streaming Zipformer CTC recognizer.

    Returns:
      A tuple ``(recognizer, test_wav)`` where ``recognizer`` is the object
      returned by ``sherpa_onnx.OfflineRecognizer.from_zipformer_ctc`` and
      ``test_wav`` is the path of the test wave file shipped with the model.

    Raises:
      ValueError: If the model, the tokens file, or the test wave file does
        not exist.
    """
    model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
    tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
    test_wav = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"

    # tokens is checked as well: it is passed to the recognizer below, so a
    # missing file should produce the download hint, not a later failure.
    required_files = (model, tokens, test_wav)
    if not all(Path(f).is_file() for f in required_files):
        raise ValueError(
            """Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
        )

    return (
        sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
            model=model,
            tokens=tokens,
            debug=True,
        ),
        test_wav,
    )
def main():
    """Decode the model's test wave file and print the recognition result."""
    recognizer, wave_filename = create_recognizer()

    # always_2d=True yields a (num_samples, num_channels) array; keep only
    # the first channel, which gives a 1-D float32 array.
    data, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
    samples = data[:, 0]

    # sample_rate does not need to be 16000 Hz
    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(stream)

    print(wave_filename)
    print(stream.result)


if __name__ == "__main__":
    main()

View File

@@ -344,7 +344,7 @@ def get_models():
""", """,
), ),
Model( Model(
model_name="sherpa-onnx-streaming-zipformer-ctc-fp16-zh-2025-06-30", model_name="sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30",
idx=19, idx=19,
lang="zh", lang="zh",
short_name="large_zipformer_fp16", short_name="large_zipformer_fp16",
@@ -360,6 +360,26 @@ def get_models():
ls -lh ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30",
idx=20,
lang="zh",
short_name="large_zipformer_int8",
rule_fsts="itn_zh_number.fst",
cmd="""
if [ ! -f itn_zh_number.fst ]; then
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
fi
pushd $model_name
rm -fv bpe.model
rm -rf test_wavs
ls -lh
popd popd
""", """,
), ),

View File

@@ -548,6 +548,23 @@ def get_models():
ls -lh ls -lh
popd
""",
),
Model(
model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
idx=31,
lang="zh",
lang2="Chinese",
short_name="zipformer_2025_07_03",
cmd="""
pushd $model_name
rm -rfv test_wavs
rm -rfv bbpe.model
ls -lh
popd popd
""", """,
), ),

View File

@@ -27,6 +27,7 @@ namespace SherpaOnnx
Moonshine = new OfflineMoonshineModelConfig(); Moonshine = new OfflineMoonshineModelConfig();
FireRedAsr = new OfflineFireRedAsrModelConfig(); FireRedAsr = new OfflineFireRedAsrModelConfig();
Dolphin = new OfflineDolphinModelConfig(); Dolphin = new OfflineDolphinModelConfig();
ZipformerCtc = new OfflineZipformerCtcModelConfig();
} }
public OfflineTransducerModelConfig Transducer; public OfflineTransducerModelConfig Transducer;
public OfflineParaformerModelConfig Paraformer; public OfflineParaformerModelConfig Paraformer;
@@ -60,5 +61,6 @@ namespace SherpaOnnx
public OfflineMoonshineModelConfig Moonshine; public OfflineMoonshineModelConfig Moonshine;
public OfflineFireRedAsrModelConfig FireRedAsr; public OfflineFireRedAsrModelConfig FireRedAsr;
public OfflineDolphinModelConfig Dolphin; public OfflineDolphinModelConfig Dolphin;
public OfflineZipformerCtcModelConfig ZipformerCtc;
} }
} }

View File

@@ -0,0 +1,18 @@
/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
/// <summary>
/// Configuration of a non-streaming Zipformer CTC model.
/// The struct uses sequential layout and its single string field is
/// marshalled as an LPStr, so its shape must not be changed casually.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct OfflineZipformerCtcModelConfig
{
/// <summary>Initializes <see cref="Model"/> to an empty string.</summary>
public OfflineZipformerCtcModelConfig()
{
Model = "";
}
/// <summary>Path to the model file, e.g., model.int8.onnx.</summary>
[MarshalAs(UnmanagedType.LPStr)]
public string Model;
}
}

View File

@@ -0,0 +1 @@
../../../../go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh

View File

@@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx Model string // Path to the model, e.g., model.onnx or model.int8.onnx
} }
// OfflineZipformerCtcModelConfig holds the configuration of a non-streaming
// Zipformer CTC model.
type OfflineZipformerCtcModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
}
type OfflineDolphinModelConfig struct { type OfflineDolphinModelConfig struct {
Model string // Path to the model, e.g., model.onnx or model.int8.onnx Model string // Path to the model, e.g., model.onnx or model.int8.onnx
} }
@@ -439,16 +443,17 @@ type OfflineLMConfig struct {
} }
type OfflineModelConfig struct { type OfflineModelConfig struct {
Transducer OfflineTransducerModelConfig Transducer OfflineTransducerModelConfig
Paraformer OfflineParaformerModelConfig Paraformer OfflineParaformerModelConfig
NemoCTC OfflineNemoEncDecCtcModelConfig NemoCTC OfflineNemoEncDecCtcModelConfig
Whisper OfflineWhisperModelConfig Whisper OfflineWhisperModelConfig
Tdnn OfflineTdnnModelConfig Tdnn OfflineTdnnModelConfig
SenseVoice OfflineSenseVoiceModelConfig SenseVoice OfflineSenseVoiceModelConfig
Moonshine OfflineMoonshineModelConfig Moonshine OfflineMoonshineModelConfig
FireRedAsr OfflineFireRedAsrModelConfig FireRedAsr OfflineFireRedAsrModelConfig
Dolphin OfflineDolphinModelConfig Dolphin OfflineDolphinModelConfig
Tokens string // Path to tokens.txt ZipformerCtc OfflineZipformerCtcModelConfig
Tokens string // Path to tokens.txt
// Number of threads to use for neural network computation // Number of threads to use for neural network computation
NumThreads int NumThreads int
@@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher
c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder) c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder)
c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model)
c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model)
c.model_config.tokens = C.CString(config.ModelConfig.Tokens) c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
@@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig)
C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder)) C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder))
c.model_config.fire_red_asr.encoder = nil c.model_config.fire_red_asr.encoder = nil
} }
if c.model_config.fire_red_asr.decoder != nil { if c.model_config.fire_red_asr.decoder != nil {
C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder)) C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder))
c.model_config.fire_red_asr.decoder = nil c.model_config.fire_red_asr.decoder = nil
} }
if c.model_config.dolphin.model != nil {
C.free(unsafe.Pointer(c.model_config.dolphin.model))
c.model_config.dolphin.model = nil
}
if c.model_config.zipformer_ctc.model != nil {
C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model))
c.model_config.zipformer_ctc.model = nil
}
if c.model_config.tokens != nil { if c.model_config.tokens != nil {
C.free(unsafe.Pointer(c.model_config.tokens)) C.free(unsafe.Pointer(c.model_config.tokens))
c.model_config.tokens = nil c.model_config.tokens = nil

View File

@@ -212,6 +212,21 @@ def get_models():
git diff git diff
""", """,
), ),
Model(
model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
short_name="vad-asr-zh-zipformer-ctc",
cmd="""
pushd $model_name
mv model.int8.onnx ../zipformer-ctc.onnx
mv tokens.txt ../
popd
rm -rf $model_name
sed -i.bak 's/Zipformer/Zipformer CTC supporting Chinese 中文/g' ../index.html
git diff
""",
),
] ]
return models return models

View File

@@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
recognizer_config.model_config.dolphin.model = recognizer_config.model_config.dolphin.model =
SHERPA_ONNX_OR(config->model_config.dolphin.model, ""); SHERPA_ONNX_OR(config->model_config.dolphin.model, "");
recognizer_config.model_config.zipformer_ctc.model =
SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
recognizer_config.lm_config.model = recognizer_config.lm_config.model =
SHERPA_ONNX_OR(config->lm_config.model, ""); SHERPA_ONNX_OR(config->lm_config.model, "");
recognizer_config.lm_config.scale = recognizer_config.lm_config.scale =

View File

@@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig {
const char *model; const char *model;
} SherpaOnnxOfflineDolphinModelConfig; } SherpaOnnxOfflineDolphinModelConfig;
// Configuration of a non-streaming Zipformer CTC model.
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig {
// Path to the model file, e.g., model.int8.onnx.
// NOTE(review): presumably NULL is treated like "" (model not used) by the
// implementation -- confirm against the config conversion code.
const char *model;
} SherpaOnnxOfflineZipformerCtcModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
SherpaOnnxOfflineTransducerModelConfig transducer; SherpaOnnxOfflineTransducerModelConfig transducer;
SherpaOnnxOfflineParaformerModelConfig paraformer; SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
SherpaOnnxOfflineMoonshineModelConfig moonshine; SherpaOnnxOfflineMoonshineModelConfig moonshine;
SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr; SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
SherpaOnnxOfflineDolphinModelConfig dolphin; SherpaOnnxOfflineDolphinModelConfig dolphin;
SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
} SherpaOnnxOfflineModelConfig; } SherpaOnnxOfflineModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {

View File

@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
c.model_config.dolphin.model = config.model_config.dolphin.model.c_str(); c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
c.model_config.zipformer_ctc.model =
config.model_config.zipformer_ctc.model.c_str();
c.lm_config.model = config.lm_config.model.c_str(); c.lm_config.model = config.lm_config.model.c_str();
c.lm_config.scale = config.lm_config.scale; c.lm_config.scale = config.lm_config.scale;

View File

@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
std::string model; std::string model;
}; };
// Configuration of a non-streaming Zipformer CTC model.
struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
// Path to the model file, e.g., model.int8.onnx.
std::string model;
};
struct SHERPA_ONNX_API OfflineMoonshineModelConfig { struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
std::string preprocessor; std::string preprocessor;
std::string encoder; std::string encoder;
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
OfflineMoonshineModelConfig moonshine; OfflineMoonshineModelConfig moonshine;
OfflineFireRedAsrModelConfig fire_red_asr; OfflineFireRedAsrModelConfig fire_red_asr;
OfflineDolphinModelConfig dolphin; OfflineDolphinModelConfig dolphin;
OfflineZipformerCtcModelConfig zipformer_ctc;
}; };
struct SHERPA_ONNX_API OfflineLMConfig { struct SHERPA_ONNX_API OfflineLMConfig {

View File

@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
const OfflineModelConfig &config) { const OfflineModelConfig &config) {
if (!config.dolphin.model.empty()) { if (!config.dolphin.model.empty()) {
return std::make_unique<OfflineDolphinModel>(config); return std::make_unique<OfflineDolphinModel>(config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OfflineNemoEncDecCtcModel>(config);
} else if (!config.tdnn.model.empty()) {
return std::make_unique<OfflineTdnnCtcModel>(config);
} else if (!config.zipformer_ctc.model.empty()) {
return std::make_unique<OfflineZipformerCtcModel>(config);
} else if (!config.wenet_ctc.model.empty()) {
return std::make_unique<OfflineWenetCtcModel>(config);
} else if (!config.telespeech_ctc.empty()) {
return std::make_unique<OfflineTeleSpeechCtcModel>(config);
} }
// TODO(fangjun): Refactor it. We don't need to use model_type here // TODO(fangjun): Refactor it. We don't need to use model_type here
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
Manager *mgr, const OfflineModelConfig &config) { Manager *mgr, const OfflineModelConfig &config) {
if (!config.dolphin.model.empty()) { if (!config.dolphin.model.empty()) {
return std::make_unique<OfflineDolphinModel>(mgr, config); return std::make_unique<OfflineDolphinModel>(mgr, config);
} else if (!config.nemo_ctc.model.empty()) {
return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
} else if (!config.tdnn.model.empty()) {
return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
} else if (!config.zipformer_ctc.model.empty()) {
return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
} else if (!config.wenet_ctc.model.empty()) {
return std::make_unique<OfflineWenetCtcModel>(mgr, config);
} else if (!config.telespeech_ctc.empty()) {
return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
} }
// TODO(fangjun): Refactor it. We don't need to use model_type here // TODO(fangjun): Refactor it. We don't need to use model_type here

View File

@@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java
java_files += OfflineFireRedAsrModelConfig.java java_files += OfflineFireRedAsrModelConfig.java
java_files += OfflineMoonshineModelConfig.java java_files += OfflineMoonshineModelConfig.java
java_files += OfflineNemoEncDecCtcModelConfig.java java_files += OfflineNemoEncDecCtcModelConfig.java
java_files += OfflineZipformerCtcModelConfig.java
java_files += OfflineSenseVoiceModelConfig.java java_files += OfflineSenseVoiceModelConfig.java
java_files += OfflineDolphinModelConfig.java java_files += OfflineDolphinModelConfig.java
java_files += OfflineModelConfig.java java_files += OfflineModelConfig.java

View File

@@ -11,6 +11,7 @@ public class OfflineModelConfig {
private final OfflineNemoEncDecCtcModelConfig nemo; private final OfflineNemoEncDecCtcModelConfig nemo;
private final OfflineSenseVoiceModelConfig senseVoice; private final OfflineSenseVoiceModelConfig senseVoice;
private final OfflineDolphinModelConfig dolphin; private final OfflineDolphinModelConfig dolphin;
private final OfflineZipformerCtcModelConfig zipformerCtc;
private final String teleSpeech; private final String teleSpeech;
private final String tokens; private final String tokens;
private final int numThreads; private final int numThreads;
@@ -28,6 +29,7 @@ public class OfflineModelConfig {
this.fireRedAsr = builder.fireRedAsr; this.fireRedAsr = builder.fireRedAsr;
this.moonshine = builder.moonshine; this.moonshine = builder.moonshine;
this.nemo = builder.nemo; this.nemo = builder.nemo;
this.zipformerCtc = builder.zipformerCtc;
this.senseVoice = builder.senseVoice; this.senseVoice = builder.senseVoice;
this.dolphin = builder.dolphin; this.dolphin = builder.dolphin;
this.teleSpeech = builder.teleSpeech; this.teleSpeech = builder.teleSpeech;
@@ -52,7 +54,7 @@ public class OfflineModelConfig {
return transducer; return transducer;
} }
public OfflineWhisperModelConfig getZipformer2Ctc() { public OfflineWhisperModelConfig getWhisper() {
return whisper; return whisper;
} }
@@ -68,6 +70,14 @@ public class OfflineModelConfig {
return dolphin; return dolphin;
} }
public OfflineNemoEncDecCtcModelConfig getNemo() {
return nemo;
}
public OfflineZipformerCtcModelConfig getZipformerCtc() {
return zipformerCtc;
}
public String getTokens() { public String getTokens() {
return tokens; return tokens;
} }
@@ -109,6 +119,7 @@ public class OfflineModelConfig {
private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build(); private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build(); private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build();
private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build();
private String teleSpeech = ""; private String teleSpeech = "";
private String tokens = ""; private String tokens = "";
private int numThreads = 1; private int numThreads = 1;
@@ -142,6 +153,11 @@ public class OfflineModelConfig {
return this; return this;
} }
public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) {
this.zipformerCtc = zipformerCtc;
return this;
}
public Builder setTeleSpeech(String teleSpeech) { public Builder setTeleSpeech(String teleSpeech) {
this.teleSpeech = teleSpeech; this.teleSpeech = teleSpeech;
return this; return this;

View File

@@ -0,0 +1,32 @@
// Copyright 2025 Xiaomi Corporation
package com.k2fsa.sherpa.onnx;
/**
 * Immutable configuration for a non-streaming Zipformer CTC model.
 *
 * <p>It carries a single value, the model file name, and is created through
 * {@link Builder}:
 *
 * <pre>
 *   OfflineZipformerCtcModelConfig.builder().setModel("model.onnx").build();
 * </pre>
 */
public class OfflineZipformerCtcModelConfig {
    // The native (JNI) side looks this field up by the name "model" with the
    // signature Ljava/lang/String;, so neither the name nor the type may change.
    private final String model;

    private OfflineZipformerCtcModelConfig(Builder source) {
        model = source.model;
    }

    /** Returns the model file name; the empty string if it was never set. */
    public String getModel() {
        return this.model;
    }

    /** Returns a fresh builder whose model defaults to the empty string. */
    public static Builder builder() {
        return new Builder();
    }

    /** Mutable builder for {@link OfflineZipformerCtcModelConfig}. */
    public static class Builder {
        private String model = "";

        /** Sets the model file name and returns this builder for chaining. */
        public Builder setModel(String model) {
            this.model = model;
            return this;
        }

        /** Creates the config from the values currently held by this builder. */
        public OfflineZipformerCtcModelConfig build() {
            return new OfflineZipformerCtcModelConfig(this);
        }
    }
}

View File

@@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
ans.model_config.nemo_ctc.model = p; ans.model_config.nemo_ctc.model = p;
env->ReleaseStringUTFChars(s, p); env->ReleaseStringUTFChars(s, p);
// zipformer ctc
fid =
env->GetFieldID(model_config_cls, "zipformerCtc",
"Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;");
jobject zipformer_ctc_config = env->GetObjectField(model_config, fid);
jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config);
fid =
env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(zipformer_ctc_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model_config.zipformer_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
// dolphin // dolphin
fid = env->GetFieldID(model_config_cls, "dolphin", fid = env->GetFieldID(model_config_cls, "dolphin",
"Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;"); "Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;");

View File

@@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig(
var model: String = "", var model: String = "",
) )
/**
 * Configuration for a non-streaming Zipformer CTC model.
 *
 * @property model Model file name; defaults to the empty string.
 */
data class OfflineZipformerCtcModelConfig(var model: String = "")
data class OfflineWhisperModelConfig( data class OfflineWhisperModelConfig(
var encoder: String = "", var encoder: String = "",
var decoder: String = "", var decoder: String = "",
@@ -64,6 +68,7 @@ data class OfflineModelConfig(
var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(), var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(), var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(),
var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(),
var teleSpeech: String = "", var teleSpeech: String = "",
var numThreads: Int = 1, var numThreads: Int = 1,
var debug: Boolean = false, var debug: Boolean = false,
@@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
modelType = "nemo_transducer", modelType = "nemo_transducer",
) )
} }
31 -> {
val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
return OfflineModelConfig(
zipformerCtc = OfflineZipformerCtcModelConfig(
model = "$modelDir/model.int8.onnx",
),
tokens = "$modelDir/tokens.txt",
)
}
} }
return null return null
} }

View File

@@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
model = "$modelDir/model.onnx", model = "$modelDir/model.onnx",
), ),
tokens = "$modelDir/tokens.txt", tokens = "$modelDir/tokens.txt",
modelType = "zipformer2",
) )
} }
@@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
model = "$modelDir/model.fp16.onnx", model = "$modelDir/model.fp16.onnx",
), ),
tokens = "$modelDir/tokens.txt", tokens = "$modelDir/tokens.txt",
modelType = "zipformer2",
) )
} }

View File

@@ -284,6 +284,11 @@ type
function ToString: AnsiString; function ToString: AnsiString;
end; end;
TSherpaOnnxOfflineZipformerCtcModelConfig = record
Model: AnsiString;
function ToString: AnsiString;
end;
TSherpaOnnxOfflineWhisperModelConfig = record TSherpaOnnxOfflineWhisperModelConfig = record
Encoder: AnsiString; Encoder: AnsiString;
Decoder: AnsiString; Decoder: AnsiString;
@@ -346,6 +351,7 @@ type
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: TSherpaOnnxOfflineDolphinModelConfig; Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
function ToString: AnsiString; function ToString: AnsiString;
end; end;
@@ -726,6 +732,9 @@ type
SherpaOnnxOfflineDolphinModelConfig = record SherpaOnnxOfflineDolphinModelConfig = record
Model: PAnsiChar; Model: PAnsiChar;
end; end;
SherpaOnnxOfflineZipformerCtcModelConfig = record
Model: PAnsiChar;
end;
SherpaOnnxOfflineWhisperModelConfig = record SherpaOnnxOfflineWhisperModelConfig = record
Encoder: PAnsiChar; Encoder: PAnsiChar;
Decoder: PAnsiChar; Decoder: PAnsiChar;
@@ -773,6 +782,7 @@ type
Moonshine: SherpaOnnxOfflineMoonshineModelConfig; Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
Dolphin: SherpaOnnxOfflineDolphinModelConfig; Dolphin: SherpaOnnxOfflineDolphinModelConfig;
ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
end; end;
SherpaOnnxOfflineRecognizerConfig = record SherpaOnnxOfflineRecognizerConfig = record
@@ -1536,6 +1546,12 @@ begin
[Self.Model]); [Self.Model]);
end; end;
{ Returns a printable description of this config: the record type name
  followed by the current value of the Model field. }
function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)';
begin
  Result := Format(Template, [Self.Model]);
end;
function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
begin begin
Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
@@ -1610,14 +1626,15 @@ begin
'SenseVoice := %s, ' + 'SenseVoice := %s, ' +
'Moonshine := %s, ' + 'Moonshine := %s, ' +
'FireRedAsr := %s, ' + 'FireRedAsr := %s, ' +
'Dolphin := %s' + 'Dolphin := %s, ' +
'ZipformerCtc := %s' +
')', ')',
[Self.Transducer.ToString, Self.Paraformer.ToString, [Self.Transducer.ToString, Self.Paraformer.ToString,
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
Self.ModelType, Self.ModelingUnit, Self.BpeVocab, Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
Self.FireRedAsr.ToString, Self.Dolphin.ToString Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString
]); ]);
end; end;
@@ -1688,6 +1705,7 @@ begin
C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder); C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);
C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
C.LMConfig.Scale := Config.LMConfig.Scale; C.LMConfig.Scale := Config.LMConfig.Scale;

View File

@@ -527,6 +527,87 @@ class OfflineRecognizer(object):
self.config = recognizer_config self.config = recognizer_config
return self return self
@classmethod
def from_zipformer_ctc(
    cls,
    model: str,
    tokens: str,
    num_threads: int = 1,
    sample_rate: int = 16000,
    feature_dim: int = 80,
    decoding_method: str = "greedy_search",
    debug: bool = False,
    provider: str = "cpu",
    rule_fsts: str = "",
    rule_fars: str = "",
    hr_dict_dir: str = "",
    hr_rule_fsts: str = "",
    hr_lexicon: str = "",
):
    """Create a recognizer that uses a non-streaming Zipformer CTC model.

    Pre-trained models for different languages (e.g., Chinese, English)
    can be downloaded from
    `<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_

    Args:
      model:
        Path to ``model.onnx``.
      tokens:
        Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
        columns::

            symbol integer_id

      num_threads:
        Number of threads for neural network computation.
      sample_rate:
        Sample rate of the training data used to train the model.
      feature_dim:
        Dimension of the feature used to train the model.
      decoding_method:
        Valid values are greedy_search.
      debug:
        True to show debug messages.
      provider:
        onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
      rule_fsts:
        If not empty, it specifies fsts for inverse text normalization.
        If there are multiple fsts, they are separated by a comma.
      rule_fars:
        If not empty, it specifies fst archives for inverse text normalization.
        If there are multiple archives, they are separated by a comma.
      hr_dict_dir:
        Forwarded as ``dict_dir`` to ``HomophoneReplacerConfig``.
      hr_rule_fsts:
        Forwarded as ``rule_fsts`` to ``HomophoneReplacerConfig``.
      hr_lexicon:
        Forwarded as ``lexicon`` to ``HomophoneReplacerConfig``.

    Returns:
      An instance of ``cls`` created without calling ``__init__``, with its
      ``recognizer`` and ``config`` attributes set.
    """
    # __init__ is bypassed on purpose; the attributes are filled in below.
    instance = cls.__new__(cls)

    zipformer_model_config = OfflineModelConfig(
        zipformer_ctc=OfflineZipformerCtcModelConfig(model=model),
        tokens=tokens,
        num_threads=num_threads,
        debug=debug,
        provider=provider,
    )

    config = OfflineRecognizerConfig(
        feat_config=FeatureExtractorConfig(
            sampling_rate=sample_rate,
            feature_dim=feature_dim,
        ),
        model_config=zipformer_model_config,
        decoding_method=decoding_method,
        rule_fsts=rule_fsts,
        rule_fars=rule_fars,
        hr=HomophoneReplacerConfig(
            dict_dir=hr_dict_dir,
            lexicon=hr_lexicon,
            rule_fsts=hr_rule_fsts,
        ),
    )

    instance.recognizer = _Recognizer(config)
    instance.config = config
    return instance
@classmethod @classmethod
def from_nemo_ctc( def from_nemo_ctc(
cls, cls,

View File

@@ -16,3 +16,6 @@ tts-kokoro-en
tts-kokoro-zh-en tts-kokoro-zh-en
speech-enhancement-gtcrn speech-enhancement-gtcrn
decode-file-sense-voice-with-hr decode-file-sense-voice-with-hr
test-version
zipformer-ctc-asr
dolphin-ctc-asr

View File

@@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig(
) )
} }
/// Builds the C-level config struct for a non-streaming Zipformer CTC model.
///
/// - Parameter model: Model file name; defaults to the empty string.
/// - Returns: A `SherpaOnnxOfflineZipformerCtcModelConfig` whose `model`
///   field is the value produced by `toCPointer(model)`.
func sherpaOnnxOfflineZipformerCtcModelConfig(
  model: String = ""
) -> SherpaOnnxOfflineZipformerCtcModelConfig {
  let modelPointer = toCPointer(model)
  return SherpaOnnxOfflineZipformerCtcModelConfig(model: modelPointer)
}
func sherpaOnnxOfflineNemoEncDecCtcModelConfig( func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
model: String = "" model: String = ""
) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig { ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
@@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig(
senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(), senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(),
moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(), moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(),
fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(), fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig() dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
sherpaOnnxOfflineZipformerCtcModelConfig()
) -> SherpaOnnxOfflineModelConfig { ) -> SherpaOnnxOfflineModelConfig {
return SherpaOnnxOfflineModelConfig( return SherpaOnnxOfflineModelConfig(
transducer: transducer, transducer: transducer,
@@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig(
sense_voice: senseVoice, sense_voice: senseVoice,
moonshine: moonshine, moonshine: moonshine,
fire_red_asr: fireRedAsr, fire_red_asr: fireRedAsr,
dolphin: dolphin dolphin: dolphin,
zipformer_ctc: zipformerCtc
) )
} }

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# Builds (when needed) and runs the Swift zipformer-ctc-asr example.
#
# Steps, in order:
#   1. require that ../build-swift-macos already exists,
#   2. download and unpack the test model if model.int8.onnx is missing,
#   3. compile ./zipformer-ctc-asr unless a file with that name already exists,
#   4. run the resulting executable.
#
# -e: stop at the first failing command; -x: echo each command before running it.
set -ex
# This script does not build the library itself; it only checks for the
# build directory and tells the user which script to run.
if [ ! -d ../build-swift-macos ]; then
echo "Please run ../build-swift-macos.sh first!"
exit 1
fi
# The model is fetched only when the expected model file is absent. The
# downloaded archive is removed after extraction.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
echo "Please download the pre-trained model for testing."
echo "You can refer to"
echo ""
echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese"
echo ""
echo "for help"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
fi
# An existing ./zipformer-ctc-asr is reused as-is; delete it to force a rebuild
# after changing the Swift sources.
if [ ! -e ./zipformer-ctc-asr ]; then
# Note: We use -lc++ to link against libc++ instead of libstdc++
swiftc \
-lc++ \
-I ../build-swift-macos/install/include \
-import-objc-header ./SherpaOnnx-Bridging-Header.h \
./zipformer-ctc-asr.swift ./SherpaOnnx.swift \
-L ../build-swift-macos/install/lib/ \
-l sherpa-onnx \
-l onnxruntime \
-o zipformer-ctc-asr
strip zipformer-ctc-asr
else
echo "./zipformer-ctc-asr exists - skip building"
fi
# Put the installed lib directory first on the dynamic library search path
# before launching the example.
export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
./zipformer-ctc-asr

View File

@@ -0,0 +1,66 @@
import AVFoundation
/// Convenience accessor for reading an `AudioBuffer` as Swift values.
extension AudioBuffer {
/// Returns a new `[Float]` built from an `UnsafeBufferPointer` view over
/// this buffer. The element type is taken from the declared return type.
func array() -> [Float] {
return Array(UnsafeBufferPointer(self))
}
}
/// Convenience accessor for reading a PCM buffer as Swift values.
extension AVAudioPCMBuffer {
  /// Returns the samples held in `mBuffers` of this buffer's audio buffer
  /// list, converted by `AudioBuffer.array()`.
  func array() -> [Float] {
    let firstBuffer = audioBufferList.pointee.mBuffers
    return firstBuffer.array()
  }
}
/// Decodes one test wave file with a non-streaming Zipformer CTC model and
/// prints the recognized text, followed by the timestamps when there are any.
///
/// All paths are relative to the current working directory. Opening or reading
/// the audio file uses `try!` and force-unwraps, so any failure there
/// terminates the program.
func run() {
let model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
let tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
let zipformerCtc = sherpaOnnxOfflineZipformerCtcModelConfig(
model: model
)
// Only tokens, debug and the Zipformer CTC config are set; every other
// model-config argument keeps its default.
let modelConfig = sherpaOnnxOfflineModelConfig(
tokens: tokens,
debug: 0,
zipformerCtc: zipformerCtc
)
let featConfig = sherpaOnnxFeatureConfig(
sampleRate: 16000,
featureDim: 80
)
// `var` because the recognizer initializer takes the config as `inout`.
var config = sherpaOnnxOfflineRecognizerConfig(
featConfig: featConfig,
modelConfig: modelConfig
)
let recognizer = SherpaOnnxOfflineRecognizer(config: &config)
let filePath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
let fileURL: NSURL = NSURL(fileURLWithPath: filePath)
let audioFile = try! AVAudioFile(forReading: fileURL as URL)
// The rest of the function expects single-channel Float32 samples.
let audioFormat = audioFile.processingFormat
assert(audioFormat.channelCount == 1)
assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)
// Read the whole file into one buffer sized to the file's frame count.
let audioFrameCount = UInt32(audioFile.length)
let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount)
try! audioFile.read(into: audioFileBuffer!)
let array: [Float]! = audioFileBuffer?.array()
// The sample rate passed to the recognizer is the file's own rate, not the
// 16000 configured in featConfig.
let result = recognizer.decode(samples: array, sampleRate: Int(audioFormat.sampleRate))
print("\nresult is:\n\(result.text)")
if result.timestamps.count != 0 {
print("\ntimestamps is:\n\(result.timestamps)")
}
}
/// Program entry point; it only forwards to `run()`.
@main
struct App {
  static func main() { run() }
}

View File

@@ -43,6 +43,10 @@ function freeConfig(config, Module) {
freeConfig(config.dolphin, Module) freeConfig(config.dolphin, Module)
} }
if ('zipformerCtc' in config) {
freeConfig(config.zipformerCtc, Module)
}
if ('moonshine' in config) { if ('moonshine' in config) {
freeConfig(config.moonshine, Module) freeConfig(config.moonshine, Module)
} }
@@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
} }
} }
/**
 * Allocates the WASM-heap representation of an offline Zipformer CTC model
 * config: one UTF-8 string buffer holding the model path and one struct made
 * of a single pointer to that buffer.
 *
 * @param {{model?: string}} config A missing or falsy `model` is treated as
 *     the empty string.
 * @param {object} Module The Emscripten module object; its `lengthBytesUTF8`,
 *     `_malloc`, `stringToUTF8` and `setValue` are used.
 * @returns {{buffer: number, ptr: number, len: number}} `buffer` is the string
 *     allocation, `ptr` the struct allocation, and `len` the struct size in
 *     bytes. Both allocations come from `Module._malloc` and are not freed
 *     here.
 */
function initSherpaOnnxOfflineZipformerCtcModelConfig(config, Module) {
  const POINTER_SIZE = 4;
  const modelPath = config.model || '';

  // One extra byte for the terminating NUL.
  const byteCount = Module.lengthBytesUTF8(modelPath) + 1;
  const buffer = Module._malloc(byteCount);

  // The struct holds exactly one pointer field.
  const len = 1 * POINTER_SIZE;
  const ptr = Module._malloc(len);

  Module.stringToUTF8(modelPath, buffer, byteCount);
  Module.setValue(ptr, buffer, 'i8*');

  return {buffer, ptr, len};
}
function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
@@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
}; };
} }
if (!('zipformerCtc' in config)) {
config.zipformerCtc = {
model: '',
};
}
if (!('whisper' in config)) { if (!('whisper' in config)) {
config.whisper = { config.whisper = {
encoder: '', encoder: '',
@@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
const dolphin = const dolphin =
initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module); initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
const zipformerCtc =
initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module);
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
dolphin.len; dolphin.len + zipformerCtc.len;
const ptr = Module._malloc(len); const ptr = Module._malloc(len);
@@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset); Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
offset += dolphin.len; offset += dolphin.len;
Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset);
offset += zipformerCtc.len;
return { return {
buffer: buffer, ptr: ptr, len: len, transducer: transducer, buffer: buffer, ptr: ptr, len: len, transducer: transducer,
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
dolphin: dolphin dolphin: dolphin, zipformerCtc: zipformerCtc
} }
} }

View File

@@ -13,6 +13,7 @@ extern "C" {
static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
static_assert(sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) == 4, "");
static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
@@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
sizeof(SherpaOnnxOfflineMoonshineModelConfig) + sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
sizeof(SherpaOnnxOfflineDolphinModelConfig), sizeof(SherpaOnnxOfflineDolphinModelConfig) +
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig),
""); "");
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
@@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
auto moonshine = &model_config->moonshine; auto moonshine = &model_config->moonshine;
auto fire_red_asr = &model_config->fire_red_asr; auto fire_red_asr = &model_config->fire_red_asr;
auto dolphin = &model_config->dolphin; auto dolphin = &model_config->dolphin;
auto zipformer_ctc = &model_config->zipformer_ctc;
fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "----------offline transducer model config----------\n");
fprintf(stdout, "encoder: %s\n", transducer->encoder); fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
fprintf(stdout, "----------offline Dolphin model config----------\n"); fprintf(stdout, "----------offline Dolphin model config----------\n");
fprintf(stdout, "model: %s\n", dolphin->model); fprintf(stdout, "model: %s\n", dolphin->model);
fprintf(stdout, "----------offline zipformer ctc model config----------\n");
fprintf(stdout, "model: %s\n", zipformer_ctc->model);
fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "tokens: %s\n", model_config->tokens);
fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
fprintf(stdout, "provider: %s\n", model_config->provider); fprintf(stdout, "provider: %s\n", model_config->provider);

View File

@@ -117,6 +117,10 @@ function initOfflineRecognizer() {
}; };
} else if (fileExists('dolphin.onnx')) { } else if (fileExists('dolphin.onnx')) {
config.modelConfig.dolphin = {model: './dolphin.onnx'}; config.modelConfig.dolphin = {model: './dolphin.onnx'};
} else if (fileExists('zipformer-ctc.onnx')) {
// you need to rename model.int8.onnx from zipformer CTC to
// zipformer-ctc.onnx
config.modelConfig.zipformerCtc = {model: './zipformer-ctc.onnx'};
} else { } else {
console.log('Please specify a model.'); console.log('Please specify a model.');
alert('Please specify a model.'); alert('Please specify a model.');