From 3bf986d08d245e1ec482f0c203b85f3d6501c0ed Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 4 Jul 2025 15:57:07 +0800 Subject: [PATCH] Support non-streaming zipformer CTC ASR models (#2340) This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows. - Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs - Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js - Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models Model doc is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html --- .github/scripts/test-dart.sh | 8 + .github/scripts/test-dot-net.sh | 75 +++--- .github/scripts/test-nodejs-addon-npm.sh | 9 + .github/scripts/test-nodejs-npm.sh | 9 + .github/scripts/test-swift.sh | 3 + .../workflows/aarch64-linux-gnu-shared.yaml | 2 + .../workflows/aarch64-linux-gnu-static.yaml | 1 + .github/workflows/pascal.yaml | 13 +- .github/workflows/run-java-test.yaml | 3 + .github/workflows/test-go.yaml | 4 + .github/workflows/upload-models.yaml | 27 +- README.md | 16 +- cxx-api-examples/CMakeLists.txt | 20 ++ ...mer-ctc-simulate-streaming-alsa-cxx-api.cc | 240 ++++++++++++++++++ ...c-simulate-streaming-microphone-cxx-api.cc | 237 +++++++++++++++++ .../non-streaming-asr/bin/zipformer-ctc.dart | 52 ++++ .../non-streaming-asr/run-zipformer-ctc.sh | 18 ++ .../bin/zipformer-ctc.dart | 118 +++++++++ .../run-zipformer-ctc.sh | 27 ++ .../offline-decode-files/Program.cs | 7 + .../offline-decode-files/run-zipformer-ctc.sh | 18 ++ .../lib/src/offline_recognizer.dart | 33 ++- .../lib/src/sherpa_onnx_bindings.dart | 5 + .../non-streaming-decode-files/main.go | 2 + .../run-zipformer-ctc.sh | 19 ++ .../SherpaOnnxHar/sherpa_onnx/Index.ets | 1 + .../src/main/cpp/non-streaming-asr.cc | 22 +- .../main/ets/components/NonStreamingAsr.ets | 5 + .../NonStreamingDecodeFileZipformerCtc.java | 50 ++++ ...non-streaming-decode-file-zipformer-ctc.sh | 38 +++ kotlin-api-examples/run.sh | 7 + kotlin-api-examples/test_offline_asr.kt | 3 +- nodejs-addon-examples/README.md | 17 ++ .../test_asr_non_streaming_zipformer_ctc.js | 46 ++++ ..._non_streaming_zipformer_ctc_microphone.js | 109 ++++++++ nodejs-examples/README.md | 17 ++ nodejs-examples/test-offline-zipformer-ctc.js | 35 +++ .../non-streaming-asr/.gitignore | 1 + .../non-streaming-asr/run-zipformer-ctc.sh | 43 ++++ .../non-streaming-asr/zipformer_ctc.pas | 76 ++++++ .../vad-with-non-streaming-asr/.gitignore | 2 + .../run-vad-with-zipformer-ctc.sh | 50 ++++ .../vad_with_zipformer_ctc.pas | 135 ++++++++++ .../offline-zipformer-ctc-decode-files.py | 56 ++++ scripts/apk/generate-asr-apk-script.py | 22 +- scripts/apk/generate-vad-asr-apk-script.py | 17 ++ scripts/dotnet/OfflineModelConfig.cs | 2 + .../dotnet/OfflineZipformerCtcModelConfig.cs | 18 ++ .../run-zipformer-ctc.sh | 1 + scripts/go/sherpa_onnx.go | 37 ++- scripts/wasm/generate-vad-asr.py | 15 ++ sherpa-onnx/c-api/c-api.cc | 3 + sherpa-onnx/c-api/c-api.h | 5 + sherpa-onnx/c-api/cxx-api.cc | 3 + sherpa-onnx/c-api/cxx-api.h | 5 + sherpa-onnx/csrc/offline-ctc-model.cc | 20 ++ sherpa-onnx/java-api/Makefile | 1 + .../k2fsa/sherpa/onnx/OfflineModelConfig.java | 18 +- .../onnx/OfflineZipformerCtcModelConfig.java | 32 +++ sherpa-onnx/jni/offline-recognizer.cc | 15 ++ sherpa-onnx/kotlin-api/OfflineRecognizer.kt | 15 ++ 
sherpa-onnx/kotlin-api/OnlineRecognizer.kt | 2 + sherpa-onnx/pascal-api/sherpa_onnx.pas | 22 +- .../python/sherpa_onnx/offline_recognizer.py | 81 ++++++ swift-api-examples/.gitignore | 3 + swift-api-examples/SherpaOnnx.swift | 15 +- swift-api-examples/run-zipformer-ctc-asr.sh | 43 ++++ swift-api-examples/zipformer-ctc-asr.swift | 66 +++++ wasm/asr/sherpa-onnx-asr.js | 37 ++- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 8 +- wasm/vad-asr/app-vad-asr.js | 4 + 71 files changed, 2121 insertions(+), 68 deletions(-) create mode 100644 cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc create mode 100644 cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc create mode 100644 dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart create mode 100755 dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh create mode 100644 dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-ctc.dart create mode 100755 dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh create mode 100755 dotnet-examples/offline-decode-files/run-zipformer-ctc.sh create mode 100755 go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh create mode 100644 java-api-examples/NonStreamingDecodeFileZipformerCtc.java create mode 100755 java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh create mode 100644 nodejs-addon-examples/test_asr_non_streaming_zipformer_ctc.js create mode 100644 nodejs-addon-examples/test_vad_asr_non_streaming_zipformer_ctc_microphone.js create mode 100644 nodejs-examples/test-offline-zipformer-ctc.js create mode 100755 pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh create mode 100644 pascal-api-examples/non-streaming-asr/zipformer_ctc.pas create mode 100755 pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh create mode 100644 pascal-api-examples/vad-with-non-streaming-asr/vad_with_zipformer_ctc.pas create mode 100755 python-api-examples/offline-zipformer-ctc-decode-files.py create mode 100644 scripts/dotnet/OfflineZipformerCtcModelConfig.cs create mode 120000 scripts/go/_internal/non-streaming-decode-files/run-zipformer-ctc.sh create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig.java create mode 100755 swift-api-examples/run-zipformer-ctc-asr.sh create mode 100644 swift-api-examples/zipformer-ctc-asr.swift diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 70ff5230..e7130f79 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -6,6 +6,10 @@ cd dart-api-examples pushd non-streaming-asr +echo '----------Zipformer CTC----------' +./run-zipformer-ctc.sh +rm -rf sherpa-onnx-* + echo '----------SenseVoice----------' ./run-sense-voice-with-hr.sh ./run-sense-voice.sh @@ -114,6 +118,10 @@ popd pushd vad-with-non-streaming-asr +echo '----------Zipformer CTC----------' +./run-zipformer-ctc.sh +rm -rf sherpa-onnx-* + echo '----------Dolphin CTC----------' ./run-dolphin-ctc.sh rm -rf sherpa-onnx-* diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index adc85852..6efa11b7 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -6,43 +6,11 @@ cd ./version-test ./run.sh ls -lh -cd ../speech-enhancement-gtcrn -./run.sh -ls -lh - -cd ../kokoro-tts -./run-kokoro.sh -ls -lh - -cd ../offline-tts -./run-matcha-zh.sh -ls -lh *.wav -./run-matcha-en.sh -ls -lh *.wav -./run-aishell3.sh -ls -lh *.wav -./run-piper.sh -ls -lh *.wav -./run-hf-fanchen.sh -ls -lh *.wav -ls -lh - 
-pushd ../.. - -mkdir tts - -cp -v dotnet-examples/kokoro-tts/*.wav ./tts -cp -v dotnet-examples/offline-tts/*.wav ./tts -popd - -cd ../offline-speaker-diarization -./run.sh -rm -rfv *.onnx -rm -fv *.wav -rm -rfv sherpa-onnx-pyannote-* - cd ../offline-decode-files +./run-zipformer-ctc.sh +rm -rf sherpa-onnx-* + ./run-dolphin-ctc.sh rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02 @@ -82,6 +50,41 @@ rm -rf sherpa-onnx-* ./run-tdnn-yesno.sh rm -rf sherpa-onnx-* +cd ../speech-enhancement-gtcrn +./run.sh +ls -lh + +cd ../kokoro-tts +./run-kokoro.sh +ls -lh + +cd ../offline-tts +./run-matcha-zh.sh +ls -lh *.wav +./run-matcha-en.sh +ls -lh *.wav +./run-aishell3.sh +ls -lh *.wav +./run-piper.sh +ls -lh *.wav +./run-hf-fanchen.sh +ls -lh *.wav +ls -lh + +pushd ../.. + +mkdir tts + +cp -v dotnet-examples/kokoro-tts/*.wav ./tts +cp -v dotnet-examples/offline-tts/*.wav ./tts +popd + +cd ../offline-speaker-diarization +./run.sh +rm -rfv *.onnx +rm -fv *.wav +rm -rfv sherpa-onnx-pyannote-* + cd ../keyword-spotting-from-files ./run.sh @@ -115,5 +118,3 @@ rm -rf sherpa-onnx-* cd ../spoken-language-identification ./run.sh rm -rf sherpa-onnx-* - - diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index 49bd6bbd..ea93abd0 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()") platform=$(node -p "require('os').platform()") node_version=$(node -p "process.versions.node.split('.')[0]") +echo "----------non-streaming ASR Zipformer CTC----------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +node ./test_asr_non_streaming_zipformer_ctc.js +rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 + echo "----------non-streaming ASR NeMo parakeet tdt----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 30620e39..98402bb5 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,15 @@ git status ls -lh ls -lh node_modules +# asr with offline zipformer ctc +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +node ./test-offline-zipformer-ctc.js +rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 + # asr with offline dolphin ctc curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 diff --git a/.github/scripts/test-swift.sh b/.github/scripts/test-swift.sh index b7c13c73..7e1baf68 100755 --- a/.github/scripts/test-swift.sh +++ b/.github/scripts/test-swift.sh @@ -9,6 +9,9 @@ ls -lh ./run-test-version.sh +./run-zipformer-ctc-asr.sh +rm -rf sherpa-onnx-zipformer-* + ./run-decode-file-sense-voice-with-hr.sh rm -rf sherpa-onnx-sense-voice-* rm -rf dict lexicon.txt replace.fst test-hr.wav diff --git 
a/.github/workflows/aarch64-linux-gnu-shared.yaml b/.github/workflows/aarch64-linux-gnu-shared.yaml index c9d33535..f10f4265 100644 --- a/.github/workflows/aarch64-linux-gnu-shared.yaml +++ b/.github/workflows/aarch64-linux-gnu-shared.yaml @@ -89,6 +89,7 @@ jobs: make -j4 install cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin rm -rf install/lib/pkgconfig rm -fv install/lib/cargs.h @@ -135,6 +136,7 @@ jobs: make -j4 install cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin rm -rf install/lib/pkgconfig rm -fv install/lib/cargs.h diff --git a/.github/workflows/aarch64-linux-gnu-static.yaml b/.github/workflows/aarch64-linux-gnu-static.yaml index 3526912d..e8a639e2 100644 --- a/.github/workflows/aarch64-linux-gnu-static.yaml +++ b/.github/workflows/aarch64-linux-gnu-static.yaml @@ -90,6 +90,7 @@ jobs: make install cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin + cp bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin ls -lh install/lib diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index d45dbe3b..7a07bce3 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -37,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest, macos-13, windows-latest] + os: [ubuntu-latest, macos-latest, macos-13, windows-latest, ubuntu-22.04-arm] steps: - uses: actions/checkout@v4 @@ -56,7 +56,7 @@ jobs: key: ${{ matrix.os }} - name: Install Free pascal compiler (ubuntu) - if: matrix.os == 'ubuntu-latest' + if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-22.04-arm' shell: bash run: | sudo apt-get update @@ -156,6 +156,10 @@ jobs: pushd non-streaming-asr + ./run-zipformer-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + ./run-dolphin-ctc.sh rm -rf sherpa-onnx-* echo "---" @@ -264,9 +268,12 @@ jobs: cd ./pascal-api-examples - pushd vad-with-non-streaming-asr + time ./run-vad-with-zipformer-ctc.sh + rm -rf sherpa-onnx-* + echo "---" + time ./run-vad-with-dolphin-ctc.sh rm -rf sherpa-onnx-* echo "---" diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 73ec5e85..fd2494d7 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -165,6 +165,9 @@ jobs: run: | cd ./java-api-examples + ./run-non-streaming-decode-file-zipformer-ctc.sh + rm -rf sherpa-onnx-zipformer-ctc-* + ./run-non-streaming-decode-file-dolphin-ctc.sh rm -rf sherpa-onnx-dolphin-* diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index afae7675..18df2474 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -184,6 +184,10 @@ jobs: go build ls -lh + echo "Test Zipformer CTC" + ./run-zipformer-ctc.sh + rm -rf sherpa-onnx-zipformer-* + echo "Test SenseVoice ctc" ./run-sense-voice-small-with-hr.sh ./run-sense-voice-small.sh diff --git a/.github/workflows/upload-models.yaml b/.github/workflows/upload-models.yaml index eebeae9c..9b7dafcb 100644 --- a/.github/workflows/upload-models.yaml +++ b/.github/workflows/upload-models.yaml @@ -19,12 +19,36 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8"] + python-version: ["3.10"] steps: - uses: actions/checkout@v4 + - name: Zipformer CTC (non-streaming) + shell: bash + run: | + git lfs install + names=( + sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 + 
sherpa-onnx-zipformer-ctc-zh-2025-07-03 + sherpa-onnx-zipformer-ctc-zh-fp16-2025-07-03 + ) + for name in ${names[@]}; do + git clone https://huggingface.co/csukuangfj/$name + pushd $name + git lfs pull + rm -rf .git + rm -rfv .gitattributes + ls -lh + popd + + tar cjfv $name.tar.bz2 $name + rm -rf $name + ls -lh *.tar.bz2 + done + - name: Vietnamese (zipformer) + if: false shell: bash run: | rm -rf models @@ -76,6 +100,7 @@ jobs: mv models/* . - name: Publish to huggingface (Vietnamese zipformer) + if: false env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 diff --git a/README.md b/README.md index e2bbfd97..931a7e3e 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below: |Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| |Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| |Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| +|VAD + speech recognition (Chinese) with [Zipformer CTC](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|[Click me][wasm-hf-vad-asr-zh-zipformer-ctc-07-03]| [地址][wasm-ms-vad-asr-zh-zipformer-ctc-07-03]| |VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| |VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| |VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]| @@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below: |----------------------------------------|------------------------------------|-----------------------------------| | Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]| | Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] | +| Simulated-streaming speech recognition | [Address][apk-simula-streaming-asr]| [点此][apk-simula-streaming-asr-cn]| | Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | | Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | | VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | @@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them. 
|Name | Supported Languages| Description| |-----|-----|----| +|[sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english)| English | It is converted from | |[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)| |[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)| +|[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|Chinese| A Zipformer CTC model| |[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)| |[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)| |[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)| @@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. [wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en [wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en [SenseVoice]: https://github.com/FunAudioLLM/SenseVoice +[wasm-hf-vad-asr-zh-zipformer-ctc-07-03]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc +[wasm-ms-vad-asr-zh-zipformer-ctc-07-03]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc/summary [wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice [wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice [wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny @@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. 
[wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech [wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech [wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech -[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf +[reazonspeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf [wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer [wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer -[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 +[gigaspeech2]: https://github.com/speechcolab/gigaspeech2 [wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer [wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer -[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR +[telespeech-asr]: https://github.com/tele-ai/telespeech-asr [wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech [wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech [wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer [wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer [wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small [wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small -[Dolphin]: https://github.com/DataoceanAI/Dolphin +[dolphin]: https://github.com/dataoceanai/dolphin [wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc [wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc @@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech. 
[apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html +[apk-simula-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr.html +[apk-simula-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr-cn.html [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html [apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html [apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html diff --git a/cxx-api-examples/CMakeLists.txt b/cxx-api-examples/CMakeLists.txt index 0c0a8d24..4b40d228 100644 --- a/cxx-api-examples/CMakeLists.txt +++ b/cxx-api-examples/CMakeLists.txt @@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO) sherpa-onnx-cxx-api portaudio_static ) + + add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api + ./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc + ) + target_link_libraries(zipformer-ctc-simulate-streaming-microphone-cxx-api + sherpa-onnx-cxx-api + portaudio_static + ) endif() if(SHERPA_ONNX_HAS_ALSA) @@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA) portaudio_static ) + add_executable(zipformer-ctc-simulate-streaming-alsa-cxx-api + ./zipformer-ctc-simulate-streaming-alsa-cxx-api.cc + ${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc + ) + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api + sherpa-onnx-cxx-api + portaudio_static + ) + if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR}) target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound) else() target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound) + target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api asound) endif() endif() diff --git a/cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc b/cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc new file mode 100644 index 00000000..c255ec6a --- /dev/null +++ b/cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc @@ -0,0 +1,240 @@ +// cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use zipformer CTC with sherpa-onnx's C++ API +// for streaming speech recognition from a microphone. 
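+// The Zipformer CTC model itself is non-streaming; the example reads audio
+// from an ALSA capture device, segments speech with silero-vad, and decodes
+// each segment with the offline recognizer to simulate streaming output.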
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// +// clang-format on + +#include +#include +#include + +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "sherpa-display.h" // NOLINT +#include "sherpa-onnx/c-api/cxx-api.h" +#include "sherpa-onnx/csrc/alsa.h" + +std::queue> samples_queue; +std::condition_variable condition_variable; +std::mutex mutex; +bool stop = false; + +static void Handler(int32_t /*sig*/) { + stop = true; + condition_variable.notify_one(); + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); +} + +static void RecordCallback(sherpa_onnx::Alsa *alsa) { + int32_t chunk = 0.1 * alsa->GetActualSampleRate(); + while (!stop) { + std::vector samples = alsa->Read(chunk); + + std::lock_guard lock(mutex); + samples_queue.emplace(std::move(samples)); + condition_variable.notify_one(); + } +} + +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { + using namespace sherpa_onnx::cxx; // NOLINT + VadModelConfig config; + config.silero_vad.model = "./silero_vad.onnx"; + config.silero_vad.threshold = 0.5; + config.silero_vad.min_silence_duration = 0.1; + config.silero_vad.min_speech_duration = 0.25; + config.silero_vad.max_speech_duration = 8; + config.sample_rate = 16000; + config.debug = false; + + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); + if (!vad.Get()) { + std::cerr << "Failed to create VAD. Please check your config\n"; + exit(-1); + } + + return vad; +} + +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineRecognizerConfig config; + + config.model_config.zipformer_ctc.model = + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; + config.model_config.tokens = + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; + + config.model_config.num_threads = 2; + config.model_config.debug = false; + + std::cout << "Loading model\n"; + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); + if (!recognizer.Get()) { + std::cerr << "Please check your config\n"; + exit(-1); + } + std::cout << "Loading model done\n"; + return recognizer; +} + +int32_t main(int32_t argc, const char *argv[]) { + const char *kUsageMessage = R"usage( +Usage: + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +./zipformer-ctc-simulate-streaming-alsa-cxx-api device_name + +The device name specifies which microphone to use in case there are several +on your system. You can use + + arecord -l + +to find all available microphones on your computer. For instance, if it outputs + +**** List of CAPTURE Hardware Devices **** +card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio] + Subdevices: 1/1 + Subdevice #0: subdevice #0 + +and if you want to select card 3 and device 0 on that card, please use: + + plughw:3,0 + +as the device_name. 
+)usage"; + + if (argc != 2) { + fprintf(stderr, "%s\n", kUsageMessage); + return -1; + } + + signal(SIGINT, Handler); + + using namespace sherpa_onnx::cxx; // NOLINT + + auto vad = CreateVad(); + auto recognizer = CreateOfflineRecognizer(); + + int32_t expected_sample_rate = 16000; + + std::string device_name = argv[1]; + sherpa_onnx::Alsa alsa(device_name.c_str()); + fprintf(stderr, "Use recording device: %s\n", device_name.c_str()); + + if (alsa.GetExpectedSampleRate() != expected_sample_rate) { + fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(), + expected_sample_rate); + exit(-1); + } + + int32_t window_size = 512; // samples, please don't change + + int32_t offset = 0; + std::vector buffer; + bool speech_started = false; + + auto started_time = std::chrono::steady_clock::now(); + + SherpaDisplay display; + + std::thread record_thread(RecordCallback, &alsa); + + std::cout << "Started! Please speak\n"; + + while (!stop) { + { + std::unique_lock lock(mutex); + while (samples_queue.empty() && !stop) { + condition_variable.wait(lock); + } + + const auto &s = samples_queue.front(); + buffer.insert(buffer.end(), s.begin(), s.end()); + + samples_queue.pop(); + } + + for (; offset + window_size < buffer.size(); offset += window_size) { + vad.AcceptWaveform(buffer.data() + offset, window_size); + if (!speech_started && vad.IsDetected()) { + speech_started = true; + started_time = std::chrono::steady_clock::now(); + } + } + if (!speech_started) { + if (buffer.size() > 10 * window_size) { + offset -= buffer.size() - 10 * window_size; + buffer = {buffer.end() - 10 * window_size, buffer.end()}; + } + } + + auto current_time = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast(current_time - + started_time) + .count() / + 1000.; + + if (speech_started && elapsed_seconds > 0.2) { + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + display.UpdateText(result.text); + display.Display(); + + started_time = std::chrono::steady_clock::now(); + } + + while (!vad.IsEmpty()) { + auto segment = vad.Front(); + + vad.Pop(); + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(expected_sample_rate, segment.samples.data(), + segment.samples.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + + display.UpdateText(result.text); + display.FinalizeCurrentSentence(); + display.Display(); + + buffer.clear(); + offset = 0; + speech_started = false; + } + } + + record_thread.join(); + + return 0; +} diff --git a/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc b/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc new file mode 100644 index 00000000..f7eb117e --- /dev/null +++ b/cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc @@ -0,0 +1,237 @@ +// cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc +// Copyright (c) 2025 Xiaomi Corporation + +// +// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API +// for streaming speech recognition from a microphone. 
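+// The model itself is non-streaming: audio captured with PortAudio is
+// resampled to 16 kHz if needed, segmented with silero-vad, and each segment
+// is decoded with the offline recognizer to simulate streaming output.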
+// +// clang-format off +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +// +// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +// +// clang-format on + +#include +#include +#include + +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include +#include + +#include "portaudio.h" // NOLINT +#include "sherpa-display.h" // NOLINT +#include "sherpa-onnx/c-api/cxx-api.h" +#include "sherpa-onnx/csrc/microphone.h" + +std::queue> samples_queue; +std::condition_variable condition_variable; +std::mutex mutex; +bool stop = false; + +static void Handler(int32_t /*sig*/) { + stop = true; + condition_variable.notify_one(); + fprintf(stderr, "\nCaught Ctrl + C. Exiting...\n"); +} + +static int32_t RecordCallback(const void *input_buffer, + void * /*output_buffer*/, + unsigned long frames_per_buffer, // NOLINT + const PaStreamCallbackTimeInfo * /*time_info*/, + PaStreamCallbackFlags /*status_flags*/, + void * /*user_data*/) { + std::lock_guard lock(mutex); + samples_queue.emplace( + reinterpret_cast(input_buffer), + reinterpret_cast(input_buffer) + frames_per_buffer); + condition_variable.notify_one(); + + return stop ? paComplete : paContinue; +} + +static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() { + using namespace sherpa_onnx::cxx; // NOLINT + VadModelConfig config; + config.silero_vad.model = "./silero_vad.onnx"; + config.silero_vad.threshold = 0.5; + config.silero_vad.min_silence_duration = 0.1; + config.silero_vad.min_speech_duration = 0.25; + config.silero_vad.max_speech_duration = 8; + config.sample_rate = 16000; + config.debug = false; + + VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20); + if (!vad.Get()) { + std::cerr << "Failed to create VAD. 
Please check your config\n"; + exit(-1); + } + + return vad; +} + +static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() { + using namespace sherpa_onnx::cxx; // NOLINT + OfflineRecognizerConfig config; + + config.model_config.zipformer_ctc.model = + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; + config.model_config.tokens = + "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; + + config.model_config.num_threads = 2; + config.model_config.debug = false; + + std::cout << "Loading model\n"; + OfflineRecognizer recognizer = OfflineRecognizer::Create(config); + if (!recognizer.Get()) { + std::cerr << "Please check your config\n"; + exit(-1); + } + std::cout << "Loading model done\n"; + return recognizer; +} + +int32_t main() { + signal(SIGINT, Handler); + + using namespace sherpa_onnx::cxx; // NOLINT + + auto vad = CreateVad(); + auto recognizer = CreateOfflineRecognizer(); + + sherpa_onnx::Microphone mic; + + PaDeviceIndex num_devices = Pa_GetDeviceCount(); + if (num_devices == 0) { + std::cerr << " If you are using Linux, please try " + "./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n"; + return -1; + } + + int32_t device_index = Pa_GetDefaultInputDevice(); + const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE"); + if (pDeviceIndex) { + fprintf(stderr, "Use specified device: %s\n", pDeviceIndex); + device_index = atoi(pDeviceIndex); + } + mic.PrintDevices(device_index); + + float mic_sample_rate = 16000; + const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE"); + if (sample_rate_str) { + fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate); + mic_sample_rate = atof(sample_rate_str); + } + float sample_rate = 16000; + LinearResampler resampler; + if (mic_sample_rate != sample_rate) { + float min_freq = std::min(mic_sample_rate, sample_rate); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + + int32_t lowpass_filter_width = 6; + resampler = LinearResampler::Create(mic_sample_rate, sample_rate, + lowpass_cutoff, lowpass_filter_width); + } + if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback, + nullptr) == false) { + std::cerr << "Failed to open microphone device\n"; + return -1; + } + + int32_t window_size = 512; // samples, please don't change + + int32_t offset = 0; + std::vector buffer; + bool speech_started = false; + + auto started_time = std::chrono::steady_clock::now(); + + SherpaDisplay display; + + std::cout << "Started! 
Please speak\n"; + + while (!stop) { + { + std::unique_lock lock(mutex); + while (samples_queue.empty() && !stop) { + condition_variable.wait(lock); + } + + const auto &s = samples_queue.front(); + if (!resampler.Get()) { + buffer.insert(buffer.end(), s.begin(), s.end()); + } else { + auto resampled = resampler.Resample(s.data(), s.size(), false); + buffer.insert(buffer.end(), resampled.begin(), resampled.end()); + } + + samples_queue.pop(); + } + + for (; offset + window_size < buffer.size(); offset += window_size) { + vad.AcceptWaveform(buffer.data() + offset, window_size); + if (!speech_started && vad.IsDetected()) { + speech_started = true; + started_time = std::chrono::steady_clock::now(); + } + } + if (!speech_started) { + if (buffer.size() > 10 * window_size) { + offset -= buffer.size() - 10 * window_size; + buffer = {buffer.end() - 10 * window_size, buffer.end()}; + } + } + + auto current_time = std::chrono::steady_clock::now(); + const float elapsed_seconds = + std::chrono::duration_cast(current_time - + started_time) + .count() / + 1000.; + + if (speech_started && elapsed_seconds > 0.2) { + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + display.UpdateText(result.text); + display.Display(); + + started_time = std::chrono::steady_clock::now(); + } + + while (!vad.IsEmpty()) { + auto segment = vad.Front(); + + vad.Pop(); + + OfflineStream stream = recognizer.CreateStream(); + stream.AcceptWaveform(sample_rate, segment.samples.data(), + segment.samples.size()); + + recognizer.Decode(&stream); + + OfflineRecognizerResult result = recognizer.GetResult(&stream); + + display.UpdateText(result.text); + display.FinalizeCurrentSentence(); + display.Display(); + + buffer.clear(); + offset = 0; + speech_started = false; + } + } + + return 0; +} diff --git a/dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart b/dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart new file mode 100644 index 00000000..9cdf1498 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart @@ -0,0 +1,52 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the Zipformer CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + zipformerCtc: zipformerCtc, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); 
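+  // decode() runs the full non-streaming recognition pass;
+  // getResult() below returns the transcript.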
+ + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh b/dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh new file mode 100755 index 00000000..a6a495f3 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +dart run \ + ./bin/zipformer-ctc.dart \ + --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \ + --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \ + --input-wav ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav diff --git a/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-ctc.dart b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-ctc.dart new file mode 100644 index 00000000..8124396c --- /dev/null +++ b/dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-ctc.dart @@ -0,0 +1,118 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('silero-vad', help: 'Path to silero_vad.onnx') + ..addOption('model', help: 'Path to the Zipformer CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['silero-vad'] == null || + res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + // create VAD + final sileroVad = res['silero-vad'] as String; + + final sileroVadConfig = sherpa_onnx.SileroVadModelConfig( + model: sileroVad, + minSilenceDuration: 0.25, + minSpeechDuration: 0.5, + maxSpeechDuration: 5.0, + ); + + final vadConfig = sherpa_onnx.VadModelConfig( + sileroVad: sileroVadConfig, + numThreads: 1, + debug: true, + ); + + final vad = sherpa_onnx.VoiceActivityDetector( + config: vadConfig, bufferSizeInSeconds: 10); + + // create offline recognizer + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + zipformerCtc: zipformerCtc, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + if (waveData.sampleRate != 16000) { + print('Only 16000 Hz is supported. 
Given: ${waveData.sampleRate}'); + exit(1); + } + + int numSamples = waveData.samples.length; + int numIter = numSamples ~/ vadConfig.sileroVad.windowSize; + + for (int i = 0; i != numIter; ++i) { + int start = i * vadConfig.sileroVad.windowSize; + vad.acceptWaveform(Float32List.sublistView( + waveData.samples, start, start + vadConfig.sileroVad.windowSize)); + + while (!vad.isEmpty()) { + final samples = vad.front().samples; + final startTime = vad.front().start.toDouble() / waveData.sampleRate; + final endTime = + startTime + samples.length.toDouble() / waveData.sampleRate; + + final stream = recognizer.createStream(); + stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + stream.free(); + print( + '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}'); + + vad.pop(); + } + } + + vad.flush(); + + while (!vad.isEmpty()) { + final samples = vad.front().samples; + final startTime = vad.front().start.toDouble() / waveData.sampleRate; + final endTime = startTime + samples.length.toDouble() / waveData.sampleRate; + + final stream = recognizer.createStream(); + stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + stream.free(); + print( + '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}'); + + vad.pop(); + } + + vad.free(); + + recognizer.free(); +} diff --git a/dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh b/dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh new file mode 100755 index 00000000..f474d829 --- /dev/null +++ b/dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +dart run \ + ./bin/zipformer-ctc.dart \ + --silero-vad ./silero_vad.onnx \ + --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \ + --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \ + --input-wav ./lei-jun-test.wav diff --git a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index a13d2ed6..e6d7798f 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -75,6 +75,9 @@ class OfflineDecodeFiles [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")] public string NeMoCtc { get; set; } = string.Empty; + [Option("zipformer-ctc", Required = false, HelpText = "Path to model.onnx. 
Used only for Zipformer CTC models")] + public string ZipformerCtc { get; set; } = string.Empty; + [Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")] public string DolphinModel { get; set; } = string.Empty; @@ -240,6 +243,10 @@ to download pre-trained Tdnn models. { config.ModelConfig.Dolphin.Model = options.DolphinModel; } + else if (!string.IsNullOrEmpty(options.ZipformerCtc)) + { + config.ModelConfig.ZipformerCtc.Model = options.ZipformerCtc; + } else if (!string.IsNullOrEmpty(options.TeleSpeechCtc)) { config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; diff --git a/dotnet-examples/offline-decode-files/run-zipformer-ctc.sh b/dotnet-examples/offline-decode-files/run-zipformer-ctc.sh new file mode 100755 index 00000000..9ca6db85 --- /dev/null +++ b/dotnet-examples/offline-decode-files/run-zipformer-ctc.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +dotnet run \ + --tokens=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \ + --zipformer-ctc=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \ + --num-threads=1 \ + --files ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav \ + ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/1.wav \ + ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/8k.wav diff --git a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart index 5e368ae0..892e2b02 100644 --- a/flutter/sherpa_onnx/lib/src/offline_recognizer.dart +++ b/flutter/sherpa_onnx/lib/src/offline_recognizer.dart @@ -104,6 +104,27 @@ class OfflineDolphinModelConfig { final String model; } +class OfflineZipformerCtcModelConfig { + const OfflineZipformerCtcModelConfig({this.model = ''}); + + factory OfflineZipformerCtcModelConfig.fromJson(Map json) { + return OfflineZipformerCtcModelConfig( + model: json['model'] as String? ?? '', + ); + } + + @override + String toString() { + return 'OfflineZipformerCtcModelConfig(model: $model)'; + } + + Map toJson() => { + 'model': model, + }; + + final String model; +} + class OfflineWhisperModelConfig { const OfflineWhisperModelConfig( {this.encoder = '', @@ -288,6 +309,7 @@ class OfflineModelConfig { this.moonshine = const OfflineMoonshineModelConfig(), this.fireRedAsr = const OfflineFireRedAsrModelConfig(), this.dolphin = const OfflineDolphinModelConfig(), + this.zipformerCtc = const OfflineZipformerCtcModelConfig(), required this.tokens, this.numThreads = 1, this.debug = true, @@ -336,6 +358,10 @@ class OfflineModelConfig { ? OfflineDolphinModelConfig.fromJson( json['dolphin'] as Map) : const OfflineDolphinModelConfig(), + zipformerCtc: json['zipformerCtc'] != null + ? OfflineZipformerCtcModelConfig.fromJson( + json['zipformerCtc'] as Map) + : const OfflineZipformerCtcModelConfig(), tokens: json['tokens'] as String, numThreads: json['numThreads'] as int? ?? 1, debug: json['debug'] as bool? ?? 
true, @@ -349,7 +375,7 @@ class OfflineModelConfig { @override String toString() { - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; } Map toJson() => { @@ -362,6 +388,7 @@ class OfflineModelConfig { 'moonshine': moonshine.toJson(), 'fireRedAsr': fireRedAsr.toJson(), 'dolphin': dolphin.toJson(), + 'zipformerCtc': zipformerCtc.toJson(), 'tokens': tokens, 'numThreads': numThreads, 'debug': debug, @@ -381,6 +408,7 @@ class OfflineModelConfig { final OfflineMoonshineModelConfig moonshine; final OfflineFireRedAsrModelConfig fireRedAsr; final OfflineDolphinModelConfig dolphin; + final OfflineZipformerCtcModelConfig zipformerCtc; final String tokens; final int numThreads; @@ -578,6 +606,8 @@ class OfflineRecognizer { config.model.fireRedAsr.decoder.toNativeUtf8(); c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8(); + c.ref.model.zipformerCtc.model = + config.model.zipformerCtc.model.toNativeUtf8(); c.ref.model.tokens = config.model.tokens.toNativeUtf8(); @@ -623,6 +653,7 @@ class OfflineRecognizer { calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); + calloc.free(c.ref.model.zipformerCtc.model); calloc.free(c.ref.model.dolphin.model); calloc.free(c.ref.model.fireRedAsr.decoder); calloc.free(c.ref.model.fireRedAsr.encoder); diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 48977f29..0db40397 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct { external Pointer model; } +final class SherpaOnnxOfflineZipformerCtcModelConfig extends Struct { + external Pointer model; +} + final class SherpaOnnxOfflineWhisperModelConfig extends Struct { external Pointer encoder; external Pointer decoder; @@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct { external SherpaOnnxOfflineMoonshineModelConfig moonshine; external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr; external SherpaOnnxOfflineDolphinModelConfig dolphin; + external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc; } final class SherpaOnnxOfflineRecognizerConfig extends Struct { diff --git a/go-api-examples/non-streaming-decode-files/main.go b/go-api-examples/non-streaming-decode-files/main.go index dc9bbb1a..55c1864e 100644 --- a/go-api-examples/non-streaming-decode-files/main.go +++ b/go-api-examples/non-streaming-decode-files/main.go @@ -28,6 +28,8 @@ func main() { flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model") + flag.StringVar(&config.ModelConfig.ZipformerCtc.Model, "zipformer-ctc", "", 
"Path to the Zipformer CTC model") + flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model") flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model") diff --git a/go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh b/go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh new file mode 100755 index 00000000..0c2af926 --- /dev/null +++ b/go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +go mod tidy +go build + +./non-streaming-decode-files \ + --zipformer-ctc ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \ + --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \ + --debug 0 \ + ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets index 72e764cf..c165d416 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets @@ -15,6 +15,7 @@ export { Samples, OfflineTdnnModelConfig, OfflineSenseVoiceModelConfig, OfflineMoonshineModelConfig, + OfflineZipformerCtcModelConfig, OfflineModelConfig, OfflineLMConfig, OfflineRecognizerConfig, diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc index ed5afc2d..fb86615c 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/cpp/non-streaming-asr.cc @@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig( return c; } -static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig( +static SherpaOnnxOfflineZipformerCtcModelConfig +GetOfflineZipformerCtcModelConfig(Napi::Object obj) { + SherpaOnnxOfflineZipformerCtcModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("zipformerCtc") || !obj.Get("zipformerCtc").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("zipformerCtc").As(); + + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); + + return c; +} + +static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinModelConfig( Napi::Object obj) { SherpaOnnxOfflineDolphinModelConfig c; memset(&c, 0, sizeof(c)); @@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { c.sense_voice = GetOfflineSenseVoiceModelConfig(o); c.moonshine = GetOfflineMoonshineModelConfig(o); c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o); - c.dolphin = GetOfflineDolphinfig(o); + c.dolphin = GetOfflineDolphinModelConfig(o); + c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o); SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens); SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); @@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder); SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model); + SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model); 
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens); SHERPA_ONNX_DELETE_C_STR(c.model_config.provider); diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets index 492f7d5c..b437a163 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingAsr.ets @@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig { public model: string = ''; } +export class OfflineZipformerCtcModelConfig { + public model: string = ''; +} + export class OfflineWhisperModelConfig { public encoder: string = ''; public decoder: string = ''; @@ -97,6 +101,7 @@ export class OfflineModelConfig { public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig(); public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig(); public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig(); + public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig(); } export class OfflineLMConfig { diff --git a/java-api-examples/NonStreamingDecodeFileZipformerCtc.java b/java-api-examples/NonStreamingDecodeFileZipformerCtc.java new file mode 100644 index 00000000..f66f3ad6 --- /dev/null +++ b/java-api-examples/NonStreamingDecodeFileZipformerCtc.java @@ -0,0 +1,50 @@ +// Copyright 2025 Xiaomi Corporation + +// This file shows how to use an offline Zipformer CTC model, +// i.e., non-streaming Zipformer CTC model, +// to decode files. +import com.k2fsa.sherpa.onnx.*; + +public class NonStreamingDecodeFileZipformerCtc { + public static void main(String[] args) { + // please refer to + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + // to download model files + String model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"; + String tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"; + + String waveFilename = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineZipformerCtcModelConfig zipformerCtc = + OfflineZipformerCtcModelConfig.builder().setModel(model).build(); + + OfflineModelConfig modelConfig = + OfflineModelConfig.builder() + .setZipformerCtc(zipformerCtc) + .setTokens(tokens) + .setNumThreads(1) + .setDebug(true) + .build(); + + OfflineRecognizerConfig config = + OfflineRecognizerConfig.builder() + .setOfflineModelConfig(modelConfig) + .setDecodingMethod("greedy_search") + .build(); + + OfflineRecognizer recognizer = new OfflineRecognizer(config); + OfflineStream stream = recognizer.createStream(); + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); + + recognizer.decode(stream); + + String text = recognizer.getResult(stream).getText(); + + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); + + stream.release(); + recognizer.release(); + } +} diff --git a/java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh b/java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh new file mode 100755 index 00000000..e99cf39b --- /dev/null +++ b/java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! 
-f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + NonStreamingDecodeFileZipformerCtc.java diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index 1cc4e64f..ef3fb091 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -253,6 +253,13 @@ function testOfflineAsr() { rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2 fi + if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + fi + out_filename=test_offline_asr.jar kotlinc-jvm -include-runtime -d $out_filename \ test_offline_asr.kt \ diff --git a/kotlin-api-examples/test_offline_asr.kt b/kotlin-api-examples/test_offline_asr.kt index a6e29148..6cee007e 100644 --- a/kotlin-api-examples/test_offline_asr.kt +++ b/kotlin-api-examples/test_offline_asr.kt @@ -1,7 +1,7 @@ package com.k2fsa.sherpa.onnx fun main() { - val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25) + val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25, 31) for (type in types) { test(type) } @@ -19,6 +19,7 @@ fun test(type: Int) { 21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav" 24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav" 25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav" + 31 -> "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" else -> null } diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index 88de0d0e..b4874a39 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -123,6 +123,7 @@ The following tables list the examples in this folder. 
|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)| |[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| +|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search| |[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search| |[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphinhttps://github.com/DataoceanAI/Dolphin]) CTC model with greedy search| |[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| @@ -137,6 +138,7 @@ The following tables list the examples in this folder. |[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)| |[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)| |[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search| +|[./test_vad_asr_non_streaming_zipformer_ctc_microphone.js](./test_vad_asr_non_streaming_zipformer_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer CTC model with greedy search| |[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)| |[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)| @@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2 node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js ``` +### Non-streaming speech recognition with Zipformer CTC models + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + 
+node ./test_asr_non_streaming_zipformer_ctc.js
+
+# To run VAD + non-streaming ASR with Zipformer CTC using a microphone
+npm install naudiodon2
+node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js
+```
+
 ### Non-streaming speech recognition with NeMo CTC models
 
 ```bash
diff --git a/nodejs-addon-examples/test_asr_non_streaming_zipformer_ctc.js b/nodejs-addon-examples/test_asr_non_streaming_zipformer_ctc.js
new file mode 100644
index 00000000..3e5b25e9
--- /dev/null
+++ b/nodejs-addon-examples/test_asr_non_streaming_zipformer_ctc.js
@@ -0,0 +1,46 @@
+// Copyright (c) 2025 Xiaomi Corporation
+const sherpa_onnx = require('sherpa-onnx-node');
+
+// Please download test files from
+// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+const config = {
+  'featConfig': {
+    'sampleRate': 16000,
+    'featureDim': 80,
+  },
+  'modelConfig': {
+    'zipformerCtc': {
+      'model': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
+    },
+    'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
+    'numThreads': 2,
+    'provider': 'cpu',
+    'debug': 1,
+  }
+};
+
+const waveFilename =
+    './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
+
+const recognizer = new sherpa_onnx.OfflineRecognizer(config);
+console.log('Started')
+let start = Date.now();
+const stream = recognizer.createStream();
+const wave = sherpa_onnx.readWave(waveFilename);
+stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});
+
+recognizer.decode(stream);
+result = recognizer.getResult(stream)
+let stop = Date.now();
+console.log('Done')
+
+const elapsed_seconds = (stop - start) / 1000;
+const duration = wave.samples.length / wave.sampleRate;
+const real_time_factor = elapsed_seconds / duration;
+console.log('Wave duration', duration.toFixed(3), 'seconds')
+console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
+console.log(
+    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
+    real_time_factor.toFixed(3))
+console.log(waveFilename)
+console.log('result\n', result)
diff --git a/nodejs-addon-examples/test_vad_asr_non_streaming_zipformer_ctc_microphone.js b/nodejs-addon-examples/test_vad_asr_non_streaming_zipformer_ctc_microphone.js
new file mode 100644
index 00000000..fab622f9
--- /dev/null
+++ b/nodejs-addon-examples/test_vad_asr_non_streaming_zipformer_ctc_microphone.js
@@ -0,0 +1,109 @@
+// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
+//
+const portAudio = require('naudiodon2');
+// console.log(portAudio.getDevices());
+
+const sherpa_onnx = require('sherpa-onnx-node');
+
+function createRecognizer() {
+  // Please download test files from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
+  const config = {
+    'featConfig': {
+      'sampleRate': 16000,
+      'featureDim': 80,
+    },
+    'modelConfig': {
+      'zipformerCtc': {
+        'model':
+            './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
+      },
+      'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
+      'numThreads': 2,
+      'provider': 'cpu',
+      'debug': 1,
+    }
+  };
+
+  return new sherpa_onnx.OfflineRecognizer(config);
+}
+
+function createVad() {
+  // please download silero_vad.onnx from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+  const config = {
+    sileroVad: {
+      model: './silero_vad.onnx',
+      threshold: 0.5,
+      minSpeechDuration: 0.25,
+      minSilenceDuration: 0.5,
+      windowSize: 512,
+    },
+    sampleRate: 16000,
+    debug: true,
+    numThreads: 1,
+  };
+
+  const bufferSizeInSeconds = 60;
+
+  return new
sherpa_onnx.Vad(config, bufferSizeInSeconds); +} + +const recognizer = createRecognizer(); +const vad = createVad(); + +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + vad.acceptWaveform(samples); + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const stream = recognizer.createStream(); + stream.acceptWaveform({ + samples: segment.samples, + sampleRate: recognizer.config.featConfig.sampleRate + }); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + if (r.text.length > 0) { + const text = r.text.toLowerCase().trim(); + console.log(`${index}: ${text}`); + + const filename = `${index}-${text}-${ + new Date() + .toLocaleTimeString('en-US', {hour12: false}) + .split(' ')[0]}.wav`; + sherpa_onnx.writeWave( + filename, + {samples: segment.samples, sampleRate: vad.config.sampleRate}); + + index += 1; + } + } +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 31af5cc2..02df62f0 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2 node ./test-offline-dolphin-ctc.js ``` +## ./test-offline-zipformer-ctc.js + +[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates +how to decode a file with a Zipformer CTC model. In the code we use +[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese). 
+ +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + +node ./test-offline-zipformer-ctc.js +``` + ## ./test-offline-nemo-ctc.js [./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates diff --git a/nodejs-examples/test-offline-zipformer-ctc.js b/nodejs-examples/test-offline-zipformer-ctc.js new file mode 100644 index 00000000..22984570 --- /dev/null +++ b/nodejs-examples/test-offline-zipformer-ctc.js @@ -0,0 +1,35 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let config = { + modelConfig: { + zipformerCtc: { + model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx', + }, + tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt', + } + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav'; +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform(wave.sampleRate, wave.samples); + +recognizer.decode(stream); +const text = recognizer.getResult(stream).text; +console.log(text); + +stream.free(); +recognizer.free(); diff --git a/pascal-api-examples/non-streaming-asr/.gitignore b/pascal-api-examples/non-streaming-asr/.gitignore index 8b3ee37d..5e1c13eb 100644 --- a/pascal-api-examples/non-streaming-asr/.gitignore +++ b/pascal-api-examples/non-streaming-asr/.gitignore @@ -9,3 +9,4 @@ sense_voice telespeech_ctc moonshine dolphin_ctc +zipformer_ctc diff --git a/pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh b/pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh new file mode 100755 index 00000000..841fc100 --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + ls -lh lib + popd +fi + +if [ ! 
-f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./zipformer_ctc.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./zipformer_ctc diff --git a/pascal-api-examples/non-streaming-asr/zipformer_ctc.pas b/pascal-api-examples/non-streaming-asr/zipformer_ctc.pas new file mode 100644 index 00000000..e54c5a7f --- /dev/null +++ b/pascal-api-examples/non-streaming-asr/zipformer_ctc.pas @@ -0,0 +1,76 @@ +{ Copyright (c) 2025 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Zipformer CTC model +to decode files. + +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program zipformer_ctc; + +{$mode objfpc} + +uses + sherpa_onnx, + DateUtils, + SysUtils; + +var + Wave: TSherpaOnnxWave; + WaveFilename: AnsiString; + + Config: TSherpaOnnxOfflineRecognizerConfig; + Recognizer: TSherpaOnnxOfflineRecognizer; + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; + + Start: TDateTime; + Stop: TDateTime; + + Elapsed: Single; + Duration: Single; + RealTimeFactor: Single; +begin + Initialize(Config); + + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav'; + + Wave := SherpaOnnxReadWave(WaveFilename); + + Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config); + Stream := Recognizer.CreateStream(); + Start := Now; + + Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + + RecognitionResult := Recognizer.GetResult(Stream); + + Stop := Now; + + Elapsed := MilliSecondsBetween(Stop, Start) / 1000; + Duration := Length(Wave.Samples) / Wave.SampleRate; + RealTimeFactor := Elapsed / Duration; + + WriteLn(RecognitionResult.ToString); + WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads])); + WriteLn(Format('Elapsed %.3f s', [Elapsed])); + WriteLn(Format('Wave duration %.3f s', [Duration])); + WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor])); + + {Free resources to avoid memory leak. + + Note: You don't need to invoke them for this simple script. + However, you have to invoke them in your own large/complex project. + } + FreeAndNil(Stream); + FreeAndNil(Recognizer); +end. 
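The same decode flow is also exposed through the `from_zipformer_ctc` factory method added to the Python API later in this patch. A condensed sketch, not part of the patch itself, assuming the model directory downloaded by the shell scripts above and using soundfile for reading the wave (as the Python example in this patch does):

```python
# Condensed sketch: decode one file with the new Zipformer CTC config from Python.
# Assumes ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 has already been downloaded.
import sherpa_onnx
import soundfile as sf

recognizer = sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
    model="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx",
    tokens="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt",
)

audio, sample_rate = sf.read(
    "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav",
    dtype="float32",
    always_2d=True,
)

stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, audio[:, 0])  # use the first channel only
recognizer.decode_stream(stream)
print(stream.result.text)
```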
diff --git a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore index d499ad3b..0569b324 100644 --- a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore +++ b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore @@ -2,3 +2,5 @@ vad_with_whisper vad_with_sense_voice vad_with_moonshine +vad_with_zipformer_ctc +vad_with_dolphin diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh new file mode 100755 index 00000000..dec7a6b0 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./vad_with_zipformer_ctc.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./vad_with_zipformer_ctc diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_zipformer_ctc.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_zipformer_ctc.pas new file mode 100644 index 00000000..12b5ce69 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_zipformer_ctc.pas @@ -0,0 +1,135 @@ +{ Copyright (c) 2025 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Zipformer CTC model +with silero VAD to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program vad_with_zipformer_ctc; + +{$mode objfpc} + +uses + sherpa_onnx, + SysUtils; + +function CreateVad(): TSherpaOnnxVoiceActivityDetector; +var + Config: TSherpaOnnxVadModelConfig; + + SampleRate: Integer; + WindowSize: Integer; +begin + Initialize(Config); + + SampleRate := 16000; {Please don't change it unless you know the details} + WindowSize := 512; {Please don't change it unless you know the details} + + Config.SileroVad.Model := './silero_vad.onnx'; + Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.Threshold := 0.5; + Config.SileroVad.WindowSize := WindowSize; + Config.NumThreads:= 1; + Config.Debug:= True; + Config.Provider:= 'cpu'; + Config.SampleRate := SampleRate; + + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30); +end; + +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer; +var + Config: TSherpaOnnxOfflineRecognizerConfig; +begin + Initialize(Config); + + Config.ModelConfig.ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + Result := TSherpaOnnxOfflineRecognizer.Create(Config); +end; + +var + Wave: TSherpaOnnxWave; + + Recognizer: TSherpaOnnxOfflineRecognizer; + Vad: TSherpaOnnxVoiceActivityDetector; + + Offset: Integer; + WindowSize: Integer; + SpeechSegment: TSherpaOnnxSpeechSegment; + + Start: Single; + Duration: Single; + + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; +begin + Vad := CreateVad(); + Recognizer := CreateOfflineRecognizer(); + + Wave := SherpaOnnxReadWave('./lei-jun-test.wav'); + if Wave.SampleRate <> Vad.Config.SampleRate then + begin + WriteLn(Format('Expected sample rate: %d. Given: %d', + [Vad.Config.SampleRate, Wave.SampleRate])); + + Exit; + end; + + WindowSize := Vad.Config.SileroVad.WindowSize; + Offset := 0; + while Offset + WindowSize <= Length(Wave.Samples) do + begin + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); + Offset += WindowSize; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + end; + + Vad.Flush; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + + FreeAndNil(Recognizer); + FreeAndNil(Vad); +end. 
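The patch does not ship a Python counterpart of this VAD example. As a rough sketch of how the flow above maps onto the Python API, combining the `from_zipformer_ctc` factory added in this patch with the existing VAD bindings (the `VadModelConfig` / `VoiceActivityDetector` names are taken from the current Python API and are an assumption here, not something this patch adds):

```python
# Rough sketch: VAD + non-streaming Zipformer CTC in Python. The VAD classes and
# field names come from the existing sherpa_onnx Python bindings (assumed, not
# added by this patch); the recognizer factory is the one added in this patch.
import sherpa_onnx
import soundfile as sf

recognizer = sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
    model="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx",
    tokens="./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt",
)

vad_config = sherpa_onnx.VadModelConfig()
vad_config.silero_vad.model = "./silero_vad.onnx"
vad_config.silero_vad.threshold = 0.5
vad_config.silero_vad.min_silence_duration = 0.5
vad_config.silero_vad.min_speech_duration = 0.25
vad_config.sample_rate = 16000
vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=30)

samples, sample_rate = sf.read("./lei-jun-test.wav", dtype="float32")
window_size = vad_config.silero_vad.window_size

for offset in range(0, len(samples) - window_size + 1, window_size):
    vad.accept_waveform(samples[offset : offset + window_size])
    while not vad.empty():
        segment = vad.front
        vad.pop()
        stream = recognizer.create_stream()
        stream.accept_waveform(sample_rate, segment.samples)
        recognizer.decode_stream(stream)
        start = segment.start / sample_rate
        end = start + len(segment.samples) / sample_rate
        print(f"{start:.3f} -- {end:.3f} {stream.result.text}")

# A complete program would also call vad.flush() and drain the remaining
# segments, as the Pascal example above does.
```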
diff --git a/python-api-examples/offline-zipformer-ctc-decode-files.py b/python-api-examples/offline-zipformer-ctc-decode-files.py new file mode 100755 index 00000000..7ecd9634 --- /dev/null +++ b/python-api-examples/offline-zipformer-ctc-decode-files.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +""" +This file shows how to use a non-streaming zipformer CTC model from icefall +to decode files. + +Please download model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + +""" + +from pathlib import Path + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx" + tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt" + test_wav = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" + + if not Path(model).is_file() or not Path(test_wav).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return ( + sherpa_onnx.OfflineRecognizer.from_zipformer_ctc( + model=model, + tokens=tokens, + debug=True, + ), + test_wav, + ) + + +def main(): + recognizer, wave_filename = create_recognizer() + + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] + # sample_rate does not need to be 16000 Hz + + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + recognizer.decode_stream(stream) + print(wave_filename) + print(stream.result) + + +if __name__ == "__main__": + main() diff --git a/scripts/apk/generate-asr-apk-script.py b/scripts/apk/generate-asr-apk-script.py index 5e44b8ac..9578b0ed 100755 --- a/scripts/apk/generate-asr-apk-script.py +++ b/scripts/apk/generate-asr-apk-script.py @@ -344,7 +344,7 @@ def get_models(): """, ), Model( - model_name="sherpa-onnx-streaming-zipformer-ctc-fp16-zh-2025-06-30", + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30", idx=19, lang="zh", short_name="large_zipformer_fp16", @@ -360,6 +360,26 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30", + idx=20, + lang="zh", + short_name="large_zipformer_int8", + rule_fsts="itn_zh_number.fst", + cmd=""" + if [ ! 
-f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi + pushd $model_name + rm -fv bpe.model + + rm -rf test_wavs + + ls -lh + popd """, ), diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py index 8f81b084..67079797 100755 --- a/scripts/apk/generate-vad-asr-apk-script.py +++ b/scripts/apk/generate-vad-asr-apk-script.py @@ -548,6 +548,23 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03", + idx=31, + lang="zh", + lang2="Chinese", + short_name="zipformer_2025_07_03", + cmd=""" + pushd $model_name + + rm -rfv test_wavs + rm -rfv bbpe.model + + ls -lh + popd """, ), diff --git a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index f184b850..9adf1cda 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -27,6 +27,7 @@ namespace SherpaOnnx Moonshine = new OfflineMoonshineModelConfig(); FireRedAsr = new OfflineFireRedAsrModelConfig(); Dolphin = new OfflineDolphinModelConfig(); + ZipformerCtc = new OfflineZipformerCtcModelConfig(); } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -60,5 +61,6 @@ namespace SherpaOnnx public OfflineMoonshineModelConfig Moonshine; public OfflineFireRedAsrModelConfig FireRedAsr; public OfflineDolphinModelConfig Dolphin; + public OfflineZipformerCtcModelConfig ZipformerCtc; } } diff --git a/scripts/dotnet/OfflineZipformerCtcModelConfig.cs b/scripts/dotnet/OfflineZipformerCtcModelConfig.cs new file mode 100644 index 00000000..ce183560 --- /dev/null +++ b/scripts/dotnet/OfflineZipformerCtcModelConfig.cs @@ -0,0 +1,18 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + + [StructLayout(LayoutKind.Sequential)] + public struct OfflineZipformerCtcModelConfig + { + public OfflineZipformerCtcModelConfig() + { + Model = ""; + } + [MarshalAs(UnmanagedType.LPStr)] + public string Model; + } +} diff --git a/scripts/go/_internal/non-streaming-decode-files/run-zipformer-ctc.sh b/scripts/go/_internal/non-streaming-decode-files/run-zipformer-ctc.sh new file mode 120000 index 00000000..82b2ac15 --- /dev/null +++ b/scripts/go/_internal/non-streaming-decode-files/run-zipformer-ctc.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 320bdcba..a055d26b 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct { Model string // Path to the model, e.g., model.onnx or model.int8.onnx } +type OfflineZipformerCtcModelConfig struct { + Model string // Path to the model, e.g., model.onnx or model.int8.onnx +} + type OfflineDolphinModelConfig struct { Model string // Path to the model, e.g., model.onnx or model.int8.onnx } @@ -439,16 +443,17 @@ type OfflineLMConfig struct { } type OfflineModelConfig struct { - Transducer OfflineTransducerModelConfig - Paraformer OfflineParaformerModelConfig - NemoCTC OfflineNemoEncDecCtcModelConfig - Whisper OfflineWhisperModelConfig - Tdnn OfflineTdnnModelConfig - SenseVoice OfflineSenseVoiceModelConfig - Moonshine OfflineMoonshineModelConfig - FireRedAsr OfflineFireRedAsrModelConfig - Dolphin OfflineDolphinModelConfig - Tokens string // Path to 
tokens.txt + Transducer OfflineTransducerModelConfig + Paraformer OfflineParaformerModelConfig + NemoCTC OfflineNemoEncDecCtcModelConfig + Whisper OfflineWhisperModelConfig + Tdnn OfflineTdnnModelConfig + SenseVoice OfflineSenseVoiceModelConfig + Moonshine OfflineMoonshineModelConfig + FireRedAsr OfflineFireRedAsrModelConfig + Dolphin OfflineDolphinModelConfig + ZipformerCtc OfflineZipformerCtcModelConfig + Tokens string // Path to tokens.txt // Number of threads to use for neural network computation NumThreads int @@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder) c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model) + c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model) c.model_config.tokens = C.CString(config.ModelConfig.Tokens) @@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig) C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder)) c.model_config.fire_red_asr.encoder = nil } + if c.model_config.fire_red_asr.decoder != nil { C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder)) c.model_config.fire_red_asr.decoder = nil } + if c.model_config.dolphin.model != nil { + C.free(unsafe.Pointer(c.model_config.dolphin.model)) + c.model_config.dolphin.model = nil + } + + if c.model_config.zipformer_ctc.model != nil { + C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model)) + c.model_config.zipformer_ctc.model = nil + } + if c.model_config.tokens != nil { C.free(unsafe.Pointer(c.model_config.tokens)) c.model_config.tokens = nil diff --git a/scripts/wasm/generate-vad-asr.py b/scripts/wasm/generate-vad-asr.py index ab2cd722..209ed24d 100755 --- a/scripts/wasm/generate-vad-asr.py +++ b/scripts/wasm/generate-vad-asr.py @@ -212,6 +212,21 @@ def get_models(): git diff """, ), + Model( + model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc", + short_name="vad-asr-zh-zipformer-ctc", + cmd=""" + pushd $model_name + mv model.int8.onnx ../zipformer-ctc.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer CTC supporting Chinese 中文/g' ../index.html + git diff + """, + ), ] return models diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 2739b5f4..06c3b061 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig( recognizer_config.model_config.dolphin.model = SHERPA_ONNX_OR(config->model_config.dolphin.model, ""); + recognizer_config.model_config.zipformer_ctc.model = + SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, ""); + recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); recognizer_config.lm_config.scale = diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 02df4372..5c96fe70 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig { const char *model; } SherpaOnnxOfflineDolphinModelConfig; +SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig { + const char *model; +} SherpaOnnxOfflineZipformerCtcModelConfig; + SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { 
 SherpaOnnxOfflineTransducerModelConfig transducer;
 SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
   SherpaOnnxOfflineMoonshineModelConfig moonshine;
   SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
   SherpaOnnxOfflineDolphinModelConfig dolphin;
+  SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
 } SherpaOnnxOfflineModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
diff --git a/sherpa-onnx/c-api/cxx-api.cc b/sherpa-onnx/c-api/cxx-api.cc
index ee50fefd..98a167a5 100644
--- a/sherpa-onnx/c-api/cxx-api.cc
+++ b/sherpa-onnx/c-api/cxx-api.cc
@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
   c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
 
+  c.model_config.zipformer_ctc.model =
+      config.model_config.zipformer_ctc.model.c_str();
+
   c.lm_config.model = config.lm_config.model.c_str();
   c.lm_config.scale = config.lm_config.scale;
diff --git a/sherpa-onnx/c-api/cxx-api.h b/sherpa-onnx/c-api/cxx-api.h
index e755040a..1ca0231f 100644
--- a/sherpa-onnx/c-api/cxx-api.h
+++ b/sherpa-onnx/c-api/cxx-api.h
@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
   std::string model;
 };
 
+struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
+  std::string model;
+};
+
 struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
   std::string preprocessor;
   std::string encoder;
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
   OfflineMoonshineModelConfig moonshine;
   OfflineFireRedAsrModelConfig fire_red_asr;
   OfflineDolphinModelConfig dolphin;
+  OfflineZipformerCtcModelConfig zipformer_ctc;
 };
 
 struct SHERPA_ONNX_API OfflineLMConfig {
diff --git a/sherpa-onnx/csrc/offline-ctc-model.cc b/sherpa-onnx/csrc/offline-ctc-model.cc
index fb3089fa..f115af90 100644
--- a/sherpa-onnx/csrc/offline-ctc-model.cc
+++ b/sherpa-onnx/csrc/offline-ctc-model.cc
@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     const OfflineModelConfig &config) {
   if (!config.dolphin.model.empty()) {
     return std::make_unique<OfflineDolphinModel>(config);
+  } else if (!config.nemo_ctc.model.empty()) {
+    return std::make_unique<OfflineNemoEncDecCtcModel>(config);
+  } else if (!config.tdnn.model.empty()) {
+    return std::make_unique<OfflineTdnnCtcModel>(config);
+  } else if (!config.zipformer_ctc.model.empty()) {
+    return std::make_unique<OfflineZipformerCtcModel>(config);
+  } else if (!config.wenet_ctc.model.empty()) {
+    return std::make_unique<OfflineWenetCtcModel>(config);
+  } else if (!config.telespeech_ctc.empty()) {
+    return std::make_unique<OfflineTeleSpeechCtcModel>(config);
   }
 
   // TODO(fangjun): Refactor it. We don't need to use model_type here
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
     Manager *mgr, const OfflineModelConfig &config) {
   if (!config.dolphin.model.empty()) {
     return std::make_unique<OfflineDolphinModel>(mgr, config);
+  } else if (!config.nemo_ctc.model.empty()) {
+    return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
+  } else if (!config.tdnn.model.empty()) {
+    return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
+  } else if (!config.zipformer_ctc.model.empty()) {
+    return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
+  } else if (!config.wenet_ctc.model.empty()) {
+    return std::make_unique<OfflineWenetCtcModel>(mgr, config);
+  } else if (!config.telespeech_ctc.empty()) {
+    return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
   }
 
   // TODO(fangjun): Refactor it.
We don't need to use model_type here diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 6bc067e5..fd699e88 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java java_files += OfflineFireRedAsrModelConfig.java java_files += OfflineMoonshineModelConfig.java java_files += OfflineNemoEncDecCtcModelConfig.java +java_files += OfflineZipformerCtcModelConfig.java java_files += OfflineSenseVoiceModelConfig.java java_files += OfflineDolphinModelConfig.java java_files += OfflineModelConfig.java diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java index 34f105b6..a0cdfaef 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java @@ -11,6 +11,7 @@ public class OfflineModelConfig { private final OfflineNemoEncDecCtcModelConfig nemo; private final OfflineSenseVoiceModelConfig senseVoice; private final OfflineDolphinModelConfig dolphin; + private final OfflineZipformerCtcModelConfig zipformerCtc; private final String teleSpeech; private final String tokens; private final int numThreads; @@ -28,6 +29,7 @@ public class OfflineModelConfig { this.fireRedAsr = builder.fireRedAsr; this.moonshine = builder.moonshine; this.nemo = builder.nemo; + this.zipformerCtc = builder.zipformerCtc; this.senseVoice = builder.senseVoice; this.dolphin = builder.dolphin; this.teleSpeech = builder.teleSpeech; @@ -52,7 +54,7 @@ public class OfflineModelConfig { return transducer; } - public OfflineWhisperModelConfig getZipformer2Ctc() { + public OfflineWhisperModelConfig getWhisper() { return whisper; } @@ -68,6 +70,14 @@ public class OfflineModelConfig { return dolphin; } + public OfflineNemoEncDecCtcModelConfig getNemo() { + return nemo; + } + + public OfflineZipformerCtcModelConfig getZipformerCtc() { + return zipformerCtc; + } + public String getTokens() { return tokens; } @@ -109,6 +119,7 @@ public class OfflineModelConfig { private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build(); private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build(); + private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build(); private String teleSpeech = ""; private String tokens = ""; private int numThreads = 1; @@ -142,6 +153,11 @@ public class OfflineModelConfig { return this; } + public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) { + this.zipformerCtc = zipformerCtc; + return this; + } + public Builder setTeleSpeech(String teleSpeech) { this.teleSpeech = teleSpeech; return this; diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig.java new file mode 100644 index 00000000..115f0c2d --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig.java @@ -0,0 +1,32 @@ +// Copyright 2025 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineZipformerCtcModelConfig { + private final String model; + + private OfflineZipformerCtcModelConfig(Builder builder) { + this.model = builder.model; + } + + public static Builder builder() 
{ + return new Builder(); + } + + public String getModel() { + return model; + } + + public static class Builder { + private String model = ""; + + public OfflineZipformerCtcModelConfig build() { + return new OfflineZipformerCtcModelConfig(this); + } + + public Builder setModel(String model) { + this.model = model; + return this; + } + } +} diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index 391ec422..7652c398 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { ans.model_config.nemo_ctc.model = p; env->ReleaseStringUTFChars(s, p); + // zipformer ctc + fid = + env->GetFieldID(model_config_cls, "zipformerCtc", + "Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;"); + jobject zipformer_ctc_config = env->GetObjectField(model_config, fid); + jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config); + + fid = + env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;"); + + s = (jstring)env->GetObjectField(zipformer_ctc_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model_config.zipformer_ctc.model = p; + env->ReleaseStringUTFChars(s, p); + // dolphin fid = env->GetFieldID(model_config_cls, "dolphin", "Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;"); diff --git a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt index a06c23a9..6a6f19db 100644 --- a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt @@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig( var model: String = "", ) +data class OfflineZipformerCtcModelConfig( + var model: String = "", +) + data class OfflineWhisperModelConfig( var encoder: String = "", var decoder: String = "", @@ -64,6 +68,7 @@ data class OfflineModelConfig( var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(), var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(), + var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(), var teleSpeech: String = "", var numThreads: Int = 1, var debug: Boolean = false, @@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? { modelType = "nemo_transducer", ) } + + 31 -> { + val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03" + return OfflineModelConfig( + zipformerCtc = OfflineZipformerCtcModelConfig( + model = "$modelDir/model.int8.onnx", + ), + tokens = "$modelDir/tokens.txt", + ) + } } return null } diff --git a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt index e19ae3ed..1dff88ec 100644 --- a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt @@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? { model = "$modelDir/model.onnx", ), tokens = "$modelDir/tokens.txt", + modelType = "zipformer2", ) } @@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? 
{ model = "$modelDir/model.fp16.onnx", ), tokens = "$modelDir/tokens.txt", + modelType = "zipformer2", ) } diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index d50a3cb8..bf5e2ea7 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -284,6 +284,11 @@ type function ToString: AnsiString; end; + TSherpaOnnxOfflineZipformerCtcModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + TSherpaOnnxOfflineWhisperModelConfig = record Encoder: AnsiString; Decoder: AnsiString; @@ -346,6 +351,7 @@ type Moonshine: TSherpaOnnxOfflineMoonshineModelConfig; FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig; Dolphin: TSherpaOnnxOfflineDolphinModelConfig; + ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); function ToString: AnsiString; end; @@ -726,6 +732,9 @@ type SherpaOnnxOfflineDolphinModelConfig = record Model: PAnsiChar; end; + SherpaOnnxOfflineZipformerCtcModelConfig = record + Model: PAnsiChar; + end; SherpaOnnxOfflineWhisperModelConfig = record Encoder: PAnsiChar; Decoder: PAnsiChar; @@ -773,6 +782,7 @@ type Moonshine: SherpaOnnxOfflineMoonshineModelConfig; FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig; Dolphin: SherpaOnnxOfflineDolphinModelConfig; + ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig; end; SherpaOnnxOfflineRecognizerConfig = record @@ -1536,6 +1546,12 @@ begin [Self.Model]); end; +function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)', + [Self.Model]); +end; + function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString; begin Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' + @@ -1610,14 +1626,15 @@ begin 'SenseVoice := %s, ' + 'Moonshine := %s, ' + 'FireRedAsr := %s, ' + - 'Dolphin := %s' + + 'Dolphin := %s, ' + + 'ZipformerCtc := %s' + ')', [Self.Transducer.ToString, Self.Paraformer.ToString, Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString, Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider, Self.ModelType, Self.ModelingUnit, Self.BpeVocab, Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString, - Self.FireRedAsr.ToString, Self.Dolphin.ToString + Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString ]); end; @@ -1688,6 +1705,7 @@ begin C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder); C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model); + C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model); C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model); C.LMConfig.Scale := Config.LMConfig.Scale; diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index eae50c4e..5a0475ec 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -527,6 +527,87 @@ class OfflineRecognizer(object): self.config = recognizer_config return self + @classmethod + def from_zipformer_ctc( + cls, + model: str, + tokens: str, + num_threads: int = 1, + sample_rate: int = 16000, + feature_dim: int = 80, + decoding_method: str = "greedy_search", + debug: bool = False, + provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", + hr_dict_dir: str = "", + hr_rule_fsts: str = 
"", + hr_lexicon: str = "", + ): + """ + Please refer to + ``_ + to download pre-trained models for different languages, e.g., Chinese, + English, etc. + + Args: + model: + Path to ``model.onnx``. + tokens: + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two + columns:: + + symbol integer_id + + num_threads: + Number of threads for neural network computation. + sample_rate: + Sample rate of the training data used to train the model. + feature_dim: + Dimension of the feature used to train the model. + decoding_method: + Valid values are greedy_search. + debug: + True to show debug messages. + provider: + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. + """ + self = cls.__new__(cls) + model_config = OfflineModelConfig( + zipformer_ctc=OfflineZipformerCtcModelConfig(model=model), + tokens=tokens, + num_threads=num_threads, + debug=debug, + provider=provider, + ) + + feat_config = FeatureExtractorConfig( + sampling_rate=sample_rate, + feature_dim=feature_dim, + ) + + recognizer_config = OfflineRecognizerConfig( + feat_config=feat_config, + model_config=model_config, + decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, + hr=HomophoneReplacerConfig( + dict_dir=hr_dict_dir, + lexicon=hr_lexicon, + rule_fsts=hr_rule_fsts, + ), + ) + self.recognizer = _Recognizer(recognizer_config) + self.config = recognizer_config + return self + @classmethod def from_nemo_ctc( cls, diff --git a/swift-api-examples/.gitignore b/swift-api-examples/.gitignore index 2363d0ec..adcecceb 100644 --- a/swift-api-examples/.gitignore +++ b/swift-api-examples/.gitignore @@ -16,3 +16,6 @@ tts-kokoro-en tts-kokoro-zh-en speech-enhancement-gtcrn decode-file-sense-voice-with-hr +test-version +zipformer-ctc-asr +dolphin-ctc-asr diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 4c01975e..29da5dea 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig( ) } +func sherpaOnnxOfflineZipformerCtcModelConfig( + model: String = "" +) -> SherpaOnnxOfflineZipformerCtcModelConfig { + return SherpaOnnxOfflineZipformerCtcModelConfig( + model: toCPointer(model) + ) +} + func sherpaOnnxOfflineNemoEncDecCtcModelConfig( model: String = "" ) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig { @@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig( senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(), moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(), fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(), - dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig() + dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(), + zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig = + sherpaOnnxOfflineZipformerCtcModelConfig() ) -> SherpaOnnxOfflineModelConfig { return SherpaOnnxOfflineModelConfig( transducer: transducer, @@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig( sense_voice: senseVoice, moonshine: moonshine, fire_red_asr: fireRedAsr, - dolphin: dolphin + dolphin: dolphin, + 
zipformer_ctc: zipformerCtc ) } diff --git a/swift-api-examples/run-zipformer-ctc-asr.sh b/swift-api-examples/run-zipformer-ctc-asr.sh new file mode 100755 index 00000000..f04ecaec --- /dev/null +++ b/swift-api-examples/run-zipformer-ctc-asr.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d ../build-swift-macos ]; then + echo "Please run ../build-swift-macos.sh first!" + exit 1 +fi + +if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then + echo "Please download the pre-trained model for testing." + echo "You can refer to" + echo "" + echo "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese" + echo "" + echo "for help" + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + + tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2 + ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03 +fi + +if [ ! -e ./zipformer-ctc-asr ]; then + # Note: We use -lc++ to link against libc++ instead of libstdc++ + swiftc \ + -lc++ \ + -I ../build-swift-macos/install/include \ + -import-objc-header ./SherpaOnnx-Bridging-Header.h \ + ./zipformer-ctc-asr.swift ./SherpaOnnx.swift \ + -L ../build-swift-macos/install/lib/ \ + -l sherpa-onnx \ + -l onnxruntime \ + -o zipformer-ctc-asr + + strip zipformer-ctc-asr +else + echo "./zipformer-ctc-asr exists - skip building" +fi + +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH +./zipformer-ctc-asr diff --git a/swift-api-examples/zipformer-ctc-asr.swift b/swift-api-examples/zipformer-ctc-asr.swift new file mode 100644 index 00000000..d55b2ecb --- /dev/null +++ b/swift-api-examples/zipformer-ctc-asr.swift @@ -0,0 +1,66 @@ +import AVFoundation + +extension AudioBuffer { + func array() -> [Float] { + return Array(UnsafeBufferPointer(self)) + } +} + +extension AVAudioPCMBuffer { + func array() -> [Float] { + return self.audioBufferList.pointee.mBuffers.array() + } +} + +func run() { + let model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx" + let tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt" + + let zipformerCtc = sherpaOnnxOfflineZipformerCtcModelConfig( + model: model + ) + + let modelConfig = sherpaOnnxOfflineModelConfig( + tokens: tokens, + debug: 0, + zipformerCtc: zipformerCtc + ) + + let featConfig = sherpaOnnxFeatureConfig( + sampleRate: 16000, + featureDim: 80 + ) + var config = sherpaOnnxOfflineRecognizerConfig( + featConfig: featConfig, + modelConfig: modelConfig + ) + + let recognizer = SherpaOnnxOfflineRecognizer(config: &config) + + let filePath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav" + let fileURL: NSURL = NSURL(fileURLWithPath: filePath) + let audioFile = try! AVAudioFile(forReading: fileURL as URL) + + let audioFormat = audioFile.processingFormat + assert(audioFormat.channelCount == 1) + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) + + let audioFrameCount = UInt32(audioFile.length) + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) + + try! audioFile.read(into: audioFileBuffer!) + let array: [Float]! 
= audioFileBuffer?.array() + let result = recognizer.decode(samples: array, sampleRate: Int(audioFormat.sampleRate)) + print("\nresult is:\n\(result.text)") + if result.timestamps.count != 0 { + print("\ntimestamps is:\n\(result.timestamps)") + } + +} + +@main +struct App { + static func main() { + run() + } +} diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 3c6324ef..78206ec0 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -43,6 +43,10 @@ function freeConfig(config, Module) { freeConfig(config.dolphin, Module) } + if ('zipformerCtc' in config) { + freeConfig(config.zipformerCtc, Module) + } + if ('moonshine' in config) { freeConfig(config.moonshine, Module) } @@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) { } } +function initSherpaOnnxOfflineZipformerCtcModelConfig(config, Module) { + const n = Module.lengthBytesUTF8(config.model || '') + 1; + + const buffer = Module._malloc(n); + + const len = 1 * 4; // 1 pointer + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model || '', buffer, n); + + Module.setValue(ptr, buffer, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineWhisperModelConfig(config, Module) { const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; @@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { }; } + if (!('zipformerCtc' in config)) { + config.zipformerCtc = { + model: '', + }; + } + if (!('whisper' in config)) { config.whisper = { encoder: '', @@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const dolphin = initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module); + const zipformerCtc = + initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module); + const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len + - dolphin.len; + dolphin.len + zipformerCtc.len; const ptr = Module._malloc(len); @@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset); offset += dolphin.len; + Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset); + offset += zipformerCtc.len; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn, senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr, - dolphin: dolphin + dolphin: dolphin, zipformerCtc: zipformerCtc } } diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 0a5bb6f6..d93c9812 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -13,6 +13,7 @@ extern "C" { static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, ""); +static_assert(sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, ""); static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, ""); @@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) + sizeof(SherpaOnnxOfflineMoonshineModelConfig) + 
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) + - sizeof(SherpaOnnxOfflineDolphinModelConfig), + sizeof(SherpaOnnxOfflineDolphinModelConfig) + + sizeof(SherpaOnnxOfflineZipformerCtcModelConfig), ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); @@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { auto moonshine = &model_config->moonshine; auto fire_red_asr = &model_config->fire_red_asr; auto dolphin = &model_config->dolphin; + auto zipformer_ctc = &model_config->zipformer_ctc; fprintf(stdout, "----------offline transducer model config----------\n"); fprintf(stdout, "encoder: %s\n", transducer->encoder); @@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "----------offline Dolphin model config----------\n"); fprintf(stdout, "model: %s\n", dolphin->model); + fprintf(stdout, "----------offline zipformer ctc model config----------\n"); + fprintf(stdout, "model: %s\n", zipformer_ctc->model); + fprintf(stdout, "tokens: %s\n", model_config->tokens); fprintf(stdout, "num_threads: %d\n", model_config->num_threads); fprintf(stdout, "provider: %s\n", model_config->provider); diff --git a/wasm/vad-asr/app-vad-asr.js b/wasm/vad-asr/app-vad-asr.js index 159e48cc..226255e4 100644 --- a/wasm/vad-asr/app-vad-asr.js +++ b/wasm/vad-asr/app-vad-asr.js @@ -117,6 +117,10 @@ function initOfflineRecognizer() { }; } else if (fileExists('dolphin.onnx')) { config.modelConfig.dolphin = {model: './dolphin.onnx'}; + } else if (fileExists('zipformer-ctc.onnx')) { + // you need to rename model.int8.onnx from zipformer CTC to + // zipformer-ctc.onnx + config.modelConfig.zipformerCtc = {model: './zipformer-ctc.onnx'}; } else { console.log('Please specify a model.'); alert('Please specify a model.');