Support non-streaming zipformer CTC ASR models (#2340)
This PR adds support for non-streaming Zipformer CTC ASR models across multiple language bindings, WebAssembly, examples, and CI workflows. - Introduces a new OfflineZipformerCtcModelConfig in C/C++, Python, Swift, Java, Kotlin, Go, Dart, Pascal, and C# APIs - Updates initialization, freeing, and recognition logic to include Zipformer CTC in WASM and Node.js - Adds example scripts and CI steps for downloading, building, and running Zipformer CTC models Model doc is available at https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html
This commit is contained in:
8
.github/scripts/test-dart.sh
vendored
8
.github/scripts/test-dart.sh
vendored
@@ -6,6 +6,10 @@ cd dart-api-examples
|
||||
|
||||
pushd non-streaming-asr
|
||||
|
||||
echo '----------Zipformer CTC----------'
|
||||
./run-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
echo '----------SenseVoice----------'
|
||||
./run-sense-voice-with-hr.sh
|
||||
./run-sense-voice.sh
|
||||
@@ -114,6 +118,10 @@ popd
|
||||
|
||||
pushd vad-with-non-streaming-asr
|
||||
|
||||
echo '----------Zipformer CTC----------'
|
||||
./run-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
echo '----------Dolphin CTC----------'
|
||||
./run-dolphin-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
75
.github/scripts/test-dot-net.sh
vendored
75
.github/scripts/test-dot-net.sh
vendored
@@ -6,43 +6,11 @@ cd ./version-test
|
||||
./run.sh
|
||||
ls -lh
|
||||
|
||||
cd ../speech-enhancement-gtcrn
|
||||
./run.sh
|
||||
ls -lh
|
||||
|
||||
cd ../kokoro-tts
|
||||
./run-kokoro.sh
|
||||
ls -lh
|
||||
|
||||
cd ../offline-tts
|
||||
./run-matcha-zh.sh
|
||||
ls -lh *.wav
|
||||
./run-matcha-en.sh
|
||||
ls -lh *.wav
|
||||
./run-aishell3.sh
|
||||
ls -lh *.wav
|
||||
./run-piper.sh
|
||||
ls -lh *.wav
|
||||
./run-hf-fanchen.sh
|
||||
ls -lh *.wav
|
||||
ls -lh
|
||||
|
||||
pushd ../..
|
||||
|
||||
mkdir tts
|
||||
|
||||
cp -v dotnet-examples/kokoro-tts/*.wav ./tts
|
||||
cp -v dotnet-examples/offline-tts/*.wav ./tts
|
||||
popd
|
||||
|
||||
cd ../offline-speaker-diarization
|
||||
./run.sh
|
||||
rm -rfv *.onnx
|
||||
rm -fv *.wav
|
||||
rm -rfv sherpa-onnx-pyannote-*
|
||||
|
||||
cd ../offline-decode-files
|
||||
|
||||
./run-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
./run-dolphin-ctc.sh
|
||||
rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
|
||||
|
||||
@@ -82,6 +50,41 @@ rm -rf sherpa-onnx-*
|
||||
./run-tdnn-yesno.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
cd ../speech-enhancement-gtcrn
|
||||
./run.sh
|
||||
ls -lh
|
||||
|
||||
cd ../kokoro-tts
|
||||
./run-kokoro.sh
|
||||
ls -lh
|
||||
|
||||
cd ../offline-tts
|
||||
./run-matcha-zh.sh
|
||||
ls -lh *.wav
|
||||
./run-matcha-en.sh
|
||||
ls -lh *.wav
|
||||
./run-aishell3.sh
|
||||
ls -lh *.wav
|
||||
./run-piper.sh
|
||||
ls -lh *.wav
|
||||
./run-hf-fanchen.sh
|
||||
ls -lh *.wav
|
||||
ls -lh
|
||||
|
||||
pushd ../..
|
||||
|
||||
mkdir tts
|
||||
|
||||
cp -v dotnet-examples/kokoro-tts/*.wav ./tts
|
||||
cp -v dotnet-examples/offline-tts/*.wav ./tts
|
||||
popd
|
||||
|
||||
cd ../offline-speaker-diarization
|
||||
./run.sh
|
||||
rm -rfv *.onnx
|
||||
rm -fv *.wav
|
||||
rm -rfv sherpa-onnx-pyannote-*
|
||||
|
||||
cd ../keyword-spotting-from-files
|
||||
./run.sh
|
||||
|
||||
@@ -115,5 +118,3 @@ rm -rf sherpa-onnx-*
|
||||
cd ../spoken-language-identification
|
||||
./run.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
|
||||
|
||||
|
||||
9
.github/scripts/test-nodejs-addon-npm.sh
vendored
9
.github/scripts/test-nodejs-addon-npm.sh
vendored
@@ -10,6 +10,15 @@ arch=$(node -p "require('os').arch()")
|
||||
platform=$(node -p "require('os').platform()")
|
||||
node_version=$(node -p "process.versions.node.split('.')[0]")
|
||||
|
||||
echo "----------non-streaming ASR Zipformer CTC----------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
node ./test_asr_non_streaming_zipformer_ctc.js
|
||||
rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
|
||||
|
||||
echo "----------non-streaming ASR NeMo parakeet tdt----------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
|
||||
tar xvf sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
|
||||
|
||||
9
.github/scripts/test-nodejs-npm.sh
vendored
9
.github/scripts/test-nodejs-npm.sh
vendored
@@ -9,6 +9,15 @@ git status
|
||||
ls -lh
|
||||
ls -lh node_modules
|
||||
|
||||
# asr with offline zipformer ctc
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
node ./test-offline-zipformer-ctc.js
|
||||
rm -rf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
|
||||
|
||||
# asr with offline dolphin ctc
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||
|
||||
3
.github/scripts/test-swift.sh
vendored
3
.github/scripts/test-swift.sh
vendored
@@ -9,6 +9,9 @@ ls -lh
|
||||
|
||||
./run-test-version.sh
|
||||
|
||||
./run-zipformer-ctc-asr.sh
|
||||
rm -rf sherpa-onnx-zipformer-*
|
||||
|
||||
./run-decode-file-sense-voice-with-hr.sh
|
||||
rm -rf sherpa-onnx-sense-voice-*
|
||||
rm -rf dict lexicon.txt replace.fst test-hr.wav
|
||||
|
||||
@@ -89,6 +89,7 @@ jobs:
|
||||
make -j4 install
|
||||
|
||||
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
|
||||
cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
|
||||
|
||||
rm -rf install/lib/pkgconfig
|
||||
rm -fv install/lib/cargs.h
|
||||
@@ -135,6 +136,7 @@ jobs:
|
||||
make -j4 install
|
||||
|
||||
cp -v bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
|
||||
cp -v bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
|
||||
|
||||
rm -rf install/lib/pkgconfig
|
||||
rm -fv install/lib/cargs.h
|
||||
|
||||
@@ -90,6 +90,7 @@ jobs:
|
||||
make install
|
||||
|
||||
cp bin/sense-voice-simulate-streaming-alsa-cxx-api install/bin
|
||||
cp bin/zipformer-ctc-simulate-streaming-alsa-cxx-api install/bin
|
||||
|
||||
ls -lh install/lib
|
||||
|
||||
|
||||
13
.github/workflows/pascal.yaml
vendored
13
.github/workflows/pascal.yaml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, macos-13, windows-latest]
|
||||
os: [ubuntu-latest, macos-latest, macos-13, windows-latest, ubuntu-22.04-arm]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -56,7 +56,7 @@ jobs:
|
||||
key: ${{ matrix.os }}
|
||||
|
||||
- name: Install Free pascal compiler (ubuntu)
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
if: matrix.os == 'ubuntu-latest' || matrix.os == 'ubuntu-22.04-arm'
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt-get update
|
||||
@@ -156,6 +156,10 @@ jobs:
|
||||
|
||||
pushd non-streaming-asr
|
||||
|
||||
./run-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
./run-dolphin-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
@@ -264,9 +268,12 @@ jobs:
|
||||
|
||||
cd ./pascal-api-examples
|
||||
|
||||
|
||||
pushd vad-with-non-streaming-asr
|
||||
|
||||
time ./run-vad-with-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
time ./run-vad-with-dolphin-ctc.sh
|
||||
rm -rf sherpa-onnx-*
|
||||
echo "---"
|
||||
|
||||
3
.github/workflows/run-java-test.yaml
vendored
3
.github/workflows/run-java-test.yaml
vendored
@@ -165,6 +165,9 @@ jobs:
|
||||
run: |
|
||||
cd ./java-api-examples
|
||||
|
||||
./run-non-streaming-decode-file-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-zipformer-ctc-*
|
||||
|
||||
./run-non-streaming-decode-file-dolphin-ctc.sh
|
||||
rm -rf sherpa-onnx-dolphin-*
|
||||
|
||||
|
||||
4
.github/workflows/test-go.yaml
vendored
4
.github/workflows/test-go.yaml
vendored
@@ -184,6 +184,10 @@ jobs:
|
||||
go build
|
||||
ls -lh
|
||||
|
||||
echo "Test Zipformer CTC"
|
||||
./run-zipformer-ctc.sh
|
||||
rm -rf sherpa-onnx-zipformer-*
|
||||
|
||||
echo "Test SenseVoice ctc"
|
||||
./run-sense-voice-small-with-hr.sh
|
||||
./run-sense-voice-small.sh
|
||||
|
||||
27
.github/workflows/upload-models.yaml
vendored
27
.github/workflows/upload-models.yaml
vendored
@@ -19,12 +19,36 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
python-version: ["3.8"]
|
||||
python-version: ["3.10"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Zipformer CTC (non-streaming)
|
||||
shell: bash
|
||||
run: |
|
||||
git lfs install
|
||||
names=(
|
||||
sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
|
||||
sherpa-onnx-zipformer-ctc-zh-2025-07-03
|
||||
sherpa-onnx-zipformer-ctc-zh-fp16-2025-07-03
|
||||
)
|
||||
for name in ${names[@]}; do
|
||||
git clone https://huggingface.co/csukuangfj/$name
|
||||
pushd $name
|
||||
git lfs pull
|
||||
rm -rf .git
|
||||
rm -rfv .gitattributes
|
||||
ls -lh
|
||||
popd
|
||||
|
||||
tar cjfv $name.tar.bz2 $name
|
||||
rm -rf $name
|
||||
ls -lh *.tar.bz2
|
||||
done
|
||||
|
||||
- name: Vietnamese (zipformer)
|
||||
if: false
|
||||
shell: bash
|
||||
run: |
|
||||
rm -rf models
|
||||
@@ -76,6 +100,7 @@ jobs:
|
||||
mv models/* .
|
||||
|
||||
- name: Publish to huggingface (Vietnamese zipformer)
|
||||
if: false
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
uses: nick-fields/retry@v3
|
||||
|
||||
16
README.md
16
README.md
@@ -114,6 +114,7 @@ We also have spaces built using WebAssembly. They are listed below:
|
||||
|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]|
|
||||
|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]|
|
||||
|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]|
|
||||
|VAD + speech recognition (Chinese) with [Zipformer CTC](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|[Click me][wasm-hf-vad-asr-zh-zipformer-ctc-07-03]| [地址][wasm-ms-vad-asr-zh-zipformer-ctc-07-03]|
|
||||
|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]|
|
||||
|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]|
|
||||
|VAD + speech recognition (English) with [Moonshine tiny][Moonshine tiny]|[Click me][wasm-hf-vad-asr-en-moonshine-tiny-en]| [地址][wasm-ms-vad-asr-en-moonshine-tiny-en]|
|
||||
@@ -141,6 +142,7 @@ We also have spaces built using WebAssembly. They are listed below:
|
||||
|----------------------------------------|------------------------------------|-----------------------------------|
|
||||
| Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]|
|
||||
| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] |
|
||||
| Simulated-streaming speech recognition | [Address][apk-simula-streaming-asr]| [点此][apk-simula-streaming-asr-cn]|
|
||||
| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] |
|
||||
| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] |
|
||||
| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] |
|
||||
@@ -250,8 +252,10 @@ for more models. The following table lists only **SOME** of them.
|
||||
|
||||
|Name | Supported Languages| Description|
|
||||
|-----|-----|----|
|
||||
|[sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english)| English | It is converted from <https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2>|
|
||||
|[Whisper tiny.en](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2)|English| See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html)|
|
||||
|[Moonshine tiny][Moonshine tiny]|English|See [also](https://github.com/usefulsensors/moonshine)|
|
||||
|[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese)|Chinese| A Zipformer CTC model|
|
||||
|[sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17][sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17]|Chinese, Cantonese, English, Korean, Japanese| 支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html)|
|
||||
|[sherpa-onnx-paraformer-zh-2024-03-09][sherpa-onnx-paraformer-zh-2024-03-09]|Chinese, English| 也支持多种中文方言. See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2024-03-09-chinese-english)|
|
||||
|[sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01][sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01]|Japanese|See [also](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01-japanese)|
|
||||
@@ -413,6 +417,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
|
||||
[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en
|
||||
[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en
|
||||
[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice
|
||||
[wasm-hf-vad-asr-zh-zipformer-ctc-07-03]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc
|
||||
[wasm-ms-vad-asr-zh-zipformer-ctc-07-03]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc/summary
|
||||
[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice
|
||||
[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice
|
||||
[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny
|
||||
@@ -423,20 +429,20 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
|
||||
[wasm-ms-vad-asr-en-zipformer-gigaspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech
|
||||
[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
|
||||
[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech
|
||||
[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
|
||||
[reazonspeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
|
||||
[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
|
||||
[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer
|
||||
[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2
|
||||
[gigaspeech2]: https://github.com/speechcolab/gigaspeech2
|
||||
[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer
|
||||
[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer
|
||||
[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR
|
||||
[telespeech-asr]: https://github.com/tele-ai/telespeech-asr
|
||||
[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
|
||||
[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech
|
||||
[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
|
||||
[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
|
||||
[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
|
||||
[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
|
||||
[Dolphin]: https://github.com/DataoceanAI/Dolphin
|
||||
[dolphin]: https://github.com/dataoceanai/dolphin
|
||||
[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
|
||||
[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
|
||||
|
||||
@@ -450,6 +456,8 @@ It uses sherpa-onnx for speech-to-text and text-to-speech.
|
||||
[apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html
|
||||
[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html
|
||||
[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html
|
||||
[apk-simula-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr.html
|
||||
[apk-simula-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-simulate-streaming-asr-cn.html
|
||||
[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html
|
||||
[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html
|
||||
[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html
|
||||
|
||||
@@ -45,6 +45,15 @@ if(SHERPA_ONNX_ENABLE_PORTAUDIO)
|
||||
sherpa-onnx-cxx-api
|
||||
portaudio_static
|
||||
)
|
||||
|
||||
add_executable(zipformer-ctc-simulate-streaming-microphone-cxx-api
|
||||
./zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
|
||||
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/microphone.cc
|
||||
)
|
||||
target_link_libraries(zipformer-ctc-simulate-streaming-microphone-cxx-api
|
||||
sherpa-onnx-cxx-api
|
||||
portaudio_static
|
||||
)
|
||||
endif()
|
||||
|
||||
if(SHERPA_ONNX_HAS_ALSA)
|
||||
@@ -57,10 +66,21 @@ if(SHERPA_ONNX_HAS_ALSA)
|
||||
portaudio_static
|
||||
)
|
||||
|
||||
add_executable(zipformer-ctc-simulate-streaming-alsa-cxx-api
|
||||
./zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
|
||||
${CMAKE_CURRENT_LIST_DIR}/../sherpa-onnx/csrc/alsa.cc
|
||||
)
|
||||
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api
|
||||
sherpa-onnx-cxx-api
|
||||
portaudio_static
|
||||
)
|
||||
|
||||
if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
|
||||
target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
|
||||
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
|
||||
else()
|
||||
target_link_libraries(sense-voice-simulate-streaming-alsa-cxx-api asound)
|
||||
target_link_libraries(zipformer-ctc-simulate-streaming-alsa-cxx-api asound)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -0,0 +1,240 @@
|
||||
// cxx-api-examples/zipformer-ctc-simulate-streaming-alsa-cxx-api.cc
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
//
|
||||
// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API
|
||||
// for simulated streaming speech recognition from an ALSA capture device.
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
//
|
||||
// clang-format on
|
||||
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <chrono> // NOLINT
|
||||
#include <condition_variable> // NOLINT
|
||||
#include <iostream>
|
||||
#include <mutex> // NOLINT
|
||||
#include <queue>
|
||||
#include <thread> // NOLINT
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-display.h" // NOLINT
|
||||
#include "sherpa-onnx/c-api/cxx-api.h"
|
||||
#include "sherpa-onnx/csrc/alsa.h"
|
||||
|
||||
std::queue<std::vector<float>> samples_queue;
|
||||
std::condition_variable condition_variable;
|
||||
std::mutex mutex;
|
||||
bool stop = false;
|
||||
|
||||
// SIGINT handler. Sets the global `stop` flag, wakes one thread waiting on
// the global condition variable, and prints a notice to stderr.
//
// NOTE(review): `stop` is a plain bool and neither notify_one() nor stdio
// output is async-signal-safe; consider revisiting together with the globals.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fputs("\nCaught Ctrl + C. Exiting...\n", stderr);
}
|
||||
|
||||
// Body of the recording thread.
//
// Each iteration reads a chunk whose length is 0.1 times the device's actual
// sample rate, appends it to the global samples queue while holding the
// global mutex, and notifies one waiter. The loop ends once `stop` is set.
//
// @param alsa  Capture device to read from.
static void RecordCallback(sherpa_onnx::Alsa *alsa) {
  const int32_t num_samples_per_read = 0.1 * alsa->GetActualSampleRate();

  for (;;) {
    if (stop) {
      break;
    }

    std::vector<float> chunk = alsa->Read(num_samples_per_read);

    // The guard is held until the end of this iteration, so the push and the
    // notification both happen under the mutex.
    std::lock_guard<std::mutex> guard(mutex);
    samples_queue.push(std::move(chunk));
    condition_variable.notify_one();
  }
}
|
||||
|
||||
// Creates the voice activity detector used by this example.
//
// The configuration points at ./silero_vad.onnx and uses a sample rate of
// 16000. If the detector cannot be created, an error is written to stderr
// and the process exits with status -1.
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  namespace cxx = sherpa_onnx::cxx;

  cxx::VadModelConfig vad_config;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.5;
  silero.min_silence_duration = 0.1;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 8;

  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  // NOTE(review): the second argument (20) is presumably the internal buffer
  // size in seconds -- confirm against the cxx-api header.
  auto vad = cxx::VoiceActivityDetector::Create(vad_config, 20);
  if (vad.Get()) {
    return vad;
  }

  std::cerr << "Failed to create VAD. Please check your config\n";
  exit(-1);
}
|
||||
|
||||
// Creates the non-streaming recognizer backed by the Zipformer CTC model
// located in ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.
//
// Progress messages go to stdout. If creation fails, an error is written to
// stderr and the process exits with status -1.
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  namespace cxx = sherpa_onnx::cxx;

  cxx::OfflineRecognizerConfig recognizer_config;

  auto &model = recognizer_config.model_config;
  model.zipformer_ctc.model =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
  model.tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  model.num_threads = 2;
  model.debug = false;

  std::cout << "Loading model\n";
  auto recognizer = cxx::OfflineRecognizer::Create(recognizer_config);
  if (!recognizer.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return recognizer;
}
|
||||
|
||||
int32_t main(int32_t argc, const char *argv[]) {
|
||||
const char *kUsageMessage = R"usage(
|
||||
Usage:
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
./zipformer-ctc-simulate-streaming-alsa-cxx-api device_name
|
||||
|
||||
The device name specifies which microphone to use in case there are several
|
||||
on your system. You can use
|
||||
|
||||
arecord -l
|
||||
|
||||
to find all available microphones on your computer. For instance, if it outputs
|
||||
|
||||
**** List of CAPTURE Hardware Devices ****
|
||||
card 3: UACDemoV10 [UACDemoV1.0], device 0: USB Audio [USB Audio]
|
||||
Subdevices: 1/1
|
||||
Subdevice #0: subdevice #0
|
||||
|
||||
and if you want to select card 3 and device 0 on that card, please use:
|
||||
|
||||
plughw:3,0
|
||||
|
||||
as the device_name.
|
||||
)usage";
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "%s\n", kUsageMessage);
|
||||
return -1;
|
||||
}
|
||||
|
||||
signal(SIGINT, Handler);
|
||||
|
||||
using namespace sherpa_onnx::cxx; // NOLINT
|
||||
|
||||
auto vad = CreateVad();
|
||||
auto recognizer = CreateOfflineRecognizer();
|
||||
|
||||
int32_t expected_sample_rate = 16000;
|
||||
|
||||
std::string device_name = argv[1];
|
||||
sherpa_onnx::Alsa alsa(device_name.c_str());
|
||||
fprintf(stderr, "Use recording device: %s\n", device_name.c_str());
|
||||
|
||||
if (alsa.GetExpectedSampleRate() != expected_sample_rate) {
|
||||
fprintf(stderr, "sample rate: %d != %d\n", alsa.GetExpectedSampleRate(),
|
||||
expected_sample_rate);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int32_t window_size = 512; // samples, please don't change
|
||||
|
||||
int32_t offset = 0;
|
||||
std::vector<float> buffer;
|
||||
bool speech_started = false;
|
||||
|
||||
auto started_time = std::chrono::steady_clock::now();
|
||||
|
||||
SherpaDisplay display;
|
||||
|
||||
std::thread record_thread(RecordCallback, &alsa);
|
||||
|
||||
std::cout << "Started! Please speak\n";
|
||||
|
||||
while (!stop) {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
while (samples_queue.empty() && !stop) {
|
||||
condition_variable.wait(lock);
|
||||
}
|
||||
|
||||
const auto &s = samples_queue.front();
|
||||
buffer.insert(buffer.end(), s.begin(), s.end());
|
||||
|
||||
samples_queue.pop();
|
||||
}
|
||||
|
||||
for (; offset + window_size < buffer.size(); offset += window_size) {
|
||||
vad.AcceptWaveform(buffer.data() + offset, window_size);
|
||||
if (!speech_started && vad.IsDetected()) {
|
||||
speech_started = true;
|
||||
started_time = std::chrono::steady_clock::now();
|
||||
}
|
||||
}
|
||||
if (!speech_started) {
|
||||
if (buffer.size() > 10 * window_size) {
|
||||
offset -= buffer.size() - 10 * window_size;
|
||||
buffer = {buffer.end() - 10 * window_size, buffer.end()};
|
||||
}
|
||||
}
|
||||
|
||||
auto current_time = std::chrono::steady_clock::now();
|
||||
const float elapsed_seconds =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
|
||||
started_time)
|
||||
.count() /
|
||||
1000.;
|
||||
|
||||
if (speech_started && elapsed_seconds > 0.2) {
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(expected_sample_rate, buffer.data(), buffer.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
OfflineRecognizerResult result = recognizer.GetResult(&stream);
|
||||
display.UpdateText(result.text);
|
||||
display.Display();
|
||||
|
||||
started_time = std::chrono::steady_clock::now();
|
||||
}
|
||||
|
||||
while (!vad.IsEmpty()) {
|
||||
auto segment = vad.Front();
|
||||
|
||||
vad.Pop();
|
||||
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(expected_sample_rate, segment.samples.data(),
|
||||
segment.samples.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
OfflineRecognizerResult result = recognizer.GetResult(&stream);
|
||||
|
||||
display.UpdateText(result.text);
|
||||
display.FinalizeCurrentSentence();
|
||||
display.Display();
|
||||
|
||||
buffer.clear();
|
||||
offset = 0;
|
||||
speech_started = false;
|
||||
}
|
||||
}
|
||||
|
||||
record_thread.join();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,237 @@
|
||||
// cxx-api-examples/zipformer-ctc-simulate-streaming-microphone-cxx-api.cc
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
//
|
||||
// This file demonstrates how to use Zipformer CTC with sherpa-onnx's C++ API
|
||||
// for simulated streaming speech recognition from a microphone.
|
||||
//
|
||||
// clang-format off
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
//
|
||||
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
// tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
// rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
//
|
||||
// clang-format on
|
||||
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <chrono> // NOLINT
|
||||
#include <condition_variable> // NOLINT
|
||||
#include <iostream>
|
||||
#include <mutex> // NOLINT
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
#include "portaudio.h" // NOLINT
|
||||
#include "sherpa-display.h" // NOLINT
|
||||
#include "sherpa-onnx/c-api/cxx-api.h"
|
||||
#include "sherpa-onnx/csrc/microphone.h"
|
||||
|
||||
std::queue<std::vector<float>> samples_queue;
|
||||
std::condition_variable condition_variable;
|
||||
std::mutex mutex;
|
||||
bool stop = false;
|
||||
|
||||
// SIGINT handler. Sets the global `stop` flag, wakes one thread waiting on
// the global condition variable, and prints a notice to stderr.
//
// NOTE(review): `stop` is a plain bool and neither notify_one() nor stdio
// output is async-signal-safe; consider revisiting together with the globals.
static void Handler(int32_t /*sig*/) {
  stop = true;
  condition_variable.notify_one();
  fputs("\nCaught Ctrl + C. Exiting...\n", stderr);
}
|
||||
|
||||
static int32_t RecordCallback(const void *input_buffer,
|
||||
void * /*output_buffer*/,
|
||||
unsigned long frames_per_buffer, // NOLINT
|
||||
const PaStreamCallbackTimeInfo * /*time_info*/,
|
||||
PaStreamCallbackFlags /*status_flags*/,
|
||||
void * /*user_data*/) {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
samples_queue.emplace(
|
||||
reinterpret_cast<const float *>(input_buffer),
|
||||
reinterpret_cast<const float *>(input_buffer) + frames_per_buffer);
|
||||
condition_variable.notify_one();
|
||||
|
||||
return stop ? paComplete : paContinue;
|
||||
}
|
||||
|
||||
// Creates the voice activity detector used by this example.
//
// The configuration points at ./silero_vad.onnx and uses a sample rate of
// 16000. If the detector cannot be created, an error is written to stderr
// and the process exits with status -1.
static sherpa_onnx::cxx::VoiceActivityDetector CreateVad() {
  using sherpa_onnx::cxx::VadModelConfig;
  using sherpa_onnx::cxx::VoiceActivityDetector;

  VadModelConfig vad_config;
  vad_config.sample_rate = 16000;
  vad_config.debug = false;

  auto &silero = vad_config.silero_vad;
  silero.model = "./silero_vad.onnx";
  silero.threshold = 0.5;
  silero.min_silence_duration = 0.1;
  silero.min_speech_duration = 0.25;
  silero.max_speech_duration = 8;

  // NOTE(review): the second argument (20) is presumably the internal buffer
  // size in seconds -- confirm against the cxx-api header.
  VoiceActivityDetector detector = VoiceActivityDetector::Create(vad_config, 20);
  if (detector.Get()) {
    return detector;
  }

  std::cerr << "Failed to create VAD. Please check your config\n";
  exit(-1);
}
|
||||
|
||||
// Creates the non-streaming recognizer backed by the Zipformer CTC model
// located in ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.
//
// Progress messages go to stdout. If creation fails, an error is written to
// stderr and the process exits with status -1.
static sherpa_onnx::cxx::OfflineRecognizer CreateOfflineRecognizer() {
  using sherpa_onnx::cxx::OfflineRecognizer;
  using sherpa_onnx::cxx::OfflineRecognizerConfig;

  OfflineRecognizerConfig recognizer_config;

  auto &model = recognizer_config.model_config;
  model.zipformer_ctc.model =
      "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
  model.tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
  model.num_threads = 2;
  model.debug = false;

  std::cout << "Loading model\n";
  OfflineRecognizer recognizer = OfflineRecognizer::Create(recognizer_config);
  if (!recognizer.Get()) {
    std::cerr << "Please check your config\n";
    exit(-1);
  }
  std::cout << "Loading model done\n";

  return recognizer;
}
|
||||
|
||||
int32_t main() {
|
||||
signal(SIGINT, Handler);
|
||||
|
||||
using namespace sherpa_onnx::cxx; // NOLINT
|
||||
|
||||
auto vad = CreateVad();
|
||||
auto recognizer = CreateOfflineRecognizer();
|
||||
|
||||
sherpa_onnx::Microphone mic;
|
||||
|
||||
PaDeviceIndex num_devices = Pa_GetDeviceCount();
|
||||
if (num_devices == 0) {
|
||||
std::cerr << " If you are using Linux, please try "
|
||||
"./build/bin/zipformer-ctc-simulate-streaming-alsa-cxx-api\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t device_index = Pa_GetDefaultInputDevice();
|
||||
const char *pDeviceIndex = std::getenv("SHERPA_ONNX_MIC_DEVICE");
|
||||
if (pDeviceIndex) {
|
||||
fprintf(stderr, "Use specified device: %s\n", pDeviceIndex);
|
||||
device_index = atoi(pDeviceIndex);
|
||||
}
|
||||
mic.PrintDevices(device_index);
|
||||
|
||||
float mic_sample_rate = 16000;
|
||||
const char *sample_rate_str = std::getenv("SHERPA_ONNX_MIC_SAMPLE_RATE");
|
||||
if (sample_rate_str) {
|
||||
fprintf(stderr, "Use sample rate %f for mic\n", mic_sample_rate);
|
||||
mic_sample_rate = atof(sample_rate_str);
|
||||
}
|
||||
float sample_rate = 16000;
|
||||
LinearResampler resampler;
|
||||
if (mic_sample_rate != sample_rate) {
|
||||
float min_freq = std::min(mic_sample_rate, sample_rate);
|
||||
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
|
||||
|
||||
int32_t lowpass_filter_width = 6;
|
||||
resampler = LinearResampler::Create(mic_sample_rate, sample_rate,
|
||||
lowpass_cutoff, lowpass_filter_width);
|
||||
}
|
||||
if (mic.OpenDevice(device_index, mic_sample_rate, 1, RecordCallback,
|
||||
nullptr) == false) {
|
||||
std::cerr << "Failed to open microphone device\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t window_size = 512; // samples, please don't change
|
||||
|
||||
int32_t offset = 0;
|
||||
std::vector<float> buffer;
|
||||
bool speech_started = false;
|
||||
|
||||
auto started_time = std::chrono::steady_clock::now();
|
||||
|
||||
SherpaDisplay display;
|
||||
|
||||
std::cout << "Started! Please speak\n";
|
||||
|
||||
while (!stop) {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
while (samples_queue.empty() && !stop) {
|
||||
condition_variable.wait(lock);
|
||||
}
|
||||
|
||||
const auto &s = samples_queue.front();
|
||||
if (!resampler.Get()) {
|
||||
buffer.insert(buffer.end(), s.begin(), s.end());
|
||||
} else {
|
||||
auto resampled = resampler.Resample(s.data(), s.size(), false);
|
||||
buffer.insert(buffer.end(), resampled.begin(), resampled.end());
|
||||
}
|
||||
|
||||
samples_queue.pop();
|
||||
}
|
||||
|
||||
for (; offset + window_size < buffer.size(); offset += window_size) {
|
||||
vad.AcceptWaveform(buffer.data() + offset, window_size);
|
||||
if (!speech_started && vad.IsDetected()) {
|
||||
speech_started = true;
|
||||
started_time = std::chrono::steady_clock::now();
|
||||
}
|
||||
}
|
||||
if (!speech_started) {
|
||||
if (buffer.size() > 10 * window_size) {
|
||||
offset -= buffer.size() - 10 * window_size;
|
||||
buffer = {buffer.end() - 10 * window_size, buffer.end()};
|
||||
}
|
||||
}
|
||||
|
||||
auto current_time = std::chrono::steady_clock::now();
|
||||
const float elapsed_seconds =
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(current_time -
|
||||
started_time)
|
||||
.count() /
|
||||
1000.;
|
||||
|
||||
if (speech_started && elapsed_seconds > 0.2) {
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(sample_rate, buffer.data(), buffer.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
OfflineRecognizerResult result = recognizer.GetResult(&stream);
|
||||
display.UpdateText(result.text);
|
||||
display.Display();
|
||||
|
||||
started_time = std::chrono::steady_clock::now();
|
||||
}
|
||||
|
||||
while (!vad.IsEmpty()) {
|
||||
auto segment = vad.Front();
|
||||
|
||||
vad.Pop();
|
||||
|
||||
OfflineStream stream = recognizer.CreateStream();
|
||||
stream.AcceptWaveform(sample_rate, segment.samples.data(),
|
||||
segment.samples.size());
|
||||
|
||||
recognizer.Decode(&stream);
|
||||
|
||||
OfflineRecognizerResult result = recognizer.GetResult(&stream);
|
||||
|
||||
display.UpdateText(result.text);
|
||||
display.FinalizeCurrentSentence();
|
||||
display.Display();
|
||||
|
||||
buffer.clear();
|
||||
offset = 0;
|
||||
speech_started = false;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
52
dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart
Normal file
52
dart-api-examples/non-streaming-asr/bin/zipformer-ctc.dart
Normal file
@@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:args/args.dart';
|
||||
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
|
||||
|
||||
import './init.dart';
|
||||
|
||||
/// Decodes one wave file with a non-streaming Zipformer CTC model and prints
/// the recognized text.
///
/// Required options: `--model`, `--tokens` and `--input-wav`. The usage text
/// is printed and the process exits with code 1 if any of them is missing.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser();
  parser.addOption('model', help: 'Path to the Zipformer CTC model');
  parser.addOption('tokens', help: 'Path to tokens.txt');
  parser.addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  const requiredOptions = ['model', 'tokens', 'input-wav'];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  // Build the recognizer configuration from the inside out.
  final recognizer = sherpa_onnx.OfflineRecognizer(
    sherpa_onnx.OfflineRecognizerConfig(
      model: sherpa_onnx.OfflineModelConfig(
        zipformerCtc: sherpa_onnx.OfflineZipformerCtcModelConfig(model: model),
        tokens: tokens,
        debug: true,
        numThreads: 1,
      ),
    ),
  );

  final waveData = sherpa_onnx.readWave(inputWav);

  final stream = recognizer.createStream();
  stream.acceptWaveform(
      samples: waveData.samples, sampleRate: waveData.sampleRate);
  recognizer.decode(stream);

  print(recognizer.getResult(stream).text);

  // Native resources are not garbage collected; release them explicitly.
  stream.free();
  recognizer.free();
}
|
||||
18
dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh
Executable file
18
dart-api-examples/non-streaming-asr/run-zipformer-ctc.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
#
# Runs the Dart non-streaming Zipformer CTC example on one of the test waves
# shipped with the model, downloading the model first if it is not present.

set -ex

dart pub get

# tokens.txt is used as the marker that the model archive has been unpacked.
if [[ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

dart run ./bin/zipformer-ctc.dart \
  --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
  --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
  --input-wav ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav
|
||||
@@ -0,0 +1,118 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
import 'dart:io';
|
||||
import 'dart:typed_data';
|
||||
|
||||
import 'package:args/args.dart';
|
||||
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;
|
||||
|
||||
import './init.dart';
|
||||
|
||||
/// Splits a 16 kHz wave file into speech segments with Silero VAD and decodes
/// each segment with a non-streaming Zipformer CTC model.
///
/// Required options: `--silero-vad`, `--model`, `--tokens` and `--input-wav`.
/// The usage text is printed and the process exits with code 1 if any option
/// is missing; it also exits with code 1 if the wave is not sampled at
/// 16000 Hz. One line `start -- end : text` (times in seconds) is printed per
/// detected segment.
void main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('silero-vad', help: 'Path to silero_vad.onnx')
    ..addOption('model', help: 'Path to the Zipformer CTC model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('input-wav', help: 'Path to input.wav to transcribe');

  final res = parser.parse(arguments);
  if (res['silero-vad'] == null ||
      res['model'] == null ||
      res['tokens'] == null ||
      res['input-wav'] == null) {
    print(parser.usage);
    exit(1);
  }

  // create VAD
  final sileroVad = res['silero-vad'] as String;

  final sileroVadConfig = sherpa_onnx.SileroVadModelConfig(
    model: sileroVad,
    minSilenceDuration: 0.25,
    minSpeechDuration: 0.5,
    maxSpeechDuration: 5.0,
  );

  final vadConfig = sherpa_onnx.VadModelConfig(
    sileroVad: sileroVadConfig,
    numThreads: 1,
    debug: true,
  );

  final vad = sherpa_onnx.VoiceActivityDetector(
      config: vadConfig, bufferSizeInSeconds: 10);

  // create offline recognizer
  final model = res['model'] as String;
  final tokens = res['tokens'] as String;
  final inputWav = res['input-wav'] as String;

  final zipformerCtc = sherpa_onnx.OfflineZipformerCtcModelConfig(model: model);

  final modelConfig = sherpa_onnx.OfflineModelConfig(
    zipformerCtc: zipformerCtc,
    tokens: tokens,
    debug: true,
    numThreads: 1,
  );
  final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig);
  final recognizer = sherpa_onnx.OfflineRecognizer(config);

  final waveData = sherpa_onnx.readWave(inputWav);
  if (waveData.sampleRate != 16000) {
    print('Only 16000 Hz is supported. Given: ${waveData.sampleRate}');
    exit(1);
  }

  // Decodes and prints every segment currently queued in the VAD, removing
  // each one from the queue once it has been handled.
  void decodeQueuedSegments() {
    while (!vad.isEmpty()) {
      // Fetch the segment once; it carries both the samples and the start.
      final segment = vad.front();
      final samples = segment.samples;
      final startTime = segment.start.toDouble() / waveData.sampleRate;
      final endTime =
          startTime + samples.length.toDouble() / waveData.sampleRate;

      final stream = recognizer.createStream();
      stream.acceptWaveform(samples: samples, sampleRate: waveData.sampleRate);
      recognizer.decode(stream);

      final result = recognizer.getResult(stream);
      stream.free();
      print(
          '${startTime.toStringAsPrecision(5)} -- ${endTime.toStringAsPrecision(5)} : ${result.text}');

      vad.pop();
    }
  }

  final windowSize = vadConfig.sileroVad.windowSize;
  final numSamples = waveData.samples.length;
  // Only whole windows are fed to the VAD.
  final numIter = numSamples ~/ windowSize;

  for (int i = 0; i != numIter; ++i) {
    final start = i * windowSize;
    vad.acceptWaveform(Float32List.sublistView(
        waveData.samples, start, start + windowSize));

    decodeQueuedSegments();
  }

  // Flush the VAD, then decode whatever it has queued.
  vad.flush();
  decodeQueuedSegments();

  vad.free();
  recognizer.free();
}
|
||||
27
dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh
Executable file
27
dart-api-examples/vad-with-non-streaming-asr/run-zipformer-ctc.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
#
# Runs the Dart "VAD + non-streaming Zipformer CTC" example on
# lei-jun-test.wav, downloading the ASR model, the test wave and the Silero
# VAD model first if they are not present.

set -ex

dart pub get

# tokens.txt is used as the marker that the model archive has been unpacked.
if [[ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

if [[ ! -f ./lei-jun-test.wav ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

dart run ./bin/zipformer-ctc.dart \
  --silero-vad ./silero_vad.onnx \
  --model ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
  --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
  --input-wav ./lei-jun-test.wav
|
||||
@@ -75,6 +75,9 @@ class OfflineDecodeFiles
|
||||
[Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
|
||||
public string NeMoCtc { get; set; } = string.Empty;
|
||||
|
||||
[Option("zipformer-ctc", Required = false, HelpText = "Path to model.onnx. Used only for Zipformer CTC models")]
|
||||
public string ZipformerCtc { get; set; } = string.Empty;
|
||||
|
||||
[Option("dolphin-model", Required = false, Default = "", HelpText = "Path to dolphin ctc model")]
|
||||
public string DolphinModel { get; set; } = string.Empty;
|
||||
|
||||
@@ -240,6 +243,10 @@ to download pre-trained Tdnn models.
|
||||
{
|
||||
config.ModelConfig.Dolphin.Model = options.DolphinModel;
|
||||
}
|
||||
else if (!string.IsNullOrEmpty(options.ZipformerCtc))
|
||||
{
|
||||
config.ModelConfig.ZipformerCtc.Model = options.ZipformerCtc;
|
||||
}
|
||||
else if (!string.IsNullOrEmpty(options.TeleSpeechCtc))
|
||||
{
|
||||
config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;
|
||||
|
||||
18
dotnet-examples/offline-decode-files/run-zipformer-ctc.sh
Executable file
18
dotnet-examples/offline-decode-files/run-zipformer-ctc.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
#
# Decodes the three test waves shipped with the Zipformer CTC model using the
# .NET offline-decode-files example, downloading the model first if needed.

set -ex

# tokens.txt is used as the marker that the model archive has been unpacked.
if [[ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

dotnet run \
  --tokens=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
  --zipformer-ctc=./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
  --num-threads=1 \
  --files \
  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav \
  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/1.wav \
  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/8k.wav
|
||||
@@ -104,6 +104,27 @@ class OfflineDolphinModelConfig {
|
||||
final String model;
|
||||
}
|
||||
|
||||
/// Configuration of a non-streaming Zipformer CTC model.
///
/// [model] is the path to the model file; it defaults to the empty string.
class OfflineZipformerCtcModelConfig {
  const OfflineZipformerCtcModelConfig({this.model = ''});

  /// Creates a config from [json]; a missing or null `model` entry becomes ''.
  factory OfflineZipformerCtcModelConfig.fromJson(Map<String, dynamic> json) {
    final modelPath = json['model'] as String?;
    return OfflineZipformerCtcModelConfig(model: modelPath ?? '');
  }

  @override
  String toString() => 'OfflineZipformerCtcModelConfig(model: $model)';

  /// Returns a JSON-compatible map with the single key `model`.
  Map<String, dynamic> toJson() {
    return <String, dynamic>{'model': model};
  }

  final String model;
}
|
||||
|
||||
class OfflineWhisperModelConfig {
|
||||
const OfflineWhisperModelConfig(
|
||||
{this.encoder = '',
|
||||
@@ -288,6 +309,7 @@ class OfflineModelConfig {
|
||||
this.moonshine = const OfflineMoonshineModelConfig(),
|
||||
this.fireRedAsr = const OfflineFireRedAsrModelConfig(),
|
||||
this.dolphin = const OfflineDolphinModelConfig(),
|
||||
this.zipformerCtc = const OfflineZipformerCtcModelConfig(),
|
||||
required this.tokens,
|
||||
this.numThreads = 1,
|
||||
this.debug = true,
|
||||
@@ -336,6 +358,10 @@ class OfflineModelConfig {
|
||||
? OfflineDolphinModelConfig.fromJson(
|
||||
json['dolphin'] as Map<String, dynamic>)
|
||||
: const OfflineDolphinModelConfig(),
|
||||
zipformerCtc: json['zipformerCtc'] != null
|
||||
? OfflineZipformerCtcModelConfig.fromJson(
|
||||
json['zipformerCtc'] as Map<String, dynamic>)
|
||||
: const OfflineZipformerCtcModelConfig(),
|
||||
tokens: json['tokens'] as String,
|
||||
numThreads: json['numThreads'] as int? ?? 1,
|
||||
debug: json['debug'] as bool? ?? true,
|
||||
@@ -349,7 +375,7 @@ class OfflineModelConfig {
|
||||
|
||||
@override
|
||||
String toString() {
|
||||
return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
|
||||
return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, senseVoice: $senseVoice, moonshine: $moonshine, fireRedAsr: $fireRedAsr, dolphin: $dolphin, zipformerCtc: $zipformerCtc, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)';
|
||||
}
|
||||
|
||||
Map<String, dynamic> toJson() => {
|
||||
@@ -362,6 +388,7 @@ class OfflineModelConfig {
|
||||
'moonshine': moonshine.toJson(),
|
||||
'fireRedAsr': fireRedAsr.toJson(),
|
||||
'dolphin': dolphin.toJson(),
|
||||
'zipformerCtc': zipformerCtc.toJson(),
|
||||
'tokens': tokens,
|
||||
'numThreads': numThreads,
|
||||
'debug': debug,
|
||||
@@ -381,6 +408,7 @@ class OfflineModelConfig {
|
||||
final OfflineMoonshineModelConfig moonshine;
|
||||
final OfflineFireRedAsrModelConfig fireRedAsr;
|
||||
final OfflineDolphinModelConfig dolphin;
|
||||
final OfflineZipformerCtcModelConfig zipformerCtc;
|
||||
|
||||
final String tokens;
|
||||
final int numThreads;
|
||||
@@ -578,6 +606,8 @@ class OfflineRecognizer {
|
||||
config.model.fireRedAsr.decoder.toNativeUtf8();
|
||||
|
||||
c.ref.model.dolphin.model = config.model.dolphin.model.toNativeUtf8();
|
||||
c.ref.model.zipformerCtc.model =
|
||||
config.model.zipformerCtc.model.toNativeUtf8();
|
||||
|
||||
c.ref.model.tokens = config.model.tokens.toNativeUtf8();
|
||||
|
||||
@@ -623,6 +653,7 @@ class OfflineRecognizer {
|
||||
calloc.free(c.ref.model.modelType);
|
||||
calloc.free(c.ref.model.provider);
|
||||
calloc.free(c.ref.model.tokens);
|
||||
calloc.free(c.ref.model.zipformerCtc.model);
|
||||
calloc.free(c.ref.model.dolphin.model);
|
||||
calloc.free(c.ref.model.fireRedAsr.decoder);
|
||||
calloc.free(c.ref.model.fireRedAsr.encoder);
|
||||
|
||||
@@ -266,6 +266,10 @@ final class SherpaOnnxOfflineDolphinModelConfig extends Struct {
|
||||
external Pointer<Utf8> model;
|
||||
}
|
||||
|
||||
/// FFI struct for the Zipformer CTC model configuration.
///
/// Holds a single native UTF-8 string pointer, [model].
// NOTE(review): presumably mirrors the C struct of the same name -- confirm
// the field layout against the C API header.
final class SherpaOnnxOfflineZipformerCtcModelConfig extends Struct {
|
||||
external Pointer<Utf8> model;
|
||||
}
|
||||
|
||||
final class SherpaOnnxOfflineWhisperModelConfig extends Struct {
|
||||
external Pointer<Utf8> encoder;
|
||||
external Pointer<Utf8> decoder;
|
||||
@@ -333,6 +337,7 @@ final class SherpaOnnxOfflineModelConfig extends Struct {
|
||||
external SherpaOnnxOfflineMoonshineModelConfig moonshine;
|
||||
external SherpaOnnxOfflineFireRedAsrModelConfig fireRedAsr;
|
||||
external SherpaOnnxOfflineDolphinModelConfig dolphin;
|
||||
external SherpaOnnxOfflineZipformerCtcModelConfig zipformerCtc;
|
||||
}
|
||||
|
||||
final class SherpaOnnxOfflineRecognizerConfig extends Struct {
|
||||
|
||||
@@ -28,6 +28,8 @@ func main() {
|
||||
|
||||
flag.StringVar(&config.ModelConfig.NemoCTC.Model, "nemo-ctc", "", "Path to the NeMo CTC model")
|
||||
|
||||
flag.StringVar(&config.ModelConfig.ZipformerCtc.Model, "zipformer-ctc", "", "Path to the Zipformer CTC model")
|
||||
|
||||
flag.StringVar(&config.ModelConfig.Dolphin.Model, "dolphin-model", "", "Path to the Dolphin CTC model")
|
||||
|
||||
flag.StringVar(&config.ModelConfig.FireRedAsr.Encoder, "fire-red-asr-encoder", "", "Path to the FireRedAsr encoder model")
|
||||
|
||||
19
go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh
Executable file
19
go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
#
# Builds the Go non-streaming-decode-files example and decodes one test wave
# with the Zipformer CTC model, downloading the model first if needed.

set -ex

# tokens.txt is used as the marker that the model archive has been unpacked.
if [[ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

go mod tidy
go build

./non-streaming-decode-files \
  --zipformer-ctc ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx \
  --tokens ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt \
  --debug 0 \
  ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav
|
||||
@@ -15,6 +15,7 @@ export { Samples,
|
||||
OfflineTdnnModelConfig,
|
||||
OfflineSenseVoiceModelConfig,
|
||||
OfflineMoonshineModelConfig,
|
||||
OfflineZipformerCtcModelConfig,
|
||||
OfflineModelConfig,
|
||||
OfflineLMConfig,
|
||||
OfflineRecognizerConfig,
|
||||
|
||||
@@ -45,7 +45,23 @@ static SherpaOnnxOfflineParaformerModelConfig GetOfflineParaformerModelConfig(
|
||||
return c;
|
||||
}
|
||||
|
||||
static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinfig(
|
||||
// Reads the optional "zipformerCtc" sub-object of `obj` into the C config
// struct. The returned struct is all zeros when the property is absent or is
// not an object.
static SherpaOnnxOfflineZipformerCtcModelConfig
GetOfflineZipformerCtcModelConfig(Napi::Object obj) {
  SherpaOnnxOfflineZipformerCtcModelConfig c;
  memset(&c, 0, sizeof(c));

  const bool has_zipformer_ctc =
      obj.Has("zipformerCtc") && obj.Get("zipformerCtc").IsObject();

  if (has_zipformer_ctc) {
    // NOTE(review): the macro below appears to rely on locals named `o`
    // (source object) and `c` (destination struct) -- do not rename them
    // without checking the macro definition.
    Napi::Object o = obj.Get("zipformerCtc").As<Napi::Object>();

    SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
  }

  return c;
}
|
||||
|
||||
static SherpaOnnxOfflineDolphinModelConfig GetOfflineDolphinModelConfig(
|
||||
Napi::Object obj) {
|
||||
SherpaOnnxOfflineDolphinModelConfig c;
|
||||
memset(&c, 0, sizeof(c));
|
||||
@@ -185,7 +201,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
|
||||
c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
|
||||
c.moonshine = GetOfflineMoonshineModelConfig(o);
|
||||
c.fire_red_asr = GetOfflineFireRedAsrModelConfig(o);
|
||||
c.dolphin = GetOfflineDolphinfig(o);
|
||||
c.dolphin = GetOfflineDolphinModelConfig(o);
|
||||
c.zipformer_ctc = GetOfflineZipformerCtcModelConfig(o);
|
||||
|
||||
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
|
||||
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
|
||||
@@ -312,6 +329,7 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.fire_red_asr.decoder);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.dolphin.model);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.zipformer_ctc.model);
|
||||
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.tokens);
|
||||
SHERPA_ONNX_DELETE_C_STR(c.model_config.provider);
|
||||
|
||||
@@ -55,6 +55,10 @@ export class OfflineDolphinModelConfig {
|
||||
public model: string = '';
|
||||
}
|
||||
|
||||
/**
 * Configuration of a non-streaming Zipformer CTC model.
 */
export class OfflineZipformerCtcModelConfig {
|
||||
// Defaults to the empty string.
// NOTE(review): presumably the path to the model file -- confirm.
public model: string = '';
|
||||
}
|
||||
|
||||
export class OfflineWhisperModelConfig {
|
||||
public encoder: string = '';
|
||||
public decoder: string = '';
|
||||
@@ -97,6 +101,7 @@ export class OfflineModelConfig {
|
||||
public senseVoice: OfflineSenseVoiceModelConfig = new OfflineSenseVoiceModelConfig();
|
||||
public moonshine: OfflineMoonshineModelConfig = new OfflineMoonshineModelConfig();
|
||||
public dolphin: OfflineDolphinModelConfig = new OfflineDolphinModelConfig();
|
||||
public zipformerCtc: OfflineZipformerCtcModelConfig = new OfflineZipformerCtcModelConfig();
|
||||
}
|
||||
|
||||
export class OfflineLMConfig {
|
||||
|
||||
50
java-api-examples/NonStreamingDecodeFileZipformerCtc.java
Normal file
50
java-api-examples/NonStreamingDecodeFileZipformerCtc.java
Normal file
@@ -0,0 +1,50 @@
|
||||
// Copyright 2025 Xiaomi Corporation
|
||||
|
||||
// This file shows how to use an offline Zipformer CTC model,
|
||||
// i.e., non-streaming Zipformer CTC model,
|
||||
// to decode files.
|
||||
import com.k2fsa.sherpa.onnx.*;
|
||||
|
||||
public class NonStreamingDecodeFileZipformerCtc {
|
||||
public static void main(String[] args) {
|
||||
// please refer to
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
// to download model files
|
||||
String model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx";
|
||||
String tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt";
|
||||
|
||||
String waveFilename = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav";
|
||||
|
||||
WaveReader reader = new WaveReader(waveFilename);
|
||||
|
||||
OfflineZipformerCtcModelConfig zipformerCtc =
|
||||
OfflineZipformerCtcModelConfig.builder().setModel(model).build();
|
||||
|
||||
OfflineModelConfig modelConfig =
|
||||
OfflineModelConfig.builder()
|
||||
.setZipformerCtc(zipformerCtc)
|
||||
.setTokens(tokens)
|
||||
.setNumThreads(1)
|
||||
.setDebug(true)
|
||||
.build();
|
||||
|
||||
OfflineRecognizerConfig config =
|
||||
OfflineRecognizerConfig.builder()
|
||||
.setOfflineModelConfig(modelConfig)
|
||||
.setDecodingMethod("greedy_search")
|
||||
.build();
|
||||
|
||||
OfflineRecognizer recognizer = new OfflineRecognizer(config);
|
||||
OfflineStream stream = recognizer.createStream();
|
||||
stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
|
||||
|
||||
recognizer.decode(stream);
|
||||
|
||||
String text = recognizer.getResult(stream).getText();
|
||||
|
||||
System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);
|
||||
|
||||
stream.release();
|
||||
recognizer.release();
|
||||
}
|
||||
}
|
||||
38
java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh
Executable file
38
java-api-examples/run-non-streaming-decode-file-zipformer-ctc.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
#
# Runs the Java non-streaming Zipformer CTC example.
#
# Steps (each one is skipped when its output already exists):
#   1. build the JNI shared library in ../build
#   2. build sherpa-onnx.jar in ../sherpa-onnx/java-api
#   3. download and unpack the Zipformer CTC model
# and finally launch NonStreamingDecodeFileZipformerCtc.java.

set -ex

# The JNI library is a .dylib or a .so depending on the platform.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# tokens.txt is used as the marker that the model archive has been unpacked.
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2

  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

# $PWD is quoted so that a checkout path containing whitespace is passed to
# java as a single argument instead of being split into several words.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingDecodeFileZipformerCtc.java
|
||||
@@ -253,6 +253,13 @@ function testOfflineAsr() {
|
||||
rm sherpa-onnx-zipformer-multi-zh-hans-2023-9-2.tar.bz2
|
||||
fi
|
||||
|
||||
if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
fi
|
||||
|
||||
out_filename=test_offline_asr.jar
|
||||
kotlinc-jvm -include-runtime -d $out_filename \
|
||||
test_offline_asr.kt \
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
fun main() {
|
||||
val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25)
|
||||
val types = arrayOf(0, 2, 5, 6, 15, 21, 24, 25, 31)
|
||||
for (type in types) {
|
||||
test(type)
|
||||
}
|
||||
@@ -19,6 +19,7 @@ fun test(type: Int) {
|
||||
21 -> "./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav"
|
||||
24 -> "./sherpa-onnx-fire-red-asr-large-zh_en-2025-02-16/test_wavs/0.wav"
|
||||
25 -> "./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav"
|
||||
31 -> "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"
|
||||
else -> null
|
||||
}
|
||||
|
||||
|
||||
@@ -123,6 +123,7 @@ The following tables list the examples in this folder.
|
||||
|[./test_asr_non_streaming_moonshine.js](./test_asr_non_streaming_moonshine.js)|Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine)|
|
||||
|[./test_vad_with_non_streaming_asr_moonshine.js](./test_vad_with_non_streaming_asr_moonshine.js)| Non-streaming speech recognition from a file using [Moonshine](https://github.com/usefulsensors/moonshine) + [Silero VAD](https://github.com/snakers4/silero-vad)|
|
||||
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_zipformer_ctc.js](./test_asr_non_streaming_zipformer_ctc.js)|Non-streaming speech recognition from a file using a Zipformer CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_nemo_parakeet_tdt_v2.js](./test_asr_non_streaming_nemo_parakeet_tdt_v2.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) [parakeet-tdt-0.6b-v2](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/nemo-transducer-models.html#sherpa-onnx-nemo-parakeet-tdt-0-6b-v2-int8-english) model with greedy search|
|
||||
|[./test_asr_non_streaming_dolphin_ctc.js](./test_asr_non_streaming_dolphin_ctc.js)|Non-streaming speech recognition from a file using a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model with greedy search|
|
||||
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|
||||
@@ -137,6 +138,7 @@ The following tables list the examples in this folder.
|
||||
|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
|
||||
|[./test_vad_asr_non_streaming_moonshine_microphone.js](./test_vad_asr_non_streaming_moonshine_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Moonshine](https://github.com/usefulsensors/moonshine)|
|
||||
|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
|
||||
|[./test_vad_asr_non_streaming_zipformer_ctc_microphone.js](./test_vad_asr_non_streaming_zipformer_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a Zipformer CTC model with greedy search|
|
||||
|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
|
||||
|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
|
||||
|
||||
@@ -372,6 +374,21 @@ rm sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2
|
||||
node ./test_asr_non_streaming_nemo_parakeet_tdt_v2.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with Zipformer CTC models
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
node ./test_asr_non_streaming_zipformer_ctc.js
|
||||
|
||||
# To run VAD + non-streaming ASR with Zipformer CTC using a microphone
|
||||
npm install naudiodon2
|
||||
node ./test_vad_asr_non_streaming_zipformer_ctc_microphone.js
|
||||
```
|
||||
|
||||
### Non-streaming speech recognition with NeMo CTC models
|
||||
|
||||
```bash
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
const sherpa_onnx = require('sherpa-onnx-node');
|
||||
|
||||
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
//
// Configuration for the offline recognizer: feature extraction settings plus
// the Zipformer CTC model and token table to load.
const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'zipformerCtc': {
      'model': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
    },
    'tokens': './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename =
    './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started')

// Time everything from stream creation to fetching the result.
let start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

recognizer.decode(stream);
// Declared with `const`: assigning to an undeclared name would create an
// implicit global and throws a ReferenceError in strict mode / ES modules.
const result = recognizer.getResult(stream)
let stop = Date.now();
console.log('Done')

// Real-time factor = processing time / audio duration.
const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
console.log('Wave duration', duration.toFixed(3), 'seconds')
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds')
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3))
console.log(waveFilename)
console.log('result\n', result)
|
||||
@@ -0,0 +1,109 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const portAudio = require('naudiodon2');
|
||||
// console.log(portAudio.getDevices());
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx-node');
|
||||
|
||||
// Creates the offline recognizer backed by the int8 Zipformer CTC model.
//
// Returns a new sherpa_onnx.OfflineRecognizer.
function createRecognizer() {
  // Please download test files from
  // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
  const featConfig = {
    sampleRate: 16000,
    featureDim: 80,
  };

  const modelConfig = {
    zipformerCtc: {
      model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
    },
    tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
    numThreads: 2,
    provider: 'cpu',
    debug: 1,
  };

  return new sherpa_onnx.OfflineRecognizer({featConfig, modelConfig});
}
|
||||
|
||||
// Creates the silero-based voice activity detector.
//
// Returns a new sherpa_onnx.Vad constructed with a buffer size of 60 seconds.
function createVad() {
  // please download silero_vad.onnx from
  // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
  const sileroVad = {
    model: './silero_vad.onnx',
    threshold: 0.5,
    minSpeechDuration: 0.25,
    minSilenceDuration: 0.5,
    windowSize: 512,
  };

  const vadConfig = {
    sileroVad,
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  return new sherpa_onnx.Vad(vadConfig, 60);
}
|
||||
|
||||
const recognizer = createRecognizer();
const vad = createVad();

// Circular buffer that holds microphone samples until at least one full VAD
// window is available. Capacity is expressed in samples.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

// Microphone input stream: mono float32 samples at the VAD's sample rate.
const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// Counts the non-empty recognition results; used to label the console output
// and the saved wave files. (The original also declared an unused `printed`
// flag, which has been removed.)
let index = 0;

ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));

  // Feed the VAD one window at a time.
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Decode every speech segment the VAD has completed so far.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();

    const stream = recognizer.createStream();
    stream.acceptWaveform({
      samples: segment.samples,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    recognizer.decode(stream);

    const r = recognizer.getResult(stream);
    if (r.text.length > 0) {
      const text = r.text.toLowerCase().trim();
      console.log(`${index}: ${text}`);

      // Save the segment as "<index>-<text>-<HH:MM:SS>.wav".
      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
              .split(' ')[0]}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});

      index += 1;
    }
  }
});

ai.start();
console.log('Started! Please speak')
|
||||
@@ -154,6 +154,23 @@ rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
|
||||
node ./test-offline-dolphin-ctc.js
|
||||
```
|
||||
|
||||
## ./test-offline-zipformer-ctc.js
|
||||
|
||||
[./test-offline-zipformer-ctc.js](./test-offline-zipformer-ctc.js) demonstrates
|
||||
how to decode a file with a Zipformer CTC model. In the code we use
|
||||
[sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese).
|
||||
|
||||
You can use the following command to run it:
|
||||
|
||||
```bash
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
|
||||
|
||||
node ./test-offline-zipformer-ctc.js
|
||||
```
|
||||
|
||||
## ./test-offline-nemo-ctc.js
|
||||
|
||||
[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
|
||||
|
||||
35
nodejs-examples/test-offline-zipformer-ctc.js
Normal file
35
nodejs-examples/test-offline-zipformer-ctc.js
Normal file
@@ -0,0 +1,35 @@
|
||||
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
//
|
||||
const fs = require('fs');
|
||||
const {Readable} = require('stream');
|
||||
const wav = require('wav');
|
||||
|
||||
const sherpa_onnx = require('sherpa-onnx');
|
||||
|
||||
// Creates an offline recognizer that uses the int8 Zipformer CTC model.
//
// Returns whatever sherpa_onnx.createOfflineRecognizer() returns for the
// configuration below.
function createOfflineRecognizer() {
  const zipformerCtc = {
    model: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx',
  };

  const modelConfig = {
    zipformerCtc,
    tokens: './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt',
  };

  return sherpa_onnx.createOfflineRecognizer({modelConfig});
}
|
||||
|
||||
// Decode one test wave file and print the recognized text.
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();

const waveFilename =
    './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform(wave.sampleRate, wave.samples);

recognizer.decode(stream);
console.log(recognizer.getResult(stream).text);

// Release the stream first, then the recognizer that created it.
stream.free();
recognizer.free();
|
||||
@@ -9,3 +9,4 @@ sense_voice
|
||||
telespeech_ctc
|
||||
moonshine
|
||||
dolphin_ctc
|
||||
zipformer_ctc
|
||||
|
||||
43
pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh
Executable file
43
pascal-api-examples/non-streaming-asr/run-zipformer-ctc.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads the Zipformer CTC test model if it is missing, and then compiles
# and runs ./zipformer_ctc.pas.
#
# Run this script from the directory that contains it: the Pascal source and
# the downloaded model are addressed relative to the current directory.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a checkout path containing spaces still works.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Use the same absolute locations for the existence check, the build, and the
# link step, so they cannot disagree about where the library lives.
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  ls -lh lib
  popd
fi

if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2

  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./zipformer_ctc.pas

# Only append the previous value when it is non-empty; a trailing ':' would
# otherwise add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./zipformer_ctc
|
||||
76
pascal-api-examples/non-streaming-asr/zipformer_ctc.pas
Normal file
76
pascal-api-examples/non-streaming-asr/zipformer_ctc.pas
Normal file
@@ -0,0 +1,76 @@
|
||||
{ Copyright (c) 2025 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming Zipformer CTC model
|
||||
to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program zipformer_ctc;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
DateUtils,
|
||||
SysUtils;
|
||||
|
||||
var
  { Input audio }
  WaveFilename: AnsiString;
  Wave: TSherpaOnnxWave;

  { Recognizer objects }
  Config: TSherpaOnnxOfflineRecognizerConfig;
  Recognizer: TSherpaOnnxOfflineRecognizer;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;

  { Timing }
  Start, Stop: TDateTime;
  Elapsed, Duration, RealTimeFactor: Single;
begin
  Initialize(Config);

  { Select the Zipformer CTC model and its token table. }
  with Config.ModelConfig do
  begin
    ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
    Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
    Provider := 'cpu';
    NumThreads := 1;
    Debug := False;
  end;

  WaveFilename := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav';
  Wave := SherpaOnnxReadWave(WaveFilename);

  Recognizer := TSherpaOnnxOfflineRecognizer.Create(Config);
  Stream := Recognizer.CreateStream();

  { Only feeding the samples, decoding, and fetching the result are timed. }
  Start := Now;
  Stream.AcceptWaveform(Wave.Samples, Wave.SampleRate);
  Recognizer.Decode(Stream);
  RecognitionResult := Recognizer.GetResult(Stream);
  Stop := Now;

  { Real-time factor = processing time / audio duration. }
  Elapsed := MilliSecondsBetween(Stop, Start) / 1000;
  Duration := Length(Wave.Samples) / Wave.SampleRate;
  RealTimeFactor := Elapsed / Duration;

  WriteLn(RecognitionResult.ToString);
  WriteLn(Format('NumThreads %d', [Config.ModelConfig.NumThreads]));
  WriteLn(Format('Elapsed %.3f s', [Elapsed]));
  WriteLn(Format('Wave duration %.3f s', [Duration]));
  WriteLn(Format('RTF = %.3f/%.3f = %.3f', [Elapsed, Duration, RealTimeFactor]));

  {Free resources to avoid memory leak.

  Note: You don't need to invoke them for this simple script.
  However, you have to invoke them in your own large/complex project.
  }
  FreeAndNil(Stream);
  FreeAndNil(Recognizer);
end.
|
||||
@@ -2,3 +2,5 @@
|
||||
vad_with_whisper
|
||||
vad_with_sense_voice
|
||||
vad_with_moonshine
|
||||
vad_with_zipformer_ctc
|
||||
vad_with_dolphin
|
||||
|
||||
50
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh
Executable file
50
pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-zipformer-ctc.sh
Executable file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env bash
#
# Builds the sherpa-onnx C API shared library if it is not installed yet,
# downloads silero_vad.onnx, the test wave, and the Zipformer CTC model if
# they are missing, and then compiles and runs ./vad_with_zipformer_ctc.pas.
#
# Run this script from the directory that contains it: the Pascal source and
# the downloaded files are addressed relative to the current directory.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Quoted so that a checkout path containing spaces still works.
SHERPA_ONNX_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# Use the same absolute locations for the existence check, the build, and the
# link step, so they cannot disagree about where the library lives.
BUILD_DIR="$SHERPA_ONNX_DIR/build"
LIB_DIR="$BUILD_DIR/install/lib"

if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$BUILD_DIR"
  pushd "$BUILD_DIR"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

if [[ ! -f ./silero_vad.onnx ]]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./lei-jun-test.wav ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
fi

if [ ! -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2

  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  -Fu"$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  -Fl"$LIB_DIR" \
  ./vad_with_zipformer_ctc.pas

# Only append the previous value when it is non-empty; a trailing ':' would
# otherwise add the current directory to the library search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

./vad_with_zipformer_ctc
|
||||
@@ -0,0 +1,135 @@
|
||||
{ Copyright (c) 2025 Xiaomi Corporation }
|
||||
|
||||
{
|
||||
This file shows how to use a non-streaming Zipformer CTC model
|
||||
with silero VAD to decode files.
|
||||
|
||||
You can download the model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
}
|
||||
|
||||
program vad_with_zipformer_ctc;
|
||||
|
||||
{$mode objfpc}
|
||||
|
||||
uses
|
||||
sherpa_onnx,
|
||||
SysUtils;
|
||||
|
||||
{ Creates a silero-VAD based voice activity detector.

  The detector is constructed with the configuration below and a second
  argument of 30. }
function CreateVad(): TSherpaOnnxVoiceActivityDetector;
const
  VadSampleRate = 16000; {Please don't change it unless you know the details}
  VadWindowSize = 512;   {Please don't change it unless you know the details}
var
  Config: TSherpaOnnxVadModelConfig;
begin
  Initialize(Config);

  { Settings specific to the silero VAD model }
  Config.SileroVad.Model := './silero_vad.onnx';
  Config.SileroVad.Threshold := 0.5;
  Config.SileroVad.MinSpeechDuration := 0.5;
  Config.SileroVad.MinSilenceDuration := 0.5;
  Config.SileroVad.WindowSize := VadWindowSize;

  { General settings }
  Config.SampleRate := VadSampleRate;
  Config.NumThreads := 1;
  Config.Provider := 'cpu';
  Config.Debug := True;

  Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30);
end;
|
||||
|
||||
{ Creates an offline recognizer that uses the int8 Zipformer CTC model
  on the CPU with a single thread. }
function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer;
var
  Config: TSherpaOnnxOfflineRecognizerConfig;
begin
  Initialize(Config);

  with Config.ModelConfig do
  begin
    ZipformerCtc.Model := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx';
    Tokens := './sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt';
    Provider := 'cpu';
    NumThreads := 1;
    Debug := False;
  end;

  Result := TSherpaOnnxOfflineRecognizer.Create(Config);
end;
|
||||
|
||||
{ Pops every speech segment currently queued in Vad, decodes it with
  Recognizer, and prints one line per segment in the form
  "<start> -- <end> <text>", with times in seconds.

  SampleRate is the sample rate of the audio that was fed to Vad. It is passed
  to the recognizer stream and used to convert sample counts to seconds. }
procedure DecodePendingSegments(Vad: TSherpaOnnxVoiceActivityDetector;
  Recognizer: TSherpaOnnxOfflineRecognizer; SampleRate: Integer);
var
  SpeechSegment: TSherpaOnnxSpeechSegment;
  Stream: TSherpaOnnxOfflineStream;
  RecognitionResult: TSherpaOnnxOfflineRecognizerResult;
  Start: Single;
  Duration: Single;
begin
  while not Vad.IsEmpty do
  begin
    SpeechSegment := Vad.Front();
    Vad.Pop();

    Stream := Recognizer.CreateStream();
    Stream.AcceptWaveform(SpeechSegment.Samples, SampleRate);
    Recognizer.Decode(Stream);
    RecognitionResult := Recognizer.GetResult(Stream);

    Start := SpeechSegment.Start / SampleRate;
    Duration := Length(SpeechSegment.Samples) / SampleRate;
    WriteLn(Format('%.3f -- %.3f %s',
      [Start, Start + Duration, RecognitionResult.Text]));

    FreeAndNil(Stream);
  end;
end;

var
  Wave: TSherpaOnnxWave;

  Recognizer: TSherpaOnnxOfflineRecognizer;
  Vad: TSherpaOnnxVoiceActivityDetector;

  Offset: Integer;
  WindowSize: Integer;
begin
  Vad := CreateVad();
  Recognizer := CreateOfflineRecognizer();

  Wave := SherpaOnnxReadWave('./lei-jun-test.wav');
  if Wave.SampleRate <> Vad.Config.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d. Given: %d',
        [Vad.Config.SampleRate, Wave.SampleRate]));

      Exit;
    end;

  { Feed the wave to the VAD one window at a time and decode segments as soon
    as the VAD reports them. Trailing samples that do not fill a whole window
    are not fed. }
  WindowSize := Vad.Config.SileroVad.WindowSize;
  Offset := 0;
  while Offset + WindowSize <= Length(Wave.Samples) do
    begin
      Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize);
      Offset += WindowSize;

      DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);
    end;

  { Flush the VAD, then decode whatever segments are queued afterwards. }
  Vad.Flush;
  DecodePendingSegments(Vad, Recognizer, Wave.SampleRate);

  FreeAndNil(Recognizer);
  FreeAndNil(Vad);
end.
|
||||
56
python-api-examples/offline-zipformer-ctc-decode-files.py
Executable file
56
python-api-examples/offline-zipformer-ctc-decode-files.py
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
This file shows how to use a non-streaming zipformer CTC model from icefall
|
||||
to decode files.
|
||||
|
||||
Please download model files from
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
|
||||
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import sherpa_onnx
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
def create_recognizer():
    """Create a Zipformer CTC recognizer and pick the test wave to decode.

    Returns:
      A tuple ``(recognizer, test_wav)`` where ``recognizer`` is built with
      ``sherpa_onnx.OfflineRecognizer.from_zipformer_ctc`` and ``test_wav`` is
      the path of the wave file to decode.

    Raises:
      ValueError: If the model file or the test wave file does not exist.
    """
    model = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
    tokens = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
    test_wav = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"

    # Fail early with a download hint instead of an error from the runtime.
    files_present = Path(model).is_file() and Path(test_wav).is_file()
    if not files_present:
        raise ValueError(
            """Please download model files from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
"""
        )

    recognizer = sherpa_onnx.OfflineRecognizer.from_zipformer_ctc(
        model=model,
        tokens=tokens,
        debug=True,
    )
    return recognizer, test_wav
|
||||
|
||||
|
||||
def main():
    """Decode the test wave with the Zipformer CTC recognizer and print the result."""
    recognizer, wave_filename = create_recognizer()

    samples, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
    # Only use the first channel.
    # The result is a 1-D float32 numpy array normalized to the range [-1, 1];
    # sample_rate does not need to be 16000 Hz.
    first_channel = samples[:, 0]

    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, first_channel)
    recognizer.decode_stream(stream)

    print(wave_filename)
    print(stream.result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -344,7 +344,7 @@ def get_models():
|
||||
""",
|
||||
),
|
||||
Model(
|
||||
model_name="sherpa-onnx-streaming-zipformer-ctc-fp16-zh-2025-06-30",
|
||||
model_name="sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30",
|
||||
idx=19,
|
||||
lang="zh",
|
||||
short_name="large_zipformer_fp16",
|
||||
@@ -360,6 +360,26 @@ def get_models():
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
Model(
|
||||
model_name="sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30",
|
||||
idx=20,
|
||||
lang="zh",
|
||||
short_name="large_zipformer_int8",
|
||||
rule_fsts="itn_zh_number.fst",
|
||||
cmd="""
|
||||
if [ ! -f itn_zh_number.fst ]; then
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst
|
||||
fi
|
||||
pushd $model_name
|
||||
rm -fv bpe.model
|
||||
|
||||
rm -rf test_wavs
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
|
||||
@@ -548,6 +548,23 @@ def get_models():
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
Model(
|
||||
model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
|
||||
idx=31,
|
||||
lang="zh",
|
||||
lang2="Chinese",
|
||||
short_name="zipformer_2025_07_03",
|
||||
cmd="""
|
||||
pushd $model_name
|
||||
|
||||
rm -rfv test_wavs
|
||||
rm -rfv bbpe.model
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
""",
|
||||
),
|
||||
|
||||
@@ -27,6 +27,7 @@ namespace SherpaOnnx
|
||||
Moonshine = new OfflineMoonshineModelConfig();
|
||||
FireRedAsr = new OfflineFireRedAsrModelConfig();
|
||||
Dolphin = new OfflineDolphinModelConfig();
|
||||
ZipformerCtc = new OfflineZipformerCtcModelConfig();
|
||||
}
|
||||
public OfflineTransducerModelConfig Transducer;
|
||||
public OfflineParaformerModelConfig Paraformer;
|
||||
@@ -60,5 +61,6 @@ namespace SherpaOnnx
|
||||
public OfflineMoonshineModelConfig Moonshine;
|
||||
public OfflineFireRedAsrModelConfig FireRedAsr;
|
||||
public OfflineDolphinModelConfig Dolphin;
|
||||
public OfflineZipformerCtcModelConfig ZipformerCtc;
|
||||
}
|
||||
}
|
||||
|
||||
18
scripts/dotnet/OfflineZipformerCtcModelConfig.cs
Normal file
18
scripts/dotnet/OfflineZipformerCtcModelConfig.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace SherpaOnnx
|
||||
{
|
||||
|
||||
/// <summary>
/// Configuration for a non-streaming Zipformer CTC model.
/// Laid out sequentially, with <see cref="Model"/> marshalled as an LPStr.
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct OfflineZipformerCtcModelConfig
{
    /// <summary>Initializes <see cref="Model"/> to an empty string.</summary>
    public OfflineZipformerCtcModelConfig() => Model = "";

    /// <summary>Model setting; empty by default.</summary>
    [MarshalAs(UnmanagedType.LPStr)]
    public string Model;
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
../../../../go-api-examples/non-streaming-decode-files/run-zipformer-ctc.sh
|
||||
@@ -398,6 +398,10 @@ type OfflineNemoEncDecCtcModelConfig struct {
|
||||
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
|
||||
}
|
||||
|
||||
type OfflineZipformerCtcModelConfig struct {
|
||||
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
|
||||
}
|
||||
|
||||
type OfflineDolphinModelConfig struct {
|
||||
Model string // Path to the model, e.g., model.onnx or model.int8.onnx
|
||||
}
|
||||
@@ -448,6 +452,7 @@ type OfflineModelConfig struct {
|
||||
Moonshine OfflineMoonshineModelConfig
|
||||
FireRedAsr OfflineFireRedAsrModelConfig
|
||||
Dolphin OfflineDolphinModelConfig
|
||||
ZipformerCtc OfflineZipformerCtcModelConfig
|
||||
Tokens string // Path to tokens.txt
|
||||
|
||||
// Number of threads to use for neural network computation
|
||||
@@ -540,6 +545,7 @@ func newCOfflineRecognizerConfig(config *OfflineRecognizerConfig) *C.struct_Sher
|
||||
c.model_config.fire_red_asr.decoder = C.CString(config.ModelConfig.FireRedAsr.Decoder)
|
||||
|
||||
c.model_config.dolphin.model = C.CString(config.ModelConfig.Dolphin.Model)
|
||||
c.model_config.zipformer_ctc.model = C.CString(config.ModelConfig.ZipformerCtc.Model)
|
||||
|
||||
c.model_config.tokens = C.CString(config.ModelConfig.Tokens)
|
||||
|
||||
@@ -653,11 +659,22 @@ func freeCOfflineRecognizerConfig(c *C.struct_SherpaOnnxOfflineRecognizerConfig)
|
||||
C.free(unsafe.Pointer(c.model_config.fire_red_asr.encoder))
|
||||
c.model_config.fire_red_asr.encoder = nil
|
||||
}
|
||||
|
||||
if c.model_config.fire_red_asr.decoder != nil {
|
||||
C.free(unsafe.Pointer(c.model_config.fire_red_asr.decoder))
|
||||
c.model_config.fire_red_asr.decoder = nil
|
||||
}
|
||||
|
||||
if c.model_config.dolphin.model != nil {
|
||||
C.free(unsafe.Pointer(c.model_config.dolphin.model))
|
||||
c.model_config.dolphin.model = nil
|
||||
}
|
||||
|
||||
if c.model_config.zipformer_ctc.model != nil {
|
||||
C.free(unsafe.Pointer(c.model_config.zipformer_ctc.model))
|
||||
c.model_config.zipformer_ctc.model = nil
|
||||
}
|
||||
|
||||
if c.model_config.tokens != nil {
|
||||
C.free(unsafe.Pointer(c.model_config.tokens))
|
||||
c.model_config.tokens = nil
|
||||
|
||||
@@ -212,6 +212,21 @@ def get_models():
|
||||
git diff
|
||||
""",
|
||||
),
|
||||
Model(
|
||||
model_name="sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03",
|
||||
hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
|
||||
ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-ctc",
|
||||
short_name="vad-asr-zh-zipformer-ctc",
|
||||
cmd="""
|
||||
pushd $model_name
|
||||
mv model.int8.onnx ../zipformer-ctc.onnx
|
||||
mv tokens.txt ../
|
||||
popd
|
||||
rm -rf $model_name
|
||||
sed -i.bak 's/Zipformer/Zipformer CTC supporting Chinese 中文/g' ../index.html
|
||||
git diff
|
||||
""",
|
||||
),
|
||||
]
|
||||
return models
|
||||
|
||||
|
||||
@@ -484,6 +484,9 @@ static sherpa_onnx::OfflineRecognizerConfig GetOfflineRecognizerConfig(
|
||||
recognizer_config.model_config.dolphin.model =
|
||||
SHERPA_ONNX_OR(config->model_config.dolphin.model, "");
|
||||
|
||||
recognizer_config.model_config.zipformer_ctc.model =
|
||||
SHERPA_ONNX_OR(config->model_config.zipformer_ctc.model, "");
|
||||
|
||||
recognizer_config.lm_config.model =
|
||||
SHERPA_ONNX_OR(config->lm_config.model, "");
|
||||
recognizer_config.lm_config.scale =
|
||||
|
||||
@@ -451,6 +451,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineDolphinModelConfig {
|
||||
const char *model;
|
||||
} SherpaOnnxOfflineDolphinModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineZipformerCtcModelConfig {
|
||||
const char *model;
|
||||
} SherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
|
||||
SherpaOnnxOfflineTransducerModelConfig transducer;
|
||||
SherpaOnnxOfflineParaformerModelConfig paraformer;
|
||||
@@ -474,6 +478,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
|
||||
SherpaOnnxOfflineMoonshineModelConfig moonshine;
|
||||
SherpaOnnxOfflineFireRedAsrModelConfig fire_red_asr;
|
||||
SherpaOnnxOfflineDolphinModelConfig dolphin;
|
||||
SherpaOnnxOfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
} SherpaOnnxOfflineModelConfig;
|
||||
|
||||
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
|
||||
|
||||
@@ -252,6 +252,9 @@ OfflineRecognizer OfflineRecognizer::Create(
|
||||
|
||||
c.model_config.dolphin.model = config.model_config.dolphin.model.c_str();
|
||||
|
||||
c.model_config.zipformer_ctc.model =
|
||||
config.model_config.zipformer_ctc.model.c_str();
|
||||
|
||||
c.lm_config.model = config.lm_config.model.c_str();
|
||||
c.lm_config.scale = config.lm_config.scale;
|
||||
|
||||
|
||||
@@ -241,6 +241,10 @@ struct SHERPA_ONNX_API OfflineDolphinModelConfig {
|
||||
std::string model;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineZipformerCtcModelConfig {
|
||||
std::string model;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineMoonshineModelConfig {
|
||||
std::string preprocessor;
|
||||
std::string encoder;
|
||||
@@ -267,6 +271,7 @@ struct SHERPA_ONNX_API OfflineModelConfig {
|
||||
OfflineMoonshineModelConfig moonshine;
|
||||
OfflineFireRedAsrModelConfig fire_red_asr;
|
||||
OfflineDolphinModelConfig dolphin;
|
||||
OfflineZipformerCtcModelConfig zipformer_ctc;
|
||||
};
|
||||
|
||||
struct SHERPA_ONNX_API OfflineLMConfig {
|
||||
|
||||
@@ -113,6 +113,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
|
||||
const OfflineModelConfig &config) {
|
||||
if (!config.dolphin.model.empty()) {
|
||||
return std::make_unique<OfflineDolphinModel>(config);
|
||||
} else if (!config.nemo_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineNemoEncDecCtcModel>(config);
|
||||
} else if (!config.tdnn.model.empty()) {
|
||||
return std::make_unique<OfflineTdnnCtcModel>(config);
|
||||
} else if (!config.zipformer_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineZipformerCtcModel>(config);
|
||||
} else if (!config.wenet_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineWenetCtcModel>(config);
|
||||
} else if (!config.telespeech_ctc.empty()) {
|
||||
return std::make_unique<OfflineTeleSpeechCtcModel>(config);
|
||||
}
|
||||
|
||||
// TODO(fangjun): Refactor it. We don't need to use model_type here
|
||||
@@ -167,6 +177,16 @@ std::unique_ptr<OfflineCtcModel> OfflineCtcModel::Create(
|
||||
Manager *mgr, const OfflineModelConfig &config) {
|
||||
if (!config.dolphin.model.empty()) {
|
||||
return std::make_unique<OfflineDolphinModel>(mgr, config);
|
||||
} else if (!config.nemo_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineNemoEncDecCtcModel>(mgr, config);
|
||||
} else if (!config.tdnn.model.empty()) {
|
||||
return std::make_unique<OfflineTdnnCtcModel>(mgr, config);
|
||||
} else if (!config.zipformer_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineZipformerCtcModel>(mgr, config);
|
||||
} else if (!config.wenet_ctc.model.empty()) {
|
||||
return std::make_unique<OfflineWenetCtcModel>(mgr, config);
|
||||
} else if (!config.telespeech_ctc.empty()) {
|
||||
return std::make_unique<OfflineTeleSpeechCtcModel>(mgr, config);
|
||||
}
|
||||
|
||||
// TODO(fangjun): Refactor it. We don't need to use model_type here
|
||||
|
||||
@@ -33,6 +33,7 @@ java_files += OfflineWhisperModelConfig.java
|
||||
java_files += OfflineFireRedAsrModelConfig.java
|
||||
java_files += OfflineMoonshineModelConfig.java
|
||||
java_files += OfflineNemoEncDecCtcModelConfig.java
|
||||
java_files += OfflineZipformerCtcModelConfig.java
|
||||
java_files += OfflineSenseVoiceModelConfig.java
|
||||
java_files += OfflineDolphinModelConfig.java
|
||||
java_files += OfflineModelConfig.java
|
||||
|
||||
@@ -11,6 +11,7 @@ public class OfflineModelConfig {
|
||||
private final OfflineNemoEncDecCtcModelConfig nemo;
|
||||
private final OfflineSenseVoiceModelConfig senseVoice;
|
||||
private final OfflineDolphinModelConfig dolphin;
|
||||
private final OfflineZipformerCtcModelConfig zipformerCtc;
|
||||
private final String teleSpeech;
|
||||
private final String tokens;
|
||||
private final int numThreads;
|
||||
@@ -28,6 +29,7 @@ public class OfflineModelConfig {
|
||||
this.fireRedAsr = builder.fireRedAsr;
|
||||
this.moonshine = builder.moonshine;
|
||||
this.nemo = builder.nemo;
|
||||
this.zipformerCtc = builder.zipformerCtc;
|
||||
this.senseVoice = builder.senseVoice;
|
||||
this.dolphin = builder.dolphin;
|
||||
this.teleSpeech = builder.teleSpeech;
|
||||
@@ -52,7 +54,7 @@ public class OfflineModelConfig {
|
||||
return transducer;
|
||||
}
|
||||
|
||||
public OfflineWhisperModelConfig getZipformer2Ctc() {
|
||||
public OfflineWhisperModelConfig getWhisper() {
|
||||
return whisper;
|
||||
}
|
||||
|
||||
@@ -68,6 +70,14 @@ public class OfflineModelConfig {
|
||||
return dolphin;
|
||||
}
|
||||
|
||||
public OfflineNemoEncDecCtcModelConfig getNemo() {
|
||||
return nemo;
|
||||
}
|
||||
|
||||
public OfflineZipformerCtcModelConfig getZipformerCtc() {
|
||||
return zipformerCtc;
|
||||
}
|
||||
|
||||
public String getTokens() {
|
||||
return tokens;
|
||||
}
|
||||
@@ -109,6 +119,7 @@ public class OfflineModelConfig {
|
||||
private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build();
|
||||
private OfflineSenseVoiceModelConfig senseVoice = OfflineSenseVoiceModelConfig.builder().build();
|
||||
private OfflineDolphinModelConfig dolphin = OfflineDolphinModelConfig.builder().build();
|
||||
private OfflineZipformerCtcModelConfig zipformerCtc = OfflineZipformerCtcModelConfig.builder().build();
|
||||
private String teleSpeech = "";
|
||||
private String tokens = "";
|
||||
private int numThreads = 1;
|
||||
@@ -142,6 +153,11 @@ public class OfflineModelConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setZipformerCtc(OfflineZipformerCtcModelConfig zipformerCtc) {
|
||||
this.zipformerCtc = zipformerCtc;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTeleSpeech(String teleSpeech) {
|
||||
this.teleSpeech = teleSpeech;
|
||||
return this;
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
// Copyright 2025 Xiaomi Corporation
|
||||
|
||||
package com.k2fsa.sherpa.onnx;
|
||||
|
||||
/**
 * Immutable configuration for a non-streaming Zipformer CTC model.
 *
 * <p>Instances are created through {@link #builder()}. The only setting is the
 * path to the model file; it defaults to the empty string.
 */
public class OfflineZipformerCtcModelConfig {
    // NOTE: the native (JNI) layer looks this field up by the name "model",
    // so the field must not be renamed.
    private final String model;

    /** Returns a builder whose model path is initially empty. */
    public static Builder builder() {
        return new Builder();
    }

    /** Copies the settings collected by {@code source} into a new config. */
    private OfflineZipformerCtcModelConfig(Builder source) {
        model = source.model;
    }

    /** Returns the configured model path (empty if none was set). */
    public String getModel() {
        return this.model;
    }

    /** Fluent builder for {@link OfflineZipformerCtcModelConfig}. */
    public static class Builder {
        private String model = "";

        /** Stores the model path and returns this builder for chaining. */
        public Builder setModel(String model) {
            this.model = model;
            return this;
        }

        /** Creates the config from the values set so far. */
        public OfflineZipformerCtcModelConfig build() {
            return new OfflineZipformerCtcModelConfig(this);
        }
    }
}
|
||||
@@ -269,6 +269,21 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) {
|
||||
ans.model_config.nemo_ctc.model = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
// zipformer ctc
|
||||
fid =
|
||||
env->GetFieldID(model_config_cls, "zipformerCtc",
|
||||
"Lcom/k2fsa/sherpa/onnx/OfflineZipformerCtcModelConfig;");
|
||||
jobject zipformer_ctc_config = env->GetObjectField(model_config, fid);
|
||||
jclass zipformer_ctc_config_cls = env->GetObjectClass(zipformer_ctc_config);
|
||||
|
||||
fid =
|
||||
env->GetFieldID(zipformer_ctc_config_cls, "model", "Ljava/lang/String;");
|
||||
|
||||
s = (jstring)env->GetObjectField(zipformer_ctc_config, fid);
|
||||
p = env->GetStringUTFChars(s, nullptr);
|
||||
ans.model_config.zipformer_ctc.model = p;
|
||||
env->ReleaseStringUTFChars(s, p);
|
||||
|
||||
// dolphin
|
||||
fid = env->GetFieldID(model_config_cls, "dolphin",
|
||||
"Lcom/k2fsa/sherpa/onnx/OfflineDolphinModelConfig;");
|
||||
|
||||
@@ -29,6 +29,10 @@ data class OfflineDolphinModelConfig(
|
||||
var model: String = "",
|
||||
)
|
||||
|
||||
/**
 * Configuration for a non-streaming Zipformer CTC model.
 *
 * @property model Path to the model file; defaults to the empty string.
 */
data class OfflineZipformerCtcModelConfig(var model: String = "")
|
||||
|
||||
data class OfflineWhisperModelConfig(
|
||||
var encoder: String = "",
|
||||
var decoder: String = "",
|
||||
@@ -64,6 +68,7 @@ data class OfflineModelConfig(
|
||||
var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(),
|
||||
var senseVoice: OfflineSenseVoiceModelConfig = OfflineSenseVoiceModelConfig(),
|
||||
var dolphin: OfflineDolphinModelConfig = OfflineDolphinModelConfig(),
|
||||
var zipformerCtc: OfflineZipformerCtcModelConfig = OfflineZipformerCtcModelConfig(),
|
||||
var teleSpeech: String = "",
|
||||
var numThreads: Int = 1,
|
||||
var debug: Boolean = false,
|
||||
@@ -559,6 +564,16 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? {
|
||||
modelType = "nemo_transducer",
|
||||
)
|
||||
}
|
||||
|
||||
31 -> {
|
||||
val modelDir = "sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03"
|
||||
return OfflineModelConfig(
|
||||
zipformerCtc = OfflineZipformerCtcModelConfig(
|
||||
model = "$modelDir/model.int8.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
)
|
||||
}
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
@@ -412,6 +412,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
|
||||
model = "$modelDir/model.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
modelType = "zipformer2",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -422,6 +423,7 @@ fun getModelConfig(type: Int): OnlineModelConfig? {
|
||||
model = "$modelDir/model.fp16.onnx",
|
||||
),
|
||||
tokens = "$modelDir/tokens.txt",
|
||||
modelType = "zipformer2",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -284,6 +284,11 @@ type
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
|
||||
TSherpaOnnxOfflineZipformerCtcModelConfig = record
|
||||
Model: AnsiString;
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
|
||||
TSherpaOnnxOfflineWhisperModelConfig = record
|
||||
Encoder: AnsiString;
|
||||
Decoder: AnsiString;
|
||||
@@ -346,6 +351,7 @@ type
|
||||
Moonshine: TSherpaOnnxOfflineMoonshineModelConfig;
|
||||
FireRedAsr: TSherpaOnnxOfflineFireRedAsrModelConfig;
|
||||
Dolphin: TSherpaOnnxOfflineDolphinModelConfig;
|
||||
ZipformerCtc: TSherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
|
||||
function ToString: AnsiString;
|
||||
end;
|
||||
@@ -726,6 +732,9 @@ type
|
||||
SherpaOnnxOfflineDolphinModelConfig = record
|
||||
Model: PAnsiChar;
|
||||
end;
|
||||
SherpaOnnxOfflineZipformerCtcModelConfig = record
|
||||
Model: PAnsiChar;
|
||||
end;
|
||||
SherpaOnnxOfflineWhisperModelConfig = record
|
||||
Encoder: PAnsiChar;
|
||||
Decoder: PAnsiChar;
|
||||
@@ -773,6 +782,7 @@ type
|
||||
Moonshine: SherpaOnnxOfflineMoonshineModelConfig;
|
||||
FireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig;
|
||||
Dolphin: SherpaOnnxOfflineDolphinModelConfig;
|
||||
ZipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig;
|
||||
end;
|
||||
|
||||
SherpaOnnxOfflineRecognizerConfig = record
|
||||
@@ -1536,6 +1546,12 @@ begin
|
||||
[Self.Model]);
|
||||
end;
|
||||
|
||||
{ Returns a printable description of this config, showing the Model path. }
function TSherpaOnnxOfflineZipformerCtcModelConfig.ToString: AnsiString;
const
  Template = 'TSherpaOnnxOfflineZipformerCtcModelConfig(Model := %s)';
begin
  Result := Format(Template, [Self.Model]);
end;
|
||||
|
||||
function TSherpaOnnxOfflineWhisperModelConfig.ToString: AnsiString;
|
||||
begin
|
||||
Result := Format('TSherpaOnnxOfflineWhisperModelConfig(' +
|
||||
@@ -1610,14 +1626,15 @@ begin
|
||||
'SenseVoice := %s, ' +
|
||||
'Moonshine := %s, ' +
|
||||
'FireRedAsr := %s, ' +
|
||||
'Dolphin := %s' +
|
||||
'Dolphin := %s, ' +
|
||||
'ZipformerCtc := %s' +
|
||||
')',
|
||||
[Self.Transducer.ToString, Self.Paraformer.ToString,
|
||||
Self.NeMoCtc.ToString, Self.Whisper.ToString, Self.Tdnn.ToString,
|
||||
Self.Tokens, Self.NumThreads, Self.Debug.ToString, Self.Provider,
|
||||
Self.ModelType, Self.ModelingUnit, Self.BpeVocab,
|
||||
Self.TeleSpeechCtc, Self.SenseVoice.ToString, Self.Moonshine.ToString,
|
||||
Self.FireRedAsr.ToString, Self.Dolphin.ToString
|
||||
Self.FireRedAsr.ToString, Self.Dolphin.ToString, Self.ZipformerCtc.ToString
|
||||
]);
|
||||
end;
|
||||
|
||||
@@ -1688,6 +1705,7 @@ begin
|
||||
C.ModelConfig.FireRedAsr.Decoder := PAnsiChar(Config.ModelConfig.FireRedAsr.Decoder);
|
||||
|
||||
C.ModelConfig.Dolphin.Model := PAnsiChar(Config.ModelConfig.Dolphin.Model);
|
||||
C.ModelConfig.ZipformerCtc.Model := PAnsiChar(Config.ModelConfig.ZipformerCtc.Model);
|
||||
|
||||
C.LMConfig.Model := PAnsiChar(Config.LMConfig.Model);
|
||||
C.LMConfig.Scale := Config.LMConfig.Scale;
|
||||
|
||||
@@ -527,6 +527,87 @@ class OfflineRecognizer(object):
|
||||
self.config = recognizer_config
|
||||
return self
|
||||
|
||||
@classmethod
def from_zipformer_ctc(
    cls,
    model: str,
    tokens: str,
    num_threads: int = 1,
    sample_rate: int = 16000,
    feature_dim: int = 80,
    decoding_method: str = "greedy_search",
    debug: bool = False,
    provider: str = "cpu",
    rule_fsts: str = "",
    rule_fars: str = "",
    hr_dict_dir: str = "",
    hr_rule_fsts: str = "",
    hr_lexicon: str = "",
):
    """
    Create a recognizer backed by a non-streaming Zipformer CTC model.

    Pre-trained models for several languages (e.g., Chinese, English) are
    listed at
    `<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/index.html>`_

    Args:
      model:
        Path to ``model.onnx``.
      tokens:
        Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
        columns::

            symbol integer_id

      num_threads:
        Number of threads for neural network computation.
      sample_rate:
        Sample rate of the training data used to train the model.
      feature_dim:
        Dimension of the feature used to train the model.
      decoding_method:
        Valid values are greedy_search.
      debug:
        True to show debug messages.
      provider:
        onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
      rule_fsts:
        If not empty, it specifies fsts for inverse text normalization.
        If there are multiple fsts, they are separated by a comma.
      rule_fars:
        If not empty, it specifies fst archives for inverse text normalization.
        If there are multiple archives, they are separated by a comma.
      hr_dict_dir:
        Passed to ``HomophoneReplacerConfig`` as ``dict_dir``.
      hr_rule_fsts:
        Passed to ``HomophoneReplacerConfig`` as ``rule_fsts``.
      hr_lexicon:
        Passed to ``HomophoneReplacerConfig`` as ``lexicon``.

    Returns:
      A new instance of ``cls`` whose ``recognizer`` and ``config``
      attributes are set.
    """
    instance = cls.__new__(cls)

    # Only the zipformer_ctc sub-config is filled in; that is what selects
    # this model family.
    zipformer_model_config = OfflineModelConfig(
        zipformer_ctc=OfflineZipformerCtcModelConfig(model=model),
        tokens=tokens,
        num_threads=num_threads,
        debug=debug,
        provider=provider,
    )

    feature_config = FeatureExtractorConfig(
        sampling_rate=sample_rate,
        feature_dim=feature_dim,
    )

    homophone_replacer = HomophoneReplacerConfig(
        dict_dir=hr_dict_dir,
        lexicon=hr_lexicon,
        rule_fsts=hr_rule_fsts,
    )

    recognizer_config = OfflineRecognizerConfig(
        feat_config=feature_config,
        model_config=zipformer_model_config,
        decoding_method=decoding_method,
        rule_fsts=rule_fsts,
        rule_fars=rule_fars,
        hr=homophone_replacer,
    )

    instance.recognizer = _Recognizer(recognizer_config)
    instance.config = recognizer_config
    return instance
|
||||
|
||||
@classmethod
|
||||
def from_nemo_ctc(
|
||||
cls,
|
||||
|
||||
3
swift-api-examples/.gitignore
vendored
3
swift-api-examples/.gitignore
vendored
@@ -16,3 +16,6 @@ tts-kokoro-en
|
||||
tts-kokoro-zh-en
|
||||
speech-enhancement-gtcrn
|
||||
decode-file-sense-voice-with-hr
|
||||
test-version
|
||||
zipformer-ctc-asr
|
||||
dolphin-ctc-asr
|
||||
|
||||
@@ -346,6 +346,14 @@ func sherpaOnnxOfflineParaformerModelConfig(
|
||||
)
|
||||
}
|
||||
|
||||
/// Builds the C-level config struct for a non-streaming Zipformer CTC model.
///
/// - Parameter model: Path to the model file. Defaults to the empty string.
/// - Returns: A `SherpaOnnxOfflineZipformerCtcModelConfig` whose `model`
///   field holds the C-string pointer produced by `toCPointer`.
func sherpaOnnxOfflineZipformerCtcModelConfig(
  model: String = ""
) -> SherpaOnnxOfflineZipformerCtcModelConfig {
  let modelPointer = toCPointer(model)
  return SherpaOnnxOfflineZipformerCtcModelConfig(model: modelPointer)
}
|
||||
|
||||
func sherpaOnnxOfflineNemoEncDecCtcModelConfig(
|
||||
model: String = ""
|
||||
) -> SherpaOnnxOfflineNemoEncDecCtcModelConfig {
|
||||
@@ -449,7 +457,9 @@ func sherpaOnnxOfflineModelConfig(
|
||||
senseVoice: SherpaOnnxOfflineSenseVoiceModelConfig = sherpaOnnxOfflineSenseVoiceModelConfig(),
|
||||
moonshine: SherpaOnnxOfflineMoonshineModelConfig = sherpaOnnxOfflineMoonshineModelConfig(),
|
||||
fireRedAsr: SherpaOnnxOfflineFireRedAsrModelConfig = sherpaOnnxOfflineFireRedAsrModelConfig(),
|
||||
dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig()
|
||||
dolphin: SherpaOnnxOfflineDolphinModelConfig = sherpaOnnxOfflineDolphinModelConfig(),
|
||||
zipformerCtc: SherpaOnnxOfflineZipformerCtcModelConfig =
|
||||
sherpaOnnxOfflineZipformerCtcModelConfig()
|
||||
) -> SherpaOnnxOfflineModelConfig {
|
||||
return SherpaOnnxOfflineModelConfig(
|
||||
transducer: transducer,
|
||||
@@ -468,7 +478,8 @@ func sherpaOnnxOfflineModelConfig(
|
||||
sense_voice: senseVoice,
|
||||
moonshine: moonshine,
|
||||
fire_red_asr: fireRedAsr,
|
||||
dolphin: dolphin
|
||||
dolphin: dolphin,
|
||||
zipformer_ctc: zipformerCtc
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
43
swift-api-examples/run-zipformer-ctc-asr.sh
Executable file
43
swift-api-examples/run-zipformer-ctc-asr.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env bash
#
# Builds (when the binary is missing) and runs the Swift zipformer CTC ASR
# example. The test model is downloaded first if it is not already present.

set -ex

# The Swift build of sherpa-onnx must exist before anything else can work.
[ -d ../build-swift-macos ] || {
  echo "Please run ../build-swift-macos.sh first!"
  exit 1
}

# Fetch and unpack the pre-trained model only when it is missing.
if ! [ -f ./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx ]; then
  printf '%s\n' \
    "Please download the pre-trained model for testing." \
    "You can refer to" \
    "" \
    "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/icefall/zipformer.html#sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03-chinese" \
    "" \
    "for help"

  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2

  tar xvf sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  rm sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03.tar.bz2
  ls -lh sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03
fi

# Reuse an existing binary; otherwise compile the example.
if [ -e ./zipformer-ctc-asr ]; then
  echo "./zipformer-ctc-asr exists - skip building"
else
  # Note: We use -lc++ to link against libc++ instead of libstdc++
  swiftc \
    -lc++ \
    -I ../build-swift-macos/install/include \
    -import-objc-header ./SherpaOnnx-Bridging-Header.h \
    ./zipformer-ctc-asr.swift ./SherpaOnnx.swift \
    -L ../build-swift-macos/install/lib/ \
    -l sherpa-onnx \
    -l onnxruntime \
    -o zipformer-ctc-asr

  strip zipformer-ctc-asr
fi

# Put the freshly built libraries on the dynamic loader's search path.
DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH
export DYLD_LIBRARY_PATH
./zipformer-ctc-asr
|
||||
66
swift-api-examples/zipformer-ctc-asr.swift
Normal file
66
swift-api-examples/zipformer-ctc-asr.swift
Normal file
@@ -0,0 +1,66 @@
|
||||
import AVFoundation
|
||||
|
||||
extension AudioBuffer {
  /// Copies the buffer's contents into a `[Float]`.
  func array() -> [Float] {
    let samples = UnsafeBufferPointer<Float>(self)
    return [Float](samples)
  }
}
|
||||
|
||||
extension AVAudioPCMBuffer {
  /// Returns the samples stored in `mBuffers` of this buffer's
  /// `audioBufferList` as a `[Float]`.
  func array() -> [Float] {
    let buffer = audioBufferList.pointee.mBuffers
    return buffer.array()
  }
}
|
||||
|
||||
/// Decodes a test wave file with a non-streaming Zipformer CTC model and
/// prints the recognized text (and timestamps, when there are any).
func run() {
  let modelPath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/model.int8.onnx"
  let tokensPath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/tokens.txt"
  let wavPath = "./sherpa-onnx-zipformer-ctc-zh-int8-2025-07-03/test_wavs/0.wav"

  // Assemble the recognizer configuration from its parts.
  let modelConfig = sherpaOnnxOfflineModelConfig(
    tokens: tokensPath,
    debug: 0,
    zipformerCtc: sherpaOnnxOfflineZipformerCtcModelConfig(model: modelPath)
  )
  var recognizerConfig = sherpaOnnxOfflineRecognizerConfig(
    featConfig: sherpaOnnxFeatureConfig(sampleRate: 16000, featureDim: 80),
    modelConfig: modelConfig
  )

  let recognizer = SherpaOnnxOfflineRecognizer(config: &recognizerConfig)

  // Open the wave file; the example expects mono float32 audio.
  let wavFile = try! AVAudioFile(forReading: NSURL(fileURLWithPath: wavPath) as URL)
  let format = wavFile.processingFormat
  assert(format.channelCount == 1)
  assert(format.commonFormat == AVAudioCommonFormat.pcmFormatFloat32)

  // Read the whole file into a single PCM buffer.
  let frameCount = UInt32(wavFile.length)
  let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)
  try! wavFile.read(into: pcmBuffer!)

  let samples: [Float]! = pcmBuffer?.array()
  let result = recognizer.decode(samples: samples, sampleRate: Int(format.sampleRate))

  print("\nresult is:\n\(result.text)")
  if result.timestamps.count != 0 {
    print("\ntimestamps is:\n\(result.timestamps)")
  }
}
|
||||
|
||||
/// Program entry point: runs the Zipformer CTC decoding example.
@main
struct App {
  static func main() { run() }
}
|
||||
@@ -43,6 +43,10 @@ function freeConfig(config, Module) {
|
||||
freeConfig(config.dolphin, Module)
|
||||
}
|
||||
|
||||
if ('zipformerCtc' in config) {
|
||||
freeConfig(config.zipformerCtc, Module)
|
||||
}
|
||||
|
||||
if ('moonshine' in config) {
|
||||
freeConfig(config.moonshine, Module)
|
||||
}
|
||||
@@ -627,6 +631,23 @@ function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Allocates the in-heap representation of an offline Zipformer CTC model
 * config: a NUL-terminated UTF-8 copy of the model path, plus a struct that
 * consists of a single pointer to that string.
 *
 * @param {Object} config Object with an optional `model` path; a missing or
 *     empty value is treated as ''.
 * @param {Object} Module The Emscripten module providing `_malloc`,
 *     `lengthBytesUTF8`, `stringToUTF8` and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}} `buffer` is the
 *     string storage, `ptr` the struct, and `len` the struct size in bytes.
 */
function initSherpaOnnxOfflineZipformerCtcModelConfig(config, Module) {
  const modelPath = config.model || '';

  // +1 leaves room for the terminating NUL byte.
  const numBytes = Module.lengthBytesUTF8(modelPath) + 1;
  const buffer = Module._malloc(numBytes);

  // The struct is exactly one pointer, and a pointer is 4 bytes here.
  const len = 4;
  const ptr = Module._malloc(len);

  Module.stringToUTF8(modelPath, buffer, numBytes);
  Module.setValue(ptr, buffer, 'i8*');

  return {buffer, ptr, len};
}
|
||||
|
||||
function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
|
||||
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
|
||||
const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
|
||||
@@ -840,6 +861,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
};
|
||||
}
|
||||
|
||||
if (!('zipformerCtc' in config)) {
|
||||
config.zipformerCtc = {
|
||||
model: '',
|
||||
};
|
||||
}
|
||||
|
||||
if (!('whisper' in config)) {
|
||||
config.whisper = {
|
||||
encoder: '',
|
||||
@@ -906,9 +933,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
const dolphin =
|
||||
initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
|
||||
|
||||
const zipformerCtc =
|
||||
initSherpaOnnxOfflineZipformerCtcModelConfig(config.zipformerCtc, Module);
|
||||
|
||||
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
|
||||
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
|
||||
dolphin.len;
|
||||
dolphin.len + zipformerCtc.len;
|
||||
|
||||
const ptr = Module._malloc(len);
|
||||
|
||||
@@ -1010,11 +1040,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
|
||||
Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
|
||||
offset += dolphin.len;
|
||||
|
||||
Module._CopyHeap(zipformerCtc.ptr, zipformerCtc.len, ptr + offset);
|
||||
offset += zipformerCtc.len;
|
||||
|
||||
return {
|
||||
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
|
||||
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
|
||||
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
|
||||
dolphin: dolphin
|
||||
dolphin: dolphin, zipformerCtc: zipformerCtc
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ extern "C" {
|
||||
static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
|
||||
|
||||
static_assert(sizeof(SherpaOnnxOfflineZipformerCtcModelConfig) == 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
|
||||
static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
|
||||
@@ -31,7 +32,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
|
||||
sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineDolphinModelConfig),
|
||||
sizeof(SherpaOnnxOfflineDolphinModelConfig) +
|
||||
sizeof(SherpaOnnxOfflineZipformerCtcModelConfig),
|
||||
|
||||
"");
|
||||
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
|
||||
@@ -77,6 +79,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
|
||||
auto moonshine = &model_config->moonshine;
|
||||
auto fire_red_asr = &model_config->fire_red_asr;
|
||||
auto dolphin = &model_config->dolphin;
|
||||
auto zipformer_ctc = &model_config->zipformer_ctc;
|
||||
|
||||
fprintf(stdout, "----------offline transducer model config----------\n");
|
||||
fprintf(stdout, "encoder: %s\n", transducer->encoder);
|
||||
@@ -117,6 +120,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
|
||||
fprintf(stdout, "----------offline Dolphin model config----------\n");
|
||||
fprintf(stdout, "model: %s\n", dolphin->model);
|
||||
|
||||
fprintf(stdout, "----------offline zipformer ctc model config----------\n");
|
||||
fprintf(stdout, "model: %s\n", zipformer_ctc->model);
|
||||
|
||||
fprintf(stdout, "tokens: %s\n", model_config->tokens);
|
||||
fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
|
||||
fprintf(stdout, "provider: %s\n", model_config->provider);
|
||||
|
||||
@@ -117,6 +117,10 @@ function initOfflineRecognizer() {
|
||||
};
|
||||
} else if (fileExists('dolphin.onnx')) {
|
||||
config.modelConfig.dolphin = {model: './dolphin.onnx'};
|
||||
} else if (fileExists('zipformer-ctc.onnx')) {
|
||||
// you need to rename model.int8.onnx from zipformer CTC to
|
||||
// zipformer-ctc.onnx
|
||||
config.modelConfig.zipformerCtc = {model: './zipformer-ctc.onnx'};
|
||||
} else {
|
||||
console.log('Please specify a model.');
|
||||
alert('Please specify a model.');
|
||||
|
||||
Reference in New Issue
Block a user