diff --git a/.github/workflows/export-libriheavy.yaml b/.github/workflows/export-libriheavy.yaml
new file mode 100644
index 00000000..cfe0a28d
--- /dev/null
+++ b/.github/workflows/export-libriheavy.yaml
@@ -0,0 +1,143 @@
+# Runs the LibriHeavy packaging scripts under scripts/icefall, pushes each
+# model directory to its Hugging Face repository, and uploads the resulting
+# .tar.bz2 archives to the asr-models release of k2-fsa/sherpa-onnx.
+name: export-libriheavy-to-onnx
+
+on:
+  push:
+    branches:
+      - libriheavy-model
+  workflow_dispatch:
+
+concurrency:
+  group: export-libriheavy-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-libriheavy-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export libriheavy
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/icefall
+          ./run-libriheavy.sh
+          ./run-libriheavy-punct-case.sh
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            for m in large medium small; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+
+              src=sherpa-onnx-zipformer-en-libriheavy-20230926-$m
+              echo "Process $src"
+
+              git clone https://huggingface.co/csukuangfj/$src huggingface
+              cd huggingface
+              git fetch
+              git pull
+
+              cp -av ../scripts/icefall/$src/* .
+
+              git lfs track "*.onnx"
+              git add .
+
+              git commit -m "add $m"
+              # Best effort: "|| true" discards the exit status of a failed push.
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src main || true
+
+              cd ..
+
+              # Drop the git metadata so that only model files go into the archive.
+              rm -rf huggingface/.git*
+
+              mv huggingface $src
+
+              tar cjvf $src.tar.bz2 $src
+              rm -rf $src
+              ls -lh
+            done
+
+      - name: Publish to huggingface (case and punct)
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            for m in large medium small; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+
+              src=sherpa-onnx-zipformer-en-libriheavy-20230830-$m-punct-case
+              echo "Process $src"
+
+              git clone https://huggingface.co/csukuangfj/$src huggingface
+              cd huggingface
+              git fetch
+              git pull
+
+              cp -av ../scripts/icefall/$src/* .
+
+              git lfs track "*.onnx"
+              git add .
+
+              git commit -m "add $m"
+              # Best effort: "|| true" discards the exit status of a failed push.
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$src main || true
+
+              cd ..
+
+              # Drop the git metadata so that only model files go into the archive.
+              rm -rf huggingface/.git*
+
+              mv huggingface $src
+
+              tar cjvf $src.tar.bz2 $src
+              rm -rf $src
+              ls -lh
+            done
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: asr-models
+
diff --git a/scripts/apk/generate-asr-2pass-apk-script.py b/scripts/apk/generate-asr-2pass-apk-script.py
index 1b3d7523..9a85f35b 100755
--- a/scripts/apk/generate-asr-2pass-apk-script.py
+++ b/scripts/apk/generate-asr-2pass-apk-script.py
@@ -80,7 +80,7 @@ def get_2nd_models():
 
             rm -fv README.md
             rm -rfv test_wavs
-            rm model.onnx
+            rm -fv model.onnx
 
             ls -lh
 
diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py
index a42d23c2..26726bc9 100755
--- a/scripts/apk/generate-vad-asr-apk-script.py
+++ b/scripts/apk/generate-vad-asr-apk-script.py
@@ -82,7 +82,7 @@ def get_models():
 
             rm -fv README.md
             rm -rfv test_wavs
-            rm model.onnx
+            rm -fv model.onnx
 
             ls -lh
 
@@ -189,7 +189,7 @@ def get_models():
             pushd $model_name
 
             rm -rfv test_wavs
-            rm test.py
+            rm -fv test.py
 
             ls -lh
 
@@ -208,8 +208,8 @@ def get_models():
 
             rm -fv README.md
             rm -fv bpe.model
-            rm encoder-epoch-12-avg-5.onnx
-            rm decoder-epoch-12-avg-5.int8.onnx
-            rm joiner-epoch-12-avg-5.onnx
+            rm -fv encoder-epoch-12-avg-5.onnx
+            rm -fv decoder-epoch-12-avg-5.int8.onnx
+            rm -fv joiner-epoch-12-avg-5.onnx
 
             ls -lh
@@ -229,9 +229,9 @@ def get_models():
 
             rm -fv README.md
             rm -fv bpe.model
-            rm encoder-epoch-99-avg-1.onnx
-            rm decoder-epoch-99-avg-1.int8.onnx
-            rm joiner-epoch-99-avg-1.onnx
+            rm -fv encoder-epoch-99-avg-1.onnx
+            rm -fv decoder-epoch-99-avg-1.int8.onnx
+            rm -fv joiner-epoch-99-avg-1.onnx
 
             ls -lh
 
diff --git a/scripts/icefall/run-libriheavy-punct-case.sh b/scripts/icefall/run-libriheavy-punct-case.sh
new file mode 100755
index 00000000..e7179883
--- /dev/null
+++ b/scripts/icefall/run-libriheavy-punct-case.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+#
+# Clones the LibriHeavy punct-case zipformer model repository, lays out one
+# sherpa-onnx model directory per size (large, medium, small), and runs
+# sherpa-onnx-offline on the test wavs as a smoke test.
+set -ex
+
+cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+sherpa_onnx_dir=$(cd "$cur_dir/../.." && pwd)
+echo "sherpa_onnx_dir: $sherpa_onnx_dir"
+
+pip install sherpa-onnx # for testing
+
+# Clone the model repository from ModelScope into the current directory.
+function download_model() {
+  git lfs install
+  git clone https://www.modelscope.cn/pkufool/icefall-asr-zipformer-libriheavy-punc-20230830.git
+}
+
+# Download the shared test wavs and their transcript into $1/test_wavs.
+# --fail makes curl exit non-zero on an HTTP error, so "set -e" stops the
+# script instead of keeping an error page saved under the wav's name.
+function download_test_wavs() {
+  d=$1
+  mkdir "$d/test_wavs"
+  pushd "$d/test_wavs"
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/0.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/1.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/8k.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/trans.txt
+  popd
+}
+
+# Each export_* function copies bpe.model, tokens.txt and the *.onnx files of
+# one checkpoint into its own directory, then decodes the test wavs twice:
+# once with the plain .onnx files and once with the int8 encoder and joiner.
+function export_large() {
+  echo "----------large----------"
+  src=icefall-asr-zipformer-libriheavy-punc-20230830
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230830-large-punct-case
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_756/bpe.model $dst/
+  cp -v $src/data/lang_bpe_756/tokens.txt $dst/
+  cp -v $src/exp/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-16-avg-2.onnx \
+    --decoder=$dst/decoder-epoch-16-avg-2.onnx \
+    --joiner=$dst/joiner-epoch-16-avg-2.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-16-avg-2.int8.onnx \
+    --decoder=$dst/decoder-epoch-16-avg-2.onnx \
+    --joiner=$dst/joiner-epoch-16-avg-2.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+function export_medium() {
+  echo "----------medium subset----------"
+  src=icefall-asr-zipformer-libriheavy-punc-20230830
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230830-medium-punct-case
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_756/bpe.model $dst/
+  cp -v $src/data/lang_bpe_756/tokens.txt $dst/
+  cp -v $src/exp_medium_subset/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-50-avg-15.onnx \
+    --decoder=$dst/decoder-epoch-50-avg-15.onnx \
+    --joiner=$dst/joiner-epoch-50-avg-15.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-50-avg-15.int8.onnx \
+    --decoder=$dst/decoder-epoch-50-avg-15.onnx \
+    --joiner=$dst/joiner-epoch-50-avg-15.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+function export_small() {
+  echo "----------small subset----------"
+  src=icefall-asr-zipformer-libriheavy-punc-20230830
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230830-small-punct-case
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_756/bpe.model $dst/
+  cp -v $src/data/lang_bpe_756/tokens.txt $dst/
+  cp -v $src/exp_small_subset/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-88-avg-41.onnx \
+    --decoder=$dst/decoder-epoch-88-avg-41.onnx \
+    --joiner=$dst/joiner-epoch-88-avg-41.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-88-avg-41.int8.onnx \
+    --decoder=$dst/decoder-epoch-88-avg-41.onnx \
+    --joiner=$dst/joiner-epoch-88-avg-41.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+download_model
+
+export_large
+export_medium
+export_small
+
+rm -rf icefall-asr-zipformer-libriheavy-punc-20230830
diff --git a/scripts/icefall/run-libriheavy.sh b/scripts/icefall/run-libriheavy.sh
new file mode 100755
index 00000000..7b4032d0
--- /dev/null
+++ b/scripts/icefall/run-libriheavy.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+# Lays out the LibriHeavy zipformer models (large, medium, small) as sherpa-onnx model directories and smoke-tests each one.
+set -ex
+
+cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+sherpa_onnx_dir=$(cd "$cur_dir/../.." && pwd)
+echo "sherpa_onnx_dir: $sherpa_onnx_dir"
+
+pip install sherpa-onnx # for testing
+
+function download_model() {  # clone the model repository from ModelScope
+  git lfs install
+  git clone https://www.modelscope.cn/pkufool/icefall-asr-zipformer-libriheavy-20230926.git
+}
+
+function download_test_wavs() {  # fetch the test wavs and trans.txt into $1/test_wavs
+  d=$1
+  mkdir "$d/test_wavs"
+  pushd "$d/test_wavs"  # --fail below: curl exits non-zero on an HTTP error, so "set -e" aborts
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/0.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/1.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/8k.wav
+  curl --fail -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/trans.txt
+  popd
+}
+
+function export_large() {  # copy model files, fetch test wavs, decode with plain and int8 files
+  echo "----------large----------"
+  src=icefall-asr-zipformer-libriheavy-20230926
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230926-large
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_500/bpe.model $dst/
+  cp -v $src/data/lang_bpe_500/tokens.txt $dst/
+  cp -v $src/exp/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-16-avg-3.onnx \
+    --decoder=$dst/decoder-epoch-16-avg-3.onnx \
+    --joiner=$dst/joiner-epoch-16-avg-3.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-16-avg-3.int8.onnx \
+    --decoder=$dst/decoder-epoch-16-avg-3.onnx \
+    --joiner=$dst/joiner-epoch-16-avg-3.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+function export_medium() {  # same steps as export_large, for exp_medium_subset
+  echo "----------medium subset----------"
+  src=icefall-asr-zipformer-libriheavy-20230926
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230926-medium
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_500/bpe.model $dst/
+  cp -v $src/data/lang_bpe_500/tokens.txt $dst/
+  cp -v $src/exp_medium_subset/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-60-avg-20.onnx \
+    --decoder=$dst/decoder-epoch-60-avg-20.onnx \
+    --joiner=$dst/joiner-epoch-60-avg-20.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-60-avg-20.int8.onnx \
+    --decoder=$dst/decoder-epoch-60-avg-20.onnx \
+    --joiner=$dst/joiner-epoch-60-avg-20.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+function export_small() {  # same steps as export_large, for exp_small_subset
+  echo "----------small subset----------"
+  src=icefall-asr-zipformer-libriheavy-20230926
+  dst=sherpa-onnx-zipformer-en-libriheavy-20230926-small
+  mkdir $dst
+
+  cp -v $src/data/lang_bpe_500/bpe.model $dst/
+  cp -v $src/data/lang_bpe_500/tokens.txt $dst/
+  cp -v $src/exp_small_subset/*.onnx $dst/
+  download_test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-90-avg-20.onnx \
+    --decoder=$dst/decoder-epoch-90-avg-20.onnx \
+    --joiner=$dst/joiner-epoch-90-avg-20.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+
+  sherpa-onnx-offline \
+    --encoder=$dst/encoder-epoch-90-avg-20.int8.onnx \
+    --decoder=$dst/decoder-epoch-90-avg-20.onnx \
+    --joiner=$dst/joiner-epoch-90-avg-20.int8.onnx \
+    --tokens=$dst/tokens.txt \
+    $dst/test_wavs/0.wav \
+    $dst/test_wavs/1.wav \
+    $dst/test_wavs/8k.wav
+}
+
+download_model
+
+export_large
+export_medium
+export_small
+
+rm -rf icefall-asr-zipformer-libriheavy-20230926