diff --git a/.github/workflows/export-moonshine-to-onnx.yaml b/.github/workflows/export-moonshine-to-onnx.yaml
new file mode 100644
index 00000000..2e73c2e0
--- /dev/null
+++ b/.github/workflows/export-moonshine-to-onnx.yaml
@@ -0,0 +1,106 @@
+name: export-moonshine-to-onnx
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: export-moonshine-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-moonshine-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export moonshine models to ONNX
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Python dependencies
+        shell: bash
+        run: |
+          pip install -q onnx onnxruntime librosa tokenizers soundfile
+
+      - name: Run
+        shell: bash
+        run: |
+          pushd scripts/moonshine
+          ./run.sh
+          popd
+
+          mv -v scripts/moonshine/*.tar.bz2 .
+          mv -v scripts/moonshine/sherpa-onnx-* ./
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: asr-models
+
+      - name: Publish to huggingface (tiny)
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            d=sherpa-onnx-moonshine-tiny-en-int8
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
+            mv -v $d/* ./huggingface
+            cd huggingface
+            git lfs track "*.onnx"
+            git lfs track "*.wav"
+            git status
+            git add .
+            git status
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
+            rm -rf huggingface
+
+      - name: Publish to huggingface (base)
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            d=sherpa-onnx-moonshine-base-en-int8
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d huggingface
+            mv -v $d/* ./huggingface
+            cd huggingface
+            git lfs track "*.onnx"
+            git lfs track "*.wav"
+            git status
+            git add .
+            git status
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$d main
+            rm -rf huggingface
diff --git a/scripts/moonshine/.gitignore b/scripts/moonshine/.gitignore
new file mode 100644
index 00000000..c78219ab
--- /dev/null
+++ b/scripts/moonshine/.gitignore
@@ -0,0 +1 @@
+tokenizer.json
diff --git a/scripts/moonshine/README.md b/scripts/moonshine/README.md
new file mode 100644
index 00000000..b9c5e37f
--- /dev/null
+++ b/scripts/moonshine/README.md
@@ -0,0 +1,7 @@
+# Introduction
+
+This directory contains models from
+https://github.com/usefulsensors/moonshine
+
+See its license at
+https://github.com/usefulsensors/moonshine/blob/main/LICENSE
diff --git a/scripts/moonshine/export-onnx.py b/scripts/moonshine/export-onnx.py
new file mode 100755
index 00000000..6b0a3472
--- /dev/null
+++ b/scripts/moonshine/export-onnx.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
+
+from pathlib import Path
+
+import tokenizers
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def generate_tokens():
+    if Path("./tokens.txt").is_file():
+        return
+    print("Generating tokens.txt")
+    tokenizer = tokenizers.Tokenizer.from_file("./tokenizer.json")
+    vocab_size = tokenizer.get_vocab_size()
+    with open("tokens.txt", "w", encoding="utf-8") as f:
+        for i in range(vocab_size):
+            s = tokenizer.id_to_token(i).strip()
+            f.write(f"{s}\t{i}\n")
+
+
+def main():
+    generate_tokens()
+
+    # Note(fangjun): Don't use int8 for the preprocessor since it has
+    # a larger impact on the accuracy
+    for f in ["uncached_decode", "cached_decode", "encode"]:
+        if Path(f"{f}.int8.onnx").is_file():
+            continue
+
+        print("processing", f)
+        quantize_dynamic(
+            model_input=f"{f}.onnx",
+            model_output=f"{f}.int8.onnx",
+            weight_type=QuantType.QInt8,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/moonshine/run.sh b/scripts/moonshine/run.sh
new file mode 100755
index 00000000..0ad13966
--- /dev/null
+++ b/scripts/moonshine/run.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Copyright 2024 Xiaomi Corp.
(authors: Fangjun Kuang) +set -ex + +cat >LICENSE <"] + eos = token2id[""] + + tokens = [sos] + + encoder_out = model.run_encode(features) + print("encoder_out.shape", encoder_out.shape) # (1, 413, 288) + + logits, states = model.run_uncached_decode( + token=tokens[-1], + token_len=len(tokens), + encoder_out=encoder_out, + ) + + print("logits.shape", logits.shape) # (1, 1, 32768) + print("len(states)", len(states)) # 24 + + max_len = int((audio.shape[-1] / 16000) * 6) + + for i in range(max_len): + token = logits.squeeze().argmax() + if token == eos: + break + tokens.append(token) + + logits, states = model.run_cached_decode( + token=tokens[-1], + token_len=len(tokens), + encoder_out=encoder_out, + states=states, + ) + + tokens = tokens[1:] # remove sos + words = [id2token[i] for i in tokens] + underline = "▁" + # underline = b"\xe2\x96\x81".decode() + text = "".join(words).replace(underline, " ").strip() + + end_t = dt.datetime.now() + t = (end_t - start_t).total_seconds() + rtf = t * 16000 / audio.shape[-1] + + print(text) + print("RTF:", rtf) + + +if __name__ == "__main__": + main()