diff --git a/.github/scripts/test-offline-whisper.sh b/.github/scripts/test-offline-whisper.sh new file mode 100755 index 00000000..75a7ee53 --- /dev/null +++ b/.github/scripts/test-offline-whisper.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +set -e + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +echo "EXE is $EXE" +echo "PATH: $PATH" + +which $EXE + +names=( +tiny.en +base.en +# small.en +# medium.en +) + +for name in ${names[@]}; do + log "------------------------------------------------------------" + log "Run $name" + log "------------------------------------------------------------" + + repo_url=https://huggingface.co/csukuangfj/sherpa-onnx-whisper-$name + log "Start testing ${repo_url}" + repo=$(basename $repo_url) + log "Download pretrained model and test-data from $repo_url" + + GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url + pushd $repo + git lfs pull --include "*.onnx" + git lfs pull --include "*.ort" + ls -lh *.{onnx,ort} + popd + + log "test fp32 onnx" + + time $EXE \ + --tokens=$repo/${name}-tokens.txt \ + --whisper-encoder=$repo/${name}-encoder.onnx \ + --whisper-decoder=$repo/${name}-decoder.onnx \ + --num-threads=2 \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/8k.wav + + log "test int8 onnx" + + time $EXE \ + --tokens=$repo/${name}-tokens.txt \ + --whisper-encoder=$repo/${name}-encoder.int8.onnx \ + --whisper-decoder=$repo/${name}-decoder.int8.onnx \ + --num-threads=2 \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/8k.wav + + log "test fp32 ort" + + time $EXE \ + --tokens=$repo/${name}-tokens.txt \ + --whisper-encoder=$repo/${name}-encoder.ort \ + --whisper-decoder=$repo/${name}-decoder.ort \ + --num-threads=2 \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/8k.wav + + log "test int8 ort" + + time $EXE \ + --tokens=$repo/${name}-tokens.txt \ + --whisper-encoder=$repo/${name}-encoder.int8.ort \ + --whisper-decoder=$repo/${name}-decoder.int8.ort \ + --num-threads=2 \ + $repo/test_wavs/0.wav \ + $repo/test_wavs/1.wav \ + $repo/test_wavs/8k.wav + + rm -rf $repo +done diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 0b8036ea..a03602ba 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -84,6 +84,14 @@ jobs: file build/bin/sherpa-onnx readelf -d build/bin/sherpa-onnx + - name: Test offline Whisper + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-whisper.sh + - name: Test offline CTC shell: bash run: | diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 37b27f93..cebc5ac6 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -82,6 +82,14 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx + - name: Test offline Whisper + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-whisper.sh + - name: Test offline CTC shell: bash run: | diff --git a/.github/workflows/windows-x64-cuda.yaml b/.github/workflows/windows-x64-cuda.yaml index 95e66841..24b8158d 100644 --- a/.github/workflows/windows-x64-cuda.yaml +++ b/.github/workflows/windows-x64-cuda.yaml @@ -74,6 +74,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline Whisper for windows x64 + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline.exe + + .github/scripts/test-offline-whisper.sh + - name: Test offline CTC for windows x64 shell: bash run: | diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml index a9039b0c..83b80de9 100644 --- a/.github/workflows/windows-x64.yaml +++ b/.github/workflows/windows-x64.yaml @@ -75,6 +75,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline Whisper for windows x64 + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline.exe + + .github/scripts/test-offline-whisper.sh + - name: Test offline CTC for windows x64 shell: bash run: | diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml index f23a8224..d181e22c 100644 --- a/.github/workflows/windows-x86.yaml +++ b/.github/workflows/windows-x86.yaml @@ -73,6 +73,14 @@ jobs: ls -lh ./bin/Release/sherpa-onnx.exe + - name: Test offline Whisper for windows x86 + shell: bash + run: | + export PATH=$PWD/build/bin/Release:$PATH + export EXE=sherpa-onnx-offline.exe + + .github/scripts/test-offline-whisper.sh + - name: Test offline CTC for windows x86 shell: bash run: | diff --git a/scripts/whisper/README.md b/scripts/whisper/README.md index eda44148..82f58a5d 100644 --- a/scripts/whisper/README.md +++ b/scripts/whisper/README.md @@ -5,5 +5,9 @@ and use onnxruntime to replace PyTorch for speech recognition. You can use [sherpa-onnx][sherpa-onnx] to run the converted model. +Please see +https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/export-onnx.html +for details. + [whisper]: https://github.com/openai/whisper [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx diff --git a/scripts/whisper/test.py b/scripts/whisper/test.py index d1c422bf..38a5a8a3 100755 --- a/scripts/whisper/test.py +++ b/scripts/whisper/test.py @@ -18,15 +18,30 @@ import argparse def get_args(): parser = argparse.ArgumentParser() parser.add_argument( - "--model", + "--encoder", type=str, required=True, - # fmt: off - choices=[ - "tiny", "tiny.en", "base", "base.en", - "small", "small.en", "medium", "medium.en", - "large", "large-v1", "large-v2"], - # fmt: on + help="Path to the encoder", + ) + + parser.add_argument( + "--decoder", + type=str, + required=True, + help="Path to the decoder", + ) + + parser.add_argument( + "--tokens", + type=str, + required=True, + help="Path to the tokens", + ) + + parser.add_argument( + "sound_file", + type=str, + help="Path to the test wave", ) return parser.parse_args() @@ -161,11 +176,10 @@ def load_tokens(filename): def main(): args = get_args() - name = args.model + encoder = args.encoder + decoder = args.decoder - encoder = f"./{name}-encoder.onnx" - decoder = f"./{name}-decoder.onnx" - audio = whisper.load_audio("0.wav") + audio = whisper.load_audio(args.sound_file) features = [] online_whisper_fbank = knf.OnlineWhisperFbank(knf.FrameExtractionOptions()) @@ -224,17 +238,13 @@ def main(): logits = logits[0, -1] model.suppress_tokens(logits, is_initial=False) max_token_id = logits.argmax(dim=-1) - token_table = load_tokens(f"./{name}-tokens.txt") + token_table = load_tokens(args.tokens) s = b"" for i in results: if i in token_table: s += base64.b64decode(token_table[i]) - else: - print("oov", i) print(s.decode().strip()) - print(results) - print(model.sot_sequence) if __name__ == "__main__":