Support distil-small.en whisper (#472)

2023-12-08 11:59:20 +08:00
parent 3ae984f148
commit 868c339e5e
7 changed files with 84 additions and 24 deletions
--- a/.github/scripts/test-offline-whisper.sh
+++ b/.github/scripts/test-offline-whisper.sh
@@ -22,6 +22,8 @@ tiny
 base
 small
 medium
 distil-medium.en
 distil-small.en
 )
 for name in ${names[@]}; do
--- a/.github/workflows/export-whisper-to-onnx.yaml
+++ b/.github/workflows/export-whisper-to-onnx.yaml
@@ -15,8 +15,9 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-latest]
+        os: [macos-latest]
-        model: ["distil-medium.en", "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
+        # model: ["distil-medium.en", "distil-small.en",  "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large", "large-v1", "large-v2", "distil-large-v2"]
        model: ["distil-medium.en", "distil-small.en",  "tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium"]
        python-version: ["3.8"]
    steps:
@@ -42,23 +43,33 @@ jobs:
          if [[ $model == distil-medium.en ]]; then
            wget -q -O distil-medium-en-original-model.bin https://huggingface.co/distil-whisper/distil-medium.en/resolve/main/original-model.bin
            ls -lh
          elif [[ $model == distil-large-v2 ]]; then
            wget -q -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin
            ls -lh
          elif [[ $model == distil-small.en ]]; then
            wget -q -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin
            ls -lh
          fi
          python3 ./export-onnx.py --model ${{ matrix.model }}
          # python3 -m onnxruntime.tools.convert_onnx_models_to_ort --optimization_style=Fixed ./
          ls -lh
-          if [[ $model != distil-medium.en ]]; then
+          ls -lh ~/.cache/whisper || true
-            ls -lh ~/.cache/whisper
+          ls -lh distil*original-model.bin || true
-          fi
+          rm -rf ~/.cache/whisper
          rm -f distil*original-model.bin
          src=sherpa-onnx-whisper-${{ matrix.model }}
-          mkdir $src
+          cd ..
-          cp *.onnx $src/
+          mv whisper $src
-          cp *tokens.txt $src
+
          echo "------------------------------"
          cd $src
          du -h -d1 .
          ls -lh
          mkdir -p test_wavs
          cd test_wavs
          wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/0.wav
@@ -66,21 +77,32 @@ jobs:
          wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/8k.wav
          wget -q https://huggingface.co/csukuangfj/sherpa-onnx-whisper-medium.en/resolve/main/test_wavs/trans.txt
          cd ../..
-          mv $src ../..
+          mv $src ../
          echo "pwd: $PWD"
-          cd ../..
+          cd ../
          echo "--------------------"
          ls -lh
          ls -lh $src
          echo "--------------------"
-          tar cjvf ./$src.tar.bz2 $src
+          if [[ $model == large || $model == large-v1 || $model == large-v2 || $model == distil-large-v2 ]]; then
            #tar cvjf - $src | split --bytes=1024MB - $src.tar.bz2.
            tar cvjf $src.tar.bz2 $src
            split -b 1G $src.tar.bz2 $src.tar.bz2.
            rm $src.tar.bz2
            # To reassemble and extract the parts: cat $src.tar.bz2.* | tar xjf -
          else
            tar cvjf $src.tar.bz2 $src
          fi
          ls -lh
      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
-          file: ./*.tar.bz2
+          file: ./*.tar*
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
@@ -99,14 +121,21 @@ jobs:
          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} huggingface
          rm -rf huggingface/*
-          cp -av $src/* ./huggingface/
+          if [[ $model == large || $model == large-v1 || $model == large-v2 || $model == distil-large-v2 ]]; then
            mv $src.tar* ./huggingface
          else
            cp -v $src/*.onnx ./huggingface
            cp -v $src/*tokens* ./huggingface
            cp -av $src/test_wavs ./huggingface
          fi
          cd huggingface
          git status
          ls -lh
-          git lfs track "*.onnx"
+          git lfs track "*gz*"
-          # git lfs track "*.ort"
+          git lfs track "*onnx*"
          git add .
          git commit -m "upload ${{ matrix.model }}"
          git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-whisper-${{ matrix.model }} main
--- a/.github/workflows/test-python-offline-websocket-server.yaml
+++ b/.github/workflows/test-python-offline-websocket-server.yaml
@@ -90,7 +90,7 @@ jobs:
            ./sherpa-onnx-zipformer-en-2023-06-26/test_wavs/8k.wav
      - name: Start server for paraformer models
-        if: matrix.model_type == 'paraformer'
+        if: matrix.model_type == 'paraformer' && matrix.os != 'windows-latest'
        shell: bash
        run: |
          GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-bilingual-zh-en
@@ -106,7 +106,7 @@ jobs:
          sleep 10
      - name: Start client for paraformer models
-        if: matrix.model_type == 'paraformer'
+        if: matrix.model_type == 'paraformer' && matrix.os != 'windows-latest'
        shell: bash
        run: |
          python3 ./python-api-examples/offline-websocket-client-decode-files-paralell.py \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
-set(SHERPA_ONNX_VERSION "1.9.0")
+set(SHERPA_ONNX_VERSION "1.9.1")
 # Disable warning about
 #
--- a/scripts/whisper/export-onnx.py
+++ b/scripts/whisper/export-onnx.py
@@ -44,7 +44,7 @@ def get_args():
            "tiny", "tiny.en", "base", "base.en",
            "small", "small.en", "medium", "medium.en",
            "large", "large-v1", "large-v2",
-            "distil-medium.en",
+            "distil-medium.en", "distil-small.en", "distil-large-v2"
            ],
        # fmt: on
    )
@@ -314,6 +314,32 @@ def main():
            """
            )
        model = whisper.load_model(filename)
    elif name == "distil-large-v2":
        filename = "./distil-large-v2-original-model.bin"
        if not Path(filename).is_file():
            raise ValueError(
                """
                Please go to https://huggingface.co/distil-whisper/distil-large-v2
                to download original-model.bin
                You can use the following command to do that:
                wget -O distil-large-v2-original-model.bin https://huggingface.co/distil-whisper/distil-large-v2/resolve/main/original-model.bin
            """
            )
        model = whisper.load_model(filename)
    elif name == "distil-small.en":
        filename = "./distil-small-en-original-model.bin"
        if not Path(filename).is_file():
            raise ValueError(
                """
                Please go to https://huggingface.co/distil-whisper/distil-small.en
                to download original-model.bin
                You can use the following command to do that:
                wget -O distil-small-en-original-model.bin https://huggingface.co/distil-whisper/distil-small.en/resolve/main/original-model.bin
            """
            )
        model = whisper.load_model(filename)
    else:
        model = whisper.load_model(name)
    print(model.dims)
--- a/scripts/whisper/test.py
+++ b/scripts/whisper/test.py
@@ -209,7 +209,7 @@ class OnnxModel:
        logits = logits.reshape(-1)
        mask = torch.ones(logits.shape[0], dtype=torch.int64)
        mask[self.all_language_tokens] = 0
-        logits[mask] = float("-inf")
+        logits[mask != 0] = float("-inf")
        lang_id = logits.argmax().item()
        print("detected language: ", self.id2lang[lang_id])
        return lang_id
@@ -263,7 +263,9 @@ def compute_features(filename: str) -> torch.Tensor:
    target = 3000
    if mel.shape[0] > target:
-        mel = mel[:target]
+        # Keep only the first (target - 50) frames so that the 50 frames padded below form a zero tail.
        mel = mel[: target - 50]
        mel = torch.nn.functional.pad(mel, (0, 0, 0, 50), "constant", 0)
    # We don't need to pad it to 30 seconds now!
    #  mel = torch.nn.functional.pad(mel, (0, 0, 0, target - mel.shape[0]), "constant", 0)
--- a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
+++ b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h
@@ -106,11 +106,12 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
    std::vector<float> f = s->GetFrames();
    int32_t num_frames = f.size() / feat_dim;
-    if (num_frames > max_num_frames) {
+    // Cap the input at (max_num_frames - 50) frames to leave room for some zero tail padding
    if (num_frames >= max_num_frames - 50) {
      SHERPA_ONNX_LOGE(
          "Only waves less than 30 seconds are supported. We process only the "
          "first 30 seconds and discard the remaining data");
-      num_frames = max_num_frames;
+      num_frames = max_num_frames - 50;
    }
    NormalizeFeatures(f.data(), num_frames, feat_dim);
@@ -140,7 +141,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl {
    Ort::Value mel = Ort::Value::CreateTensor<float>(
        model_->Allocator(), shape.data(), shape.size());
    float *p_mel = mel.GetTensorMutableData<float>();
-    std::copy(f.begin(), f.end(), p_mel);
+    std::copy(f.data(), f.data() + actual_frames * feat_dim, p_mel);
    memset(p_mel + f.size(), 0,
           (actual_frames - num_frames) * feat_dim * sizeof(float));