Support Parakeet models from NeMo (#1381)

2024-09-27 17:12:00 +08:00
parent 12d04ce8ed
commit 11f0cb7e1c
12 changed files with 160 additions and 8 deletions
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml
@@ -31,7 +31,7 @@ jobs:
        run: |
          BRANCH='main'
          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
          pip install kaldi-native-fbank
          pip install soundfile librosa
@@ -43,6 +43,43 @@ jobs:
          mv -v sherpa-onnx-nemo* ../../..
      # Publish every exported CTC model directory to its Hugging Face repo
      # (https://huggingface.co/csukuangfj/<model>).
      #
      # For each entry in `models` the command clones the repo into ./huggingface,
      # copies the exported files from ./<model> into it, tracks *.onnx with
      # git-lfs, commits, and pushes to `main`. The scratch clone is removed both
      # before and after each iteration.
      - name: Publish to huggingface
        env:
          # Embedded in the push URL used by the command below.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # The whole command is run through the retry action configured below.
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          # GIT_LFS_SKIP_SMUDGE=1 is exported before each clone — presumably so
          # that existing LFS objects are not downloaded; confirm against the
          # git-lfs documentation.
          # NOTE(review): `git commit` is run unconditionally; verify how the step
          # behaves when a model directory has no changes to commit.
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"
            models=(
              sherpa-onnx-nemo-fast-conformer-ctc-en-24500
              sherpa-onnx-nemo-fast-conformer-ctc-es-1424
              sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
              sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
              sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
            )
            for m in ${models[@]}; do
              rm -rf huggingface
              export GIT_LFS_SKIP_SMUDGE=1
              export GIT_CLONE_PROTECTION_ACTIVE=false
              git clone https://huggingface.co/csukuangfj/$m huggingface
              cp -av $m/* huggingface
              cd huggingface
              git lfs track "*.onnx"
              git status
              git add .
              git status
              git commit -m "first commit"
              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
              cd ..
              rm -rf huggingface
            done
      - name: Compress files
        shell: bash
        run: |
@@ -51,6 +88,7 @@ jobs:
            sherpa-onnx-nemo-fast-conformer-ctc-es-1424
            sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
            sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
            sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
          )
          for d in ${dirs[@]}; do
            tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +103,5 @@ jobs:
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml
@@ -31,7 +31,7 @@ jobs:
        run: |
          BRANCH='main'
          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
          pip install kaldi-native-fbank
          pip install soundfile librosa
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml
@@ -31,7 +31,7 @@ jobs:
        run: |
          BRANCH='main'
          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
          pip install kaldi-native-fbank
          pip install soundfile librosa
@@ -43,6 +43,42 @@ jobs:
          mv -v sherpa-onnx-nemo* ../../..
      # Publish every exported transducer model directory to its Hugging Face
      # repo (https://huggingface.co/csukuangfj/<model>).
      #
      # For each entry in `models` the command clones the repo into ./huggingface,
      # copies the exported files from ./<model> into it, tracks *.onnx with
      # git-lfs, commits, and pushes to `main`. The scratch clone is removed both
      # before and after each iteration, so nothing is left in the workspace once
      # the loop finishes.
      - name: Publish to huggingface
        env:
          # Embedded in the push URL used by the command below.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # The whole command is run through the retry action configured below.
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          # GIT_LFS_SKIP_SMUDGE=1 is exported before each clone — presumably so
          # that existing LFS objects are not downloaded; confirm against the
          # git-lfs documentation.
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"
            models=(
              sherpa-onnx-nemo-fast-conformer-transducer-en-24500
              sherpa-onnx-nemo-fast-conformer-transducer-es-1424
              sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
              sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
              sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
            )
            for m in ${models[@]}; do
              rm -rf huggingface
              export GIT_LFS_SKIP_SMUDGE=1
              export GIT_CLONE_PROTECTION_ACTIVE=false
              git clone https://huggingface.co/csukuangfj/$m huggingface
              cp -av $m/* huggingface
              cd huggingface
              git lfs track "*.onnx"
              git status
              git add .
              git status
              git commit -m "first commit"
              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
              cd ..
              rm -rf huggingface
            done
      - name: Compress files
        shell: bash
        run: |
@@ -51,6 +87,7 @@ jobs:
            sherpa-onnx-nemo-fast-conformer-transducer-es-1424
            sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
            sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
            sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
          )
          for d in ${dirs[@]}; do
            tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +102,5 @@ jobs:
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models
--- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml
+++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml
@@ -31,7 +31,7 @@ jobs:
        run: |
          BRANCH='main'
          pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
-          pip install onnxruntime
+          pip install onnxruntime ipython
          pip install kaldi-native-fbank
          pip install soundfile librosa
--- a/.github/workflows/test-build-wheel.yaml
+++ b/.github/workflows/test-build-wheel.yaml
@@ -139,7 +139,7 @@ jobs:
          export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
          export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
          export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-          export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+          export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
          which sherpa-onnx
          sherpa-onnx --help
--- a/.github/workflows/test-pip-install.yaml
+++ b/.github/workflows/test-pip-install.yaml
@@ -104,7 +104,7 @@ jobs:
          export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
          export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
          export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
-          export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
+          export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
          sherpa-onnx --help
          sherpa-onnx-keyword-spotter --help
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md
@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
  - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
  - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
  - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
 to `sherpa-onnx`.
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh
@@ -9,6 +9,19 @@ log() {
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # 36000 hours of English data
 #
 # Export the Parakeet model with ./export-onnx-ctc-non-streaming.py and
 # collect the results in the release directory $d.
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
 # The last path component of the catalog URL is what gets passed as --model.
 name=$(basename $url)
 # Description forwarded verbatim to the export script through --doc.
 doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
 log "Process $name at $url"
 ./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
 d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
 mkdir -p $d
 # model.onnx and tokens.txt are taken from the current directory
 # (presumably written there by the export script above).
 mv -v model.onnx $d/
 mv -v tokens.txt $d/
 ls -lh $d
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
 curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
 mv 2086-149220-0033.wav en.wav
 # Run the exported Parakeet CTC model on $data/en-english.wav, then bundle
 # sample wavs with the model under $d/test_wavs.
 d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
 python3 ./test-onnx-ctc-non-streaming.py \
  --model $d/model.onnx \
  --tokens $d/tokens.txt \
  --wav $data/en-english.wav
 mkdir -p $d/test_wavs
 cp en.wav $d/test_wavs/0.wav
 # NOTE(review): en-english.wav is renamed to 1.wav here, whereas the
 # fast-conformer block that follows copies it into test_wavs under its
 # original file name — confirm which naming is intended.
 cp -v $data/en-english.wav $d/test_wavs/1.wav
 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
 python3 ./test-onnx-ctc-non-streaming.py \
  --model $d/model.onnx \
  --tokens $d/tokens.txt \
  --wav $data/en-english.wav
 mkdir -p $d/test_wavs
 cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh
@@ -9,6 +9,19 @@ log() {
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # 36000 hours of English data
 #
 # Export the Parakeet model with ./export-onnx-transducer-non-streaming.py
 # and collect the results in the release directory $d.
 # NOTE(review): the catalog entry is named parakeet-tdt_ctc-110m; presumably
 # it is a hybrid model from which this script exports the transducer branch
 # — confirm.
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
 # The last path component of the catalog URL is what gets passed as --model.
 name=$(basename $url)
 # Description forwarded verbatim to the export script through --doc.
 doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
 log "Process $name at $url"
 ./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
 d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
 mkdir -p $d
 # The glob moves every *.onnx file found in the current directory, so any
 # stale .onnx file left there would be swept into $d as well.
 mv -v *.onnx $d/
 mv -v tokens.txt $d/
 ls -lh $d
 # 8500 hours of English speech
 url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
 name=$(basename $url)
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
 rm spoken-language-identification-test-wavs.tar.bz2
 data=spoken-language-identification-test-wavs
 curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
 mv 2086-149220-0033.wav en.wav
 # Run the exported Parakeet transducer (encoder/decoder/joiner) on two
 # inputs, $data/en-english.wav and ./en.wav, then bundle both wavs with the
 # model under $d/test_wavs.
 d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
 python3 ./test-onnx-transducer-non-streaming.py \
  --encoder $d/encoder.onnx \
  --decoder $d/decoder.onnx \
  --joiner $d/joiner.onnx \
  --tokens $d/tokens.txt \
  --wav $data/en-english.wav
 # Same model, second input: the wav downloaded and renamed to en.wav above.
 python3 ./test-onnx-transducer-non-streaming.py \
  --encoder $d/encoder.onnx \
  --decoder $d/decoder.onnx \
  --joiner $d/joiner.onnx \
  --tokens $d/tokens.txt \
  --wav ./en.wav
 mkdir -p $d/test_wavs
 cp en.wav $d/test_wavs/0.wav
 # en-english.wav keeps its original file name inside test_wavs.
 cp -v $data/en-english.wav $d/test_wavs
 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
 python3 ./test-onnx-transducer-non-streaming.py \
  --encoder $d/encoder.onnx \
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
  --tokens $d/tokens.txt \
  --wav $data/en-english.wav
 mkdir -p $d/test_wavs
 cp en.wav $d/test_wavs/0.wav
 cp -v $data/en-english.wav $d/test_wavs
 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py
@@ -141,7 +141,7 @@ def main():
        assert model.normalize_type == "per_feature", model.normalize_type
        features = torch.from_numpy(features)
        mean = features.mean(dim=1, keepdims=True)
-        stddev = features.std(dim=1, keepdims=True)
+        stddev = features.std(dim=1, keepdims=True) + 1e-5
        features = (features - mean) / stddev
        features = features.numpy()
--- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py
+++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py
@@ -268,7 +268,7 @@ def main():
        assert model.normalize_type == "per_feature", model.normalize_type
        features = torch.from_numpy(features)
        mean = features.mean(dim=1, keepdims=True)
-        stddev = features.std(dim=1, keepdims=True)
+        stddev = features.std(dim=1, keepdims=True) + 1e-5
        features = (features - mean) / stddev
        features = features.numpy()
    print(audio.shape)
--- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
+++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h
@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
      config_.feat_config.is_mfcc = true;
    }
    if (!config_.model_config.nemo_ctc.model.empty()) {
      config_.feat_config.low_freq = 0;
      config_.feat_config.high_freq = 0;
      config_.feat_config.is_librosa = true;
      config_.feat_config.remove_dc_offset = false;
      config_.feat_config.window_type = "hann";
    }
    if (!config_.model_config.wenet_ctc.model.empty()) {
      // WeNet CTC models assume input samples are in the range
      // [-32768, 32767], so we set normalize_samples to false