diff --git a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml index 9cb034ee..138c708a 100644 --- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml +++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc-non-streaming.yaml @@ -31,7 +31,7 @@ jobs: run: | BRANCH='main' pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] - pip install onnxruntime + pip install onnxruntime ipython pip install kaldi-native-fbank pip install soundfile librosa @@ -43,6 +43,43 @@ jobs: mv -v sherpa-onnx-nemo* ../../.. + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + models=( + sherpa-onnx-nemo-fast-conformer-ctc-en-24500 + sherpa-onnx-nemo-fast-conformer-ctc-es-1424 + sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288 + sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k + sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000 + ) + + for m in ${models[@]}; do + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + git clone https://huggingface.co/csukuangfj/$m huggingface + cp -av $m/* huggingface + cd huggingface + git lfs track "*.onnx" + git status + git add . + git status + git commit -m "first commit" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main + cd .. + rm -rf huggingface + done + - name: Compress files shell: bash run: | @@ -51,6 +88,7 @@ jobs: sherpa-onnx-nemo-fast-conformer-ctc-es-1424 sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288 sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k + sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000 ) for d in ${dirs[@]}; do tar cjvf ${d}.tar.bz2 ./$d @@ -65,3 +103,5 @@ jobs: repo_name: k2-fsa/sherpa-onnx repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} tag: asr-models + + diff --git a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml index 4b04f554..8c40a558 100644 --- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml +++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-ctc.yaml @@ -31,7 +31,7 @@ jobs: run: | BRANCH='main' pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] - pip install onnxruntime + pip install onnxruntime ipython pip install kaldi-native-fbank pip install soundfile librosa diff --git a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml index 9874cfe4..7a7b7fc4 100644 --- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml +++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer-non-streaming.yaml @@ -31,7 +31,7 @@ jobs: run: | BRANCH='main' pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] - pip install onnxruntime + pip install onnxruntime ipython pip install kaldi-native-fbank pip install soundfile librosa @@ -43,6 +43,42 @@ jobs: mv -v sherpa-onnx-nemo* ../../.. + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + models=( + sherpa-onnx-nemo-fast-conformer-transducer-en-24500 + sherpa-onnx-nemo-fast-conformer-transducer-es-1424 + sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288 + sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k + sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000 + ) + + for m in ${models[@]}; do + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + git clone https://huggingface.co/csukuangfj/$m huggingface + cp -av $m/* huggingface + cd huggingface + git lfs track "*.onnx" + git status + git add . + git status + git commit -m "first commit" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main + cd .. + done + - name: Compress files shell: bash run: | @@ -51,6 +87,7 @@ jobs: sherpa-onnx-nemo-fast-conformer-transducer-es-1424 sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288 sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k + sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000 ) for d in ${dirs[@]}; do tar cjvf ${d}.tar.bz2 ./$d @@ -65,3 +102,5 @@ jobs: repo_name: k2-fsa/sherpa-onnx repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} tag: asr-models + + diff --git a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml index d3744049..477de451 100644 --- a/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml +++ b/.github/workflows/export-nemo-fast-conformer-hybrid-transducer-transducer.yaml @@ -31,7 +31,7 @@ jobs: run: | BRANCH='main' pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] - pip install onnxruntime + pip install onnxruntime ipython pip install kaldi-native-fbank pip install soundfile librosa diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index adc4c29a..a9b2db58 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -139,7 +139,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH which sherpa-onnx sherpa-onnx --help diff --git a/.github/workflows/test-pip-install.yaml b/.github/workflows/test-pip-install.yaml index 9ae727b2..0f73e364 100644 --- a/.github/workflows/test-pip-install.yaml +++ b/.github/workflows/test-pip-install.yaml @@ -104,7 +104,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH sherpa-onnx --help sherpa-onnx-keyword-spotter --help diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md index 6deccc7a..d156d7e3 100644 --- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md +++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/README.md @@ -22,4 +22,6 @@ This folder contains scripts for exporting models from - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc + - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m + to `sherpa-onnx`. diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh index d61e2c7b..335f6449 100755 --- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh +++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-ctc-non-streaming.sh @@ -9,6 +9,19 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } +# 36000 hours of English data +url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m +name=$(basename $url) +doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams." + +log "Process $name at $url" +./export-onnx-ctc-non-streaming.py --model $name --doc "$doc" +d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000 +mkdir -p $d +mv -v model.onnx $d/ +mv -v tokens.txt $d/ +ls -lh $d + # 8500 hours of English speech url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc name=$(basename $url) @@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2 rm spoken-language-identification-test-wavs.tar.bz2 data=spoken-language-identification-test-wavs +curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav +mv 2086-149220-0033.wav en.wav + +d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000 +python3 ./test-onnx-ctc-non-streaming.py \ + --model $d/model.onnx \ + --tokens $d/tokens.txt \ + --wav $data/en-english.wav +mkdir -p $d/test_wavs + +cp en.wav $d/test_wavs/0.wav +cp -v $data/en-english.wav $d/test_wavs/1.wav + d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500 python3 ./test-onnx-ctc-non-streaming.py \ --model $d/model.onnx \ --tokens $d/tokens.txt \ --wav $data/en-english.wav mkdir -p $d/test_wavs +cp en.wav $d/test_wavs/0.wav cp -v $data/en-english.wav $d/test_wavs d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424 diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh index 7f6a6d45..059d97ce 100755 --- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh +++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/run-transducer-non-streaming.sh @@ -9,6 +9,19 @@ log() { echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } +# 36000 hours of English data +url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m +name=$(basename $url) +doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams." + +log "Process $name at $url" +./export-onnx-transducer-non-streaming.py --model $name --doc "$doc" +d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000 +mkdir -p $d +mv -v *.onnx $d/ +mv -v tokens.txt $d/ +ls -lh $d + # 8500 hours of English speech url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc name=$(basename $url) @@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2 rm spoken-language-identification-test-wavs.tar.bz2 data=spoken-language-identification-test-wavs +curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav +mv 2086-149220-0033.wav en.wav + +d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000 +python3 ./test-onnx-transducer-non-streaming.py \ + --encoder $d/encoder.onnx \ + --decoder $d/decoder.onnx \ + --joiner $d/joiner.onnx \ + --tokens $d/tokens.txt \ + --wav $data/en-english.wav + +python3 ./test-onnx-transducer-non-streaming.py \ + --encoder $d/encoder.onnx \ + --decoder $d/decoder.onnx \ + --joiner $d/joiner.onnx \ + --tokens $d/tokens.txt \ + --wav ./en.wav + +mkdir -p $d/test_wavs +cp en.wav $d/test_wavs/0.wav +cp -v $data/en-english.wav $d/test_wavs + d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500 python3 ./test-onnx-transducer-non-streaming.py \ --encoder $d/encoder.onnx \ @@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \ --tokens $d/tokens.txt \ --wav $data/en-english.wav mkdir -p $d/test_wavs +cp en.wav $d/test_wavs/0.wav cp -v $data/en-english.wav $d/test_wavs d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424 diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py index 89654394..9eb91c88 100755 --- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py +++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-ctc-non-streaming.py @@ -141,7 +141,7 @@ def main(): assert model.normalize_type == "per_feature", model.normalize_type features = torch.from_numpy(features) mean = features.mean(dim=1, keepdims=True) - stddev = features.std(dim=1, keepdims=True) + stddev = features.std(dim=1, keepdims=True) + 1e-5 features = (features - mean) / stddev features = features.numpy() diff --git a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py index 34140293..b0d23b5a 100755 --- a/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py +++ b/scripts/nemo/fast-conformer-hybrid-transducer-ctc/test-onnx-transducer-non-streaming.py @@ -268,7 +268,7 @@ def main(): assert model.normalize_type == "per_feature", model.normalize_type features = torch.from_numpy(features) mean = features.mean(dim=1, keepdims=True) - stddev = features.std(dim=1, keepdims=True) + stddev = features.std(dim=1, keepdims=True) + 1e-5 features = (features - mean) / stddev features = features.numpy() print(audio.shape) diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 05c1b798..7bbe6938 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { config_.feat_config.is_mfcc = true; } + if (!config_.model_config.nemo_ctc.model.empty()) { + config_.feat_config.low_freq = 0; + config_.feat_config.high_freq = 0; + config_.feat_config.is_librosa = true; + config_.feat_config.remove_dc_offset = false; + config_.feat_config.window_type = "hann"; + } + if (!config_.model_config.wenet_ctc.model.empty()) { // WeNet CTC models assume input samples are in the range // [-32768, 32767], so we set normalize_samples to false