Support Parakeet models from NeMo (#1381)

This commit is contained in:
Fangjun Kuang
2024-09-27 17:12:00 +08:00
committed by GitHub
parent 12d04ce8ed
commit 11f0cb7e1c
12 changed files with 160 additions and 8 deletions

View File

@@ -31,7 +31,7 @@ jobs:
run: | run: |
BRANCH='main' BRANCH='main'
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
pip install onnxruntime pip install onnxruntime ipython
pip install kaldi-native-fbank pip install kaldi-native-fbank
pip install soundfile librosa pip install soundfile librosa
@@ -43,6 +43,43 @@ jobs:
mv -v sherpa-onnx-nemo* ../../.. mv -v sherpa-onnx-nemo* ../../..
# Workflow step: upload each local CTC model directory to the Hugging Face
# repository of the same name under the csukuangfj account.
# The script runs through nick-fields/retry (up to 20 attempts, 200-second
# timeout) and authenticates with the HF_TOKEN secret.
- name: Publish to huggingface
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
# Identity used for the commits created below.
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
# Local model directories; each name is also the Hugging Face repo name.
models=(
sherpa-onnx-nemo-fast-conformer-ctc-en-24500
sherpa-onnx-nemo-fast-conformer-ctc-es-1424
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
)
for m in ${models[@]}; do
# Always start from a fresh clone directory.
rm -rf huggingface
# git-lfs setting: do not download existing LFS objects while cloning.
export GIT_LFS_SKIP_SMUDGE=1
# NOTE(review): presumably turns off Git's clone protection so the git-lfs
# hooks may run during clone - confirm.
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://huggingface.co/csukuangfj/$m huggingface
# Overlay the local model files onto the clone.
cp -av $m/* huggingface
cd huggingface
# Store ONNX files through Git LFS.
git lfs track "*.onnx"
git status
git add .
git status
git commit -m "first commit"
# Push to the main branch; the token is embedded in the remote URL.
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
cd ..
# Remove the clone so it is not left next to the model directories.
rm -rf huggingface
done
- name: Compress files - name: Compress files
shell: bash shell: bash
run: | run: |
@@ -51,6 +88,7 @@ jobs:
sherpa-onnx-nemo-fast-conformer-ctc-es-1424 sherpa-onnx-nemo-fast-conformer-ctc-es-1424
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288 sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
) )
for d in ${dirs[@]}; do for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +103,5 @@ jobs:
repo_name: k2-fsa/sherpa-onnx repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: asr-models tag: asr-models

View File

@@ -31,7 +31,7 @@ jobs:
run: | run: |
BRANCH='main' BRANCH='main'
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
pip install onnxruntime pip install onnxruntime ipython
pip install kaldi-native-fbank pip install kaldi-native-fbank
pip install soundfile librosa pip install soundfile librosa

View File

@@ -31,7 +31,7 @@ jobs:
run: | run: |
BRANCH='main' BRANCH='main'
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
pip install onnxruntime pip install onnxruntime ipython
pip install kaldi-native-fbank pip install kaldi-native-fbank
pip install soundfile librosa pip install soundfile librosa
@@ -43,6 +43,42 @@ jobs:
mv -v sherpa-onnx-nemo* ../../.. mv -v sherpa-onnx-nemo* ../../..
# Workflow step: upload each local transducer model directory to the Hugging
# Face repository of the same name under the csukuangfj account.
# The script runs through nick-fields/retry (up to 20 attempts, 200-second
# timeout) and authenticates with the HF_TOKEN secret.
- name: Publish to huggingface
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
# Identity used for the commits created below.
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
# Local model directories; each name is also the Hugging Face repo name.
models=(
sherpa-onnx-nemo-fast-conformer-transducer-en-24500
sherpa-onnx-nemo-fast-conformer-transducer-es-1424
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
)
for m in ${models[@]}; do
# Always start from a fresh clone directory; this also clears the clone
# left behind by the previous iteration.
rm -rf huggingface
# git-lfs setting: do not download existing LFS objects while cloning.
export GIT_LFS_SKIP_SMUDGE=1
# NOTE(review): presumably turns off Git's clone protection so the git-lfs
# hooks may run during clone - confirm.
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://huggingface.co/csukuangfj/$m huggingface
# Overlay the local model files onto the clone.
cp -av $m/* huggingface
cd huggingface
# Store ONNX files through Git LFS.
git lfs track "*.onnx"
git status
git add .
git status
git commit -m "first commit"
# Push to the main branch; the token is embedded in the remote URL.
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
cd ..
done
- name: Compress files - name: Compress files
shell: bash shell: bash
run: | run: |
@@ -51,6 +87,7 @@ jobs:
sherpa-onnx-nemo-fast-conformer-transducer-es-1424 sherpa-onnx-nemo-fast-conformer-transducer-es-1424
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288 sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
) )
for d in ${dirs[@]}; do for d in ${dirs[@]}; do
tar cjvf ${d}.tar.bz2 ./$d tar cjvf ${d}.tar.bz2 ./$d
@@ -65,3 +102,5 @@ jobs:
repo_name: k2-fsa/sherpa-onnx repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: asr-models tag: asr-models

View File

@@ -31,7 +31,7 @@ jobs:
run: | run: |
BRANCH='main' BRANCH='main'
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr] pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
pip install onnxruntime pip install onnxruntime ipython
pip install kaldi-native-fbank pip install kaldi-native-fbank
pip install soundfile librosa pip install soundfile librosa

View File

@@ -139,7 +139,7 @@ jobs:
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
which sherpa-onnx which sherpa-onnx
sherpa-onnx --help sherpa-onnx --help

View File

@@ -104,7 +104,7 @@ jobs:
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
sherpa-onnx --help sherpa-onnx --help
sherpa-onnx-keyword-spotter --help sherpa-onnx-keyword-spotter --help

View File

@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc - https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
to `sherpa-onnx`. to `sherpa-onnx`.

View File

@@ -9,6 +9,19 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
} }
# 36000 hours of English data
# Export the parakeet-tdt_ctc-110m model with the CTC export script and gather
# the resulting files in a dedicated model directory.
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
# The last path component of the URL doubles as the model name.
name=$(basename $url)
# Description string handed to the export script through --doc.
doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
log "Process $name at $url"
# The mv commands below expect the export script to leave model.onnx and
# tokens.txt in the current directory.
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
mkdir -p $d
mv -v model.onnx $d/
mv -v tokens.txt $d/
# Show what ended up in the model directory, with sizes.
ls -lh $d
# 8500 hours of English speech # 8500 hours of English speech
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
name=$(basename $url) name=$(basename $url)
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
rm spoken-language-identification-test-wavs.tar.bz2 rm spoken-language-identification-test-wavs.tar.bz2
data=spoken-language-identification-test-wavs data=spoken-language-identification-test-wavs
# Fetch one extra English recording and store it as en.wav.
# curl flags: -S show errors, -L follow redirects, -O keep the remote file name.
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
mv 2086-149220-0033.wav en.wav
# Run the exported parakeet CTC model on the English sample from $data.
# en.wav is not decoded in this section; it is only bundled as a test wav below.
d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.onnx \
--tokens $d/tokens.txt \
--wav $data/en-english.wav
# Ship two sample recordings with the model, named 0.wav and 1.wav.
mkdir -p $d/test_wavs
cp en.wav $d/test_wavs/0.wav
cp -v $data/en-english.wav $d/test_wavs/1.wav
d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500 d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
python3 ./test-onnx-ctc-non-streaming.py \ python3 ./test-onnx-ctc-non-streaming.py \
--model $d/model.onnx \ --model $d/model.onnx \
--tokens $d/tokens.txt \ --tokens $d/tokens.txt \
--wav $data/en-english.wav --wav $data/en-english.wav
mkdir -p $d/test_wavs mkdir -p $d/test_wavs
cp en.wav $d/test_wavs/0.wav
cp -v $data/en-english.wav $d/test_wavs cp -v $data/en-english.wav $d/test_wavs
d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424 d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424

View File

@@ -9,6 +9,19 @@ log() {
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
} }
# 36000 hours of English data
# Export the parakeet-tdt_ctc-110m model with the transducer export script and
# gather the resulting files in a dedicated model directory.
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
# The last path component of the URL doubles as the model name.
name=$(basename $url)
# Description string handed to the export script through --doc.
doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
log "Process $name at $url"
# The mv commands below expect the export script to leave its *.onnx files and
# tokens.txt in the current directory.
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
mkdir -p $d
# Moves every .onnx file found in the current directory, so nothing from an
# earlier export may be lying around at this point.
mv -v *.onnx $d/
mv -v tokens.txt $d/
# Show what ended up in the model directory, with sizes.
ls -lh $d
# 8500 hours of English speech # 8500 hours of English speech
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
name=$(basename $url) name=$(basename $url)
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
rm spoken-language-identification-test-wavs.tar.bz2 rm spoken-language-identification-test-wavs.tar.bz2
data=spoken-language-identification-test-wavs data=spoken-language-identification-test-wavs
# Fetch one extra English recording and store it as en.wav.
# curl flags: -S show errors, -L follow redirects, -O keep the remote file name.
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
mv 2086-149220-0033.wav en.wav
d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
# First run: decode the English sample from $data with the exported
# encoder/decoder/joiner.
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.onnx \
--decoder $d/decoder.onnx \
--joiner $d/joiner.onnx \
--tokens $d/tokens.txt \
--wav $data/en-english.wav
# Second run: same model files, this time on the freshly downloaded en.wav.
python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.onnx \
--decoder $d/decoder.onnx \
--joiner $d/joiner.onnx \
--tokens $d/tokens.txt \
--wav ./en.wav
# Ship both recordings with the model: en.wav as 0.wav, and en-english.wav
# under its original file name.
mkdir -p $d/test_wavs
cp en.wav $d/test_wavs/0.wav
cp -v $data/en-english.wav $d/test_wavs
d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500 d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
python3 ./test-onnx-transducer-non-streaming.py \ python3 ./test-onnx-transducer-non-streaming.py \
--encoder $d/encoder.onnx \ --encoder $d/encoder.onnx \
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
--tokens $d/tokens.txt \ --tokens $d/tokens.txt \
--wav $data/en-english.wav --wav $data/en-english.wav
mkdir -p $d/test_wavs mkdir -p $d/test_wavs
cp en.wav $d/test_wavs/0.wav
cp -v $data/en-english.wav $d/test_wavs cp -v $data/en-english.wav $d/test_wavs
d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424 d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424

View File

@@ -141,7 +141,7 @@ def main():
assert model.normalize_type == "per_feature", model.normalize_type assert model.normalize_type == "per_feature", model.normalize_type
features = torch.from_numpy(features) features = torch.from_numpy(features)
mean = features.mean(dim=1, keepdims=True) mean = features.mean(dim=1, keepdims=True)
stddev = features.std(dim=1, keepdims=True) stddev = features.std(dim=1, keepdims=True) + 1e-5
features = (features - mean) / stddev features = (features - mean) / stddev
features = features.numpy() features = features.numpy()

View File

@@ -268,7 +268,7 @@ def main():
assert model.normalize_type == "per_feature", model.normalize_type assert model.normalize_type == "per_feature", model.normalize_type
features = torch.from_numpy(features) features = torch.from_numpy(features)
mean = features.mean(dim=1, keepdims=True) mean = features.mean(dim=1, keepdims=True)
stddev = features.std(dim=1, keepdims=True) stddev = features.std(dim=1, keepdims=True) + 1e-5
features = (features - mean) / stddev features = (features - mean) / stddev
features = features.numpy() features = features.numpy()
print(audio.shape) print(audio.shape)

View File

@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
config_.feat_config.is_mfcc = true; config_.feat_config.is_mfcc = true;
} }
// When a NeMo CTC model is configured (its model path is non-empty), override
// the feature-extraction settings before they are used.
// NOTE(review): presumably these values mirror NeMo's librosa-based
// preprocessing - confirm against the export scripts.
if (!config_.model_config.nemo_ctc.model.empty()) {
// Both frequency bounds are set to 0.
// NOTE(review): 0 presumably selects the extractor's default range - confirm.
config_.feat_config.low_freq = 0;
config_.feat_config.high_freq = 0;
config_.feat_config.is_librosa = true;
config_.feat_config.remove_dc_offset = false;
config_.feat_config.window_type = "hann";
}
if (!config_.model_config.wenet_ctc.model.empty()) { if (!config_.model_config.wenet_ctc.model.empty()) {
// WeNet CTC models assume input samples are in the range // WeNet CTC models assume input samples are in the range
// [-32768, 32767], so we set normalize_samples to false // [-32768, 32767], so we set normalize_samples to false