Support Parakeet models from NeMo (#1381)
This commit is contained in:
@@ -31,7 +31,7 @@ jobs:
|
||||
run: |
|
||||
BRANCH='main'
|
||||
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
|
||||
pip install onnxruntime
|
||||
pip install onnxruntime ipython
|
||||
pip install kaldi-native-fbank
|
||||
pip install soundfile librosa
|
||||
|
||||
@@ -43,6 +43,43 @@ jobs:
|
||||
|
||||
mv -v sherpa-onnx-nemo* ../../..
|
||||
|
||||
- name: Publish to huggingface
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
max_attempts: 20
|
||||
timeout_seconds: 200
|
||||
shell: bash
|
||||
command: |
|
||||
git config --global user.email "csukuangfj@gmail.com"
|
||||
git config --global user.name "Fangjun Kuang"
|
||||
|
||||
models=(
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-en-24500
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-es-1424
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
|
||||
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
|
||||
)
|
||||
|
||||
for m in ${models[@]}; do
|
||||
rm -rf huggingface
|
||||
export GIT_LFS_SKIP_SMUDGE=1
|
||||
export GIT_CLONE_PROTECTION_ACTIVE=false
|
||||
git clone https://huggingface.co/csukuangfj/$m huggingface
|
||||
cp -av $m/* huggingface
|
||||
cd huggingface
|
||||
git lfs track "*.onnx"
|
||||
git status
|
||||
git add .
|
||||
git status
|
||||
git commit -m "first commit"
|
||||
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
|
||||
cd ..
|
||||
rm -rf huggingface
|
||||
done
|
||||
|
||||
- name: Compress files
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -51,6 +88,7 @@ jobs:
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-es-1424
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-en-de-es-fr-14288
|
||||
sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
|
||||
sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
|
||||
)
|
||||
for d in ${dirs[@]}; do
|
||||
tar cjvf ${d}.tar.bz2 ./$d
|
||||
@@ -65,3 +103,5 @@ jobs:
|
||||
repo_name: k2-fsa/sherpa-onnx
|
||||
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
|
||||
tag: asr-models
|
||||
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
run: |
|
||||
BRANCH='main'
|
||||
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
|
||||
pip install onnxruntime
|
||||
pip install onnxruntime ipython
|
||||
pip install kaldi-native-fbank
|
||||
pip install soundfile librosa
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
run: |
|
||||
BRANCH='main'
|
||||
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
|
||||
pip install onnxruntime
|
||||
pip install onnxruntime ipython
|
||||
pip install kaldi-native-fbank
|
||||
pip install soundfile librosa
|
||||
|
||||
@@ -43,6 +43,42 @@ jobs:
|
||||
|
||||
mv -v sherpa-onnx-nemo* ../../..
|
||||
|
||||
- name: Publish to huggingface
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
max_attempts: 20
|
||||
timeout_seconds: 200
|
||||
shell: bash
|
||||
command: |
|
||||
git config --global user.email "csukuangfj@gmail.com"
|
||||
git config --global user.name "Fangjun Kuang"
|
||||
|
||||
models=(
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-en-24500
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-es-1424
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
|
||||
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
|
||||
)
|
||||
|
||||
for m in ${models[@]}; do
|
||||
rm -rf huggingface
|
||||
export GIT_LFS_SKIP_SMUDGE=1
|
||||
export GIT_CLONE_PROTECTION_ACTIVE=false
|
||||
git clone https://huggingface.co/csukuangfj/$m huggingface
|
||||
cp -av $m/* huggingface
|
||||
cd huggingface
|
||||
git lfs track "*.onnx"
|
||||
git status
|
||||
git add .
|
||||
git status
|
||||
git commit -m "first commit"
|
||||
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
|
||||
cd ..
|
||||
done
|
||||
|
||||
- name: Compress files
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -51,6 +87,7 @@ jobs:
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-es-1424
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-en-de-es-fr-14288
|
||||
sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k
|
||||
sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
|
||||
)
|
||||
for d in ${dirs[@]}; do
|
||||
tar cjvf ${d}.tar.bz2 ./$d
|
||||
@@ -65,3 +102,5 @@ jobs:
|
||||
repo_name: k2-fsa/sherpa-onnx
|
||||
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
|
||||
tag: asr-models
|
||||
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
run: |
|
||||
BRANCH='main'
|
||||
pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]
|
||||
pip install onnxruntime
|
||||
pip install onnxruntime ipython
|
||||
pip install kaldi-native-fbank
|
||||
pip install soundfile librosa
|
||||
|
||||
|
||||
2
.github/workflows/test-build-wheel.yaml
vendored
2
.github/workflows/test-build-wheel.yaml
vendored
@@ -139,7 +139,7 @@ jobs:
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
|
||||
|
||||
which sherpa-onnx
|
||||
sherpa-onnx --help
|
||||
|
||||
2
.github/workflows/test-pip-install.yaml
vendored
2
.github/workflows/test-pip-install.yaml
vendored
@@ -104,7 +104,7 @@ jobs:
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.5/x64/bin:$PATH
|
||||
export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH
|
||||
|
||||
sherpa-onnx --help
|
||||
sherpa-onnx-keyword-spotter --help
|
||||
|
||||
@@ -22,4 +22,6 @@ This folder contains scripts for exporting models from
|
||||
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc_blend_eu
|
||||
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_multilingual_fastconformer_hybrid_large_pc
|
||||
|
||||
- https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
|
||||
|
||||
to `sherpa-onnx`.
|
||||
|
||||
@@ -9,6 +9,19 @@ log() {
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
# 36000 hours of English data
|
||||
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
|
||||
name=$(basename $url)
|
||||
doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
|
||||
|
||||
log "Process $name at $url"
|
||||
./export-onnx-ctc-non-streaming.py --model $name --doc "$doc"
|
||||
d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
|
||||
mkdir -p $d
|
||||
mv -v model.onnx $d/
|
||||
mv -v tokens.txt $d/
|
||||
ls -lh $d
|
||||
|
||||
# 8500 hours of English speech
|
||||
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
|
||||
name=$(basename $url)
|
||||
@@ -66,12 +79,26 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
|
||||
rm spoken-language-identification-test-wavs.tar.bz2
|
||||
data=spoken-language-identification-test-wavs
|
||||
|
||||
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
||||
mv 2086-149220-0033.wav en.wav
|
||||
|
||||
d=sherpa-onnx-nemo-parakeet_tdt_ctc_110m-en-36000
|
||||
python3 ./test-onnx-ctc-non-streaming.py \
|
||||
--model $d/model.onnx \
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $data/en-english.wav
|
||||
mkdir -p $d/test_wavs
|
||||
|
||||
cp en.wav $d/test_wavs/0.wav
|
||||
cp -v $data/en-english.wav $d/test_wavs/1.wav
|
||||
|
||||
d=sherpa-onnx-nemo-fast-conformer-ctc-en-24500
|
||||
python3 ./test-onnx-ctc-non-streaming.py \
|
||||
--model $d/model.onnx \
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $data/en-english.wav
|
||||
mkdir -p $d/test_wavs
|
||||
cp en.wav $d/test_wavs/0.wav
|
||||
cp -v $data/en-english.wav $d/test_wavs
|
||||
|
||||
d=sherpa-onnx-nemo-fast-conformer-ctc-es-1424
|
||||
|
||||
@@ -9,6 +9,19 @@ log() {
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
# 36000 hours of English data
|
||||
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/parakeet-tdt_ctc-110m
|
||||
name=$(basename $url)
|
||||
doc="parakeet-tdt_ctc-110m is an ASR model that transcribes speech with Punctuations and Capitalizations of the English alphabet. It was trained on 36K hours of English speech collected and prepared by NVIDIA NeMo and Suno teams."
|
||||
|
||||
log "Process $name at $url"
|
||||
./export-onnx-transducer-non-streaming.py --model $name --doc "$doc"
|
||||
d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
|
||||
mkdir -p $d
|
||||
mv -v *.onnx $d/
|
||||
mv -v tokens.txt $d/
|
||||
ls -lh $d
|
||||
|
||||
# 8500 hours of English speech
|
||||
url=https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_fastconformer_hybrid_large_pc
|
||||
name=$(basename $url)
|
||||
@@ -66,6 +79,28 @@ tar xvf spoken-language-identification-test-wavs.tar.bz2
|
||||
rm spoken-language-identification-test-wavs.tar.bz2
|
||||
data=spoken-language-identification-test-wavs
|
||||
|
||||
curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
||||
mv 2086-149220-0033.wav en.wav
|
||||
|
||||
d=sherpa-onnx-nemo-parakeet_tdt_transducer_110m-en-36000
|
||||
python3 ./test-onnx-transducer-non-streaming.py \
|
||||
--encoder $d/encoder.onnx \
|
||||
--decoder $d/decoder.onnx \
|
||||
--joiner $d/joiner.onnx \
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $data/en-english.wav
|
||||
|
||||
python3 ./test-onnx-transducer-non-streaming.py \
|
||||
--encoder $d/encoder.onnx \
|
||||
--decoder $d/decoder.onnx \
|
||||
--joiner $d/joiner.onnx \
|
||||
--tokens $d/tokens.txt \
|
||||
--wav ./en.wav
|
||||
|
||||
mkdir -p $d/test_wavs
|
||||
cp en.wav $d/test_wavs/0.wav
|
||||
cp -v $data/en-english.wav $d/test_wavs
|
||||
|
||||
d=sherpa-onnx-nemo-fast-conformer-transducer-en-24500
|
||||
python3 ./test-onnx-transducer-non-streaming.py \
|
||||
--encoder $d/encoder.onnx \
|
||||
@@ -74,6 +109,7 @@ python3 ./test-onnx-transducer-non-streaming.py \
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $data/en-english.wav
|
||||
mkdir -p $d/test_wavs
|
||||
cp en.wav $d/test_wavs/0.wav
|
||||
cp -v $data/en-english.wav $d/test_wavs
|
||||
|
||||
d=sherpa-onnx-nemo-fast-conformer-transducer-es-1424
|
||||
|
||||
@@ -141,7 +141,7 @@ def main():
|
||||
assert model.normalize_type == "per_feature", model.normalize_type
|
||||
features = torch.from_numpy(features)
|
||||
mean = features.mean(dim=1, keepdims=True)
|
||||
stddev = features.std(dim=1, keepdims=True)
|
||||
stddev = features.std(dim=1, keepdims=True) + 1e-5
|
||||
features = (features - mean) / stddev
|
||||
features = features.numpy()
|
||||
|
||||
|
||||
@@ -268,7 +268,7 @@ def main():
|
||||
assert model.normalize_type == "per_feature", model.normalize_type
|
||||
features = torch.from_numpy(features)
|
||||
mean = features.mean(dim=1, keepdims=True)
|
||||
stddev = features.std(dim=1, keepdims=True)
|
||||
stddev = features.std(dim=1, keepdims=True) + 1e-5
|
||||
features = (features - mean) / stddev
|
||||
features = features.numpy()
|
||||
print(audio.shape)
|
||||
|
||||
@@ -103,6 +103,14 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl {
|
||||
config_.feat_config.is_mfcc = true;
|
||||
}
|
||||
|
||||
if (!config_.model_config.nemo_ctc.model.empty()) {
|
||||
config_.feat_config.low_freq = 0;
|
||||
config_.feat_config.high_freq = 0;
|
||||
config_.feat_config.is_librosa = true;
|
||||
config_.feat_config.remove_dc_offset = false;
|
||||
config_.feat_config.window_type = "hann";
|
||||
}
|
||||
|
||||
if (!config_.model_config.wenet_ctc.model.empty()) {
|
||||
// WeNet CTC models assume input samples are in the range
|
||||
// [-32768, 32767], so we set normalize_samples to false
|
||||
|
||||
Reference in New Issue
Block a user