diff --git a/.github/workflows/export-nemo-parakeet-tdt-0.6b-v2.yaml b/.github/workflows/export-nemo-parakeet-tdt-0.6b-v2.yaml
new file mode 100644
index 00000000..b84528ff
--- /dev/null
+++ b/.github/workflows/export-nemo-parakeet-tdt-0.6b-v2.yaml
@@ -0,0 +1,136 @@
+# Exports nvidia/parakeet-tdt-0.6b-v2 to ONNX (fp32/int8/fp16), then publishes
+# the packaged models to Hugging Face and to the k2-fsa/sherpa-onnx releases.
+name: export-nemo-parakeet-tdt-0.6b-v2
+
+on:
+  push:
+    branches:
+      - export-nemo-parakeet-tdt-0.6b-v2
+  workflow_dispatch:
+
+concurrency:
+  group: export-nemo-parakeet-tdt-0.6b-v2-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  export-nemo-parakeet-tdt-0_6b-v2:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: parakeet tdt 0.6b v2
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/nemo/parakeet-tdt-0.6b-v2
+          ./run.sh
+
+          ls -lh *.onnx
+          mv -v *.onnx ../../..
+          mv -v tokens.txt ../../..
+          mv 2086-149220-0033.wav ../../../0.wav
+
+      # NOTE(review): the "fp32" package ships encoder.int8.onnx next to the
+      # fp32 decoder/joiner. export_onnx.py writes metadata only into the int8
+      # and fp16 encoders, so this looks deliberate -- confirm before changing.
+      - name: Collect files (fp32)
+        shell: bash
+        run: |
+          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
+          mkdir -p $d
+          cp encoder.int8.onnx $d
+          cp decoder.onnx $d
+          cp joiner.onnx $d
+          cp tokens.txt $d
+
+          mkdir $d/test_wavs
+          cp 0.wav $d/test_wavs
+
+          tar cjfv $d.tar.bz2 $d
+
+      - name: Collect files (int8)
+        shell: bash
+        run: |
+          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
+          mkdir -p $d
+          cp encoder.int8.onnx $d
+          cp decoder.int8.onnx $d
+          cp joiner.int8.onnx $d
+          cp tokens.txt $d
+
+          mkdir $d/test_wavs
+          cp 0.wav $d/test_wavs
+
+          tar cjfv $d.tar.bz2 $d
+
+      - name: Collect files (fp16)
+        shell: bash
+        run: |
+          d=sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
+          mkdir -p $d
+          cp encoder.fp16.onnx $d
+          cp decoder.fp16.onnx $d
+          cp joiner.fp16.onnx $d
+          cp tokens.txt $d
+
+          mkdir $d/test_wavs
+          cp 0.wav $d/test_wavs
+
+          tar cjfv $d.tar.bz2 $d
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            models=(
+              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2
+              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8
+              sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-fp16
+            )
+
+            for m in ${models[@]}; do
+              rm -rf huggingface
+              export GIT_LFS_SKIP_SMUDGE=1
+              export GIT_CLONE_PROTECTION_ACTIVE=false
+              git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m huggingface
+              cp -av $m/* huggingface
+              cd huggingface
+              git lfs track "*.onnx"
+              git lfs track "*.wav"
+              git status
+              git add .
+              git status
+              git commit -m "first commit"
+              git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/$m main
+              cd ..
+            done
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: asr-models
diff --git a/scripts/nemo/parakeet-tdt-0.6b-v2/export_onnx.py b/scripts/nemo/parakeet-tdt-0.6b-v2/export_onnx.py
new file mode 100755
index 00000000..e914dc88
--- /dev/null
+++ b/scripts/nemo/parakeet-tdt-0.6b-v2/export_onnx.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
+
+"""Export nvidia/parakeet-tdt-0.6b-v2 to ONNX (fp32, int8, fp16) and tokens.txt."""
+
+from pathlib import Path
+from typing import Dict
+import os
+
+import nemo.collections.asr as nemo_asr
+import onnx
+import onnxmltools
+import torch
+from onnxmltools.utils.float16_converter import (
+    convert_float_to_float16,
+    convert_float_to_float16_model_path,
+)
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def export_onnx_fp16(onnx_fp32_path, onnx_fp16_path):
+    """Load an fp32 ONNX model, convert it to fp16 and save it."""
+    onnx_fp32_model = onnxmltools.utils.load_model(onnx_fp32_path)
+    onnx_fp16_model = convert_float_to_float16(onnx_fp32_model, keep_io_types=True)
+    onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
+
+
+def export_onnx_fp16_large_2gb(onnx_fp32_path, onnx_fp16_path):
+    """Like export_onnx_fp16(), but converts directly from the model path.
+
+    Used for the encoder. NOTE(review): the name suggests it exists for models
+    larger than 2 GB -- confirm.
+    """
+    onnx_fp16_model = convert_float_to_float16_model_path(
+        onnx_fp32_path, keep_io_types=True
+    )
+    onnxmltools.utils.save_model(onnx_fp16_model, onnx_fp16_path)
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, str]):
+    """Add meta data to an ONNX model. It is changed in-place.
+
+    Any metadata already stored in the model is removed first.
+
+    Args:
+      filename:
+        Filename of the ONNX model to be changed.
+      meta_data:
+        Key-value pairs.
+    """
+    model = onnx.load(filename)
+    while len(model.metadata_props):
+        model.metadata_props.pop()
+
+    for key, value in meta_data.items():
+        meta = model.metadata_props.add()
+        meta.key = key
+        meta.value = str(value)
+
+    onnx.save(model, filename)
+
+
+@torch.no_grad()
+def main():
+    """Export encoder/decoder/joiner, write tokens.txt and quantize the models."""
+    asr_model = nemo_asr.models.ASRModel.from_pretrained(
+        model_name="nvidia/parakeet-tdt-0.6b-v2"
+    )
+
+    asr_model.eval()
+
+    with open("./tokens.txt", "w", encoding="utf-8") as f:
+        for i, s in enumerate(asr_model.joint.vocabulary):
+            f.write(f"{s} {i}\n")
+        # The blank token gets the last ID. test_onnx.py splits every line into
+        # exactly two fields, so the blank line must carry a symbol as well.
+        f.write(f"<blk> {i+1}\n")
+        print("Saved to tokens.txt")
+
+    asr_model.encoder.export("encoder.onnx")
+    asr_model.decoder.export("decoder.onnx")
+    asr_model.joint.export("joiner.onnx")
+    os.system("ls -lh *.onnx")
+
+    normalize_type = asr_model.cfg.preprocessor.normalize
+    if normalize_type == "NA":
+        normalize_type = ""
+
+    meta_data = {
+        "vocab_size": asr_model.decoder.vocab_size,
+        "normalize_type": normalize_type,
+        "pred_rnn_layers": asr_model.decoder.pred_rnn_layers,
+        "pred_hidden": asr_model.decoder.pred_hidden,
+        "subsampling_factor": 8,
+        "model_type": "EncDecRNNTBPEModel",
+        "version": "2",
+        "model_author": "NeMo",
+        "url": "https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2",
+        "comment": "Only the transducer branch is exported",
+        "feat_dim": 128,
+    }
+
+    for m in ["encoder", "decoder", "joiner"]:
+        quantize_dynamic(
+            model_input=f"./{m}.onnx",
+            model_output=f"./{m}.int8.onnx",
+            weight_type=QuantType.QUInt8 if m == "encoder" else QuantType.QInt8,
+        )
+        os.system("ls -lh *.onnx")
+
+        if m == "encoder":
+            export_onnx_fp16_large_2gb(f"{m}.onnx", f"{m}.fp16.onnx")
+        else:
+            export_onnx_fp16(f"{m}.onnx", f"{m}.fp16.onnx")
+
+    # Only the int8 and fp16 encoders get the metadata.
+    add_meta_data("encoder.int8.onnx", meta_data)
+    add_meta_data("encoder.fp16.onnx", meta_data)
+    print("meta_data", meta_data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/nemo/parakeet-tdt-0.6b-v2/run.sh b/scripts/nemo/parakeet-tdt-0.6b-v2/run.sh
new file mode 100755
index 00000000..ec670152
--- /dev/null
+++ b/scripts/nemo/parakeet-tdt-0.6b-v2/run.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
+
+set -ex
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+# Test wave used by the test_onnx.py runs below.
+curl -SL -O https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
+
+
+
+pip install \
+  nemo_toolkit['asr'] \
+  "numpy<2" \
+  ipython \
+  kaldi-native-fbank \
+  librosa \
+  onnx==1.17.0 \
+  onnxmltools \
+  onnxruntime==1.17.1 \
+  soundfile
+
+python3 ./export_onnx.py
+ls -lh *.onnx
+
+# The "fp32" test pairs the fp32 decoder/joiner with the int8 encoder;
+# export_onnx.py adds metadata only to the int8 and fp16 encoders.
+echo "---fp32----"
+python3 ./test_onnx.py \
+  --encoder ./encoder.int8.onnx \
+  --decoder ./decoder.onnx \
+  --joiner ./joiner.onnx \
+  --tokens ./tokens.txt \
+  --wav 2086-149220-0033.wav
+
+echo "---int8----"
+python3 ./test_onnx.py \
+  --encoder ./encoder.int8.onnx \
+  --decoder ./decoder.int8.onnx \
+  --joiner ./joiner.int8.onnx \
+  --tokens ./tokens.txt \
+  --wav 2086-149220-0033.wav
+
+echo "---fp16----"
+python3 ./test_onnx.py \
+  --encoder ./encoder.fp16.onnx \
+  --decoder ./decoder.fp16.onnx \
+  --joiner ./joiner.fp16.onnx \
+  --tokens ./tokens.txt \
+  --wav 2086-149220-0033.wav
diff --git a/scripts/nemo/parakeet-tdt-0.6b-v2/test_onnx.py b/scripts/nemo/parakeet-tdt-0.6b-v2/test_onnx.py
new file mode 100755
index 00000000..36ab4740
--- /dev/null
+++ b/scripts/nemo/parakeet-tdt-0.6b-v2/test_onnx.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
+"""Greedy-search decoding test for the exported ONNX transducer models."""
+
+import argparse
+from pathlib import Path
+
+import kaldi_native_fbank as knf
+import librosa
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+import torch
+import time
+
+
+def get_args():
+    """Parse and return the command-line arguments."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--encoder", type=str, required=True, help="Path to encoder.onnx"
+    )
+    parser.add_argument(
+        "--decoder", type=str, required=True, help="Path to decoder.onnx"
+    )
+    parser.add_argument("--joiner", type=str, required=True, help="Path to joiner.onnx")
+
+    parser.add_argument("--tokens", type=str, required=True, help="Path to tokens.txt")
+
+    parser.add_argument("--wav", type=str, required=True, help="Path to test.wav")
+
+    return parser.parse_args()
+
+
+def create_fbank():
+    """Return a knf.OnlineFbank with 128 mel bins, a hann window and no dither."""
+    opts = knf.FbankOptions()
+    opts.frame_opts.dither = 0
+    opts.frame_opts.remove_dc_offset = False
+    opts.frame_opts.window_type = "hann"
+
+    opts.mel_opts.low_freq = 0
+    opts.mel_opts.num_bins = 128
+
+    opts.mel_opts.is_librosa = True
+
+    fbank = knf.OnlineFbank(opts)
+    return fbank
+
+
+def compute_features(audio, fbank):
+    """Feed a 1-D 16 kHz waveform to fbank and return all ready frames stacked."""
+    assert len(audio.shape) == 1, audio.shape
+    fbank.accept_waveform(16000, audio)
+    ans = []
+    processed = 0
+    while processed < fbank.num_frames_ready:
+        ans.append(np.array(fbank.get_frame(processed)))
+        processed += 1
+    ans = np.stack(ans)
+    return ans
+
+
+def display(sess, model):
+    """Print the inputs and outputs of the onnxruntime session sess."""
+    print(f"=========={model} Input==========")
+    for i in sess.get_inputs():
+        print(i)
+    print(f"=========={model} Output==========")
+    for i in sess.get_outputs():
+        print(i)
+
+
+class OnnxModel:
+    """Holds the encoder, decoder and joiner onnxruntime sessions."""
+
+    def __init__(
+        self,
+        encoder: str,
+        decoder: str,
+        joiner: str,
+    ):
+        self.init_encoder(encoder)
+        display(self.encoder, "encoder")
+        self.init_decoder(decoder)
+        display(self.decoder, "decoder")
+        self.init_joiner(joiner)
+        display(self.joiner, "joiner")
+
+    def init_encoder(self, encoder):
+        """Create the encoder session and read model sizes from its metadata."""
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.encoder = ort.InferenceSession(
+            encoder,
+            sess_options=session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+        meta = self.encoder.get_modelmeta().custom_metadata_map
+        self.normalize_type = meta["normalize_type"]
+        print(meta)
+
+        self.pred_rnn_layers = int(meta["pred_rnn_layers"])
+        self.pred_hidden = int(meta["pred_hidden"])
+
+    def init_decoder(self, decoder):
+        """Create the single-threaded CPU session for the decoder."""
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.decoder = ort.InferenceSession(
+            decoder,
+            sess_options=session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+    def init_joiner(self, joiner):
+        """Create the single-threaded CPU session for the joiner."""
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 1
+        session_opts.intra_op_num_threads = 1
+
+        self.joiner = ort.InferenceSession(
+            joiner,
+            sess_options=session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+
+    def get_decoder_state(self):
+        """Return two zero-initialized decoder states for batch size 1."""
+        batch_size = 1
+        state0 = torch.zeros(self.pred_rnn_layers, batch_size, self.pred_hidden).numpy()
+        state1 = torch.zeros(self.pred_rnn_layers, batch_size, self.pred_hidden).numpy()
+        return state0, state1
+
+    def run_encoder(self, x: np.ndarray):
+        """Run the encoder on features of shape (T, C); return its first output."""
+        # x: (T, C)
+        x = torch.from_numpy(x)
+        x = x.t().unsqueeze(0)
+        # x: [1, C, T]
+        x_lens = torch.tensor([x.shape[-1]], dtype=torch.int64)
+
+        (encoder_out, out_len) = self.encoder.run(
+            [
+                self.encoder.get_outputs()[0].name,
+                self.encoder.get_outputs()[1].name,
+            ],
+            {
+                self.encoder.get_inputs()[0].name: x.numpy(),
+                self.encoder.get_inputs()[1].name: x_lens.numpy(),
+            },
+        )
+        # [batch_size, dim, T]
+        return encoder_out
+
+    def run_decoder(
+        self,
+        token: int,
+        state0: np.ndarray,
+        state1: np.ndarray,
+    ):
+        """Run the decoder for one token; return its output and the next states."""
+        target = torch.tensor([[token]], dtype=torch.int32).numpy()
+        target_len = torch.tensor([1], dtype=torch.int32).numpy()
+
+        (decoder_out, decoder_out_length, state0_next, state1_next,) = self.decoder.run(
+            [
+                self.decoder.get_outputs()[0].name,
+                self.decoder.get_outputs()[1].name,
+                self.decoder.get_outputs()[2].name,
+                self.decoder.get_outputs()[3].name,
+            ],
+            {
+                self.decoder.get_inputs()[0].name: target,
+                self.decoder.get_inputs()[1].name: target_len,
+                self.decoder.get_inputs()[2].name: state0,
+                self.decoder.get_inputs()[3].name: state1,
+            },
+        )
+        return decoder_out, state0_next, state1_next
+
+    def run_joiner(
+        self,
+        encoder_out: np.ndarray,
+        decoder_out: np.ndarray,
+    ):
+        """Run the joiner on one encoder frame and the current decoder output."""
+        # encoder_out: [batch_size, dim, 1]
+        # decoder_out: [batch_size, dim, 1]
+        logit = self.joiner.run(
+            [
+                self.joiner.get_outputs()[0].name,
+            ],
+            {
+                self.joiner.get_inputs()[0].name: encoder_out,
+                self.joiner.get_inputs()[1].name: decoder_out,
+            },
+        )[0]
+        # logit: [batch_size, 1, 1, vocab_size]
+        return logit
+
+
+def main():
+    """Decode args.wav with greedy search and print the token IDs, text and RTF."""
+    args = get_args()
+    assert Path(args.encoder).is_file(), args.encoder
+    assert Path(args.decoder).is_file(), args.decoder
+    assert Path(args.joiner).is_file(), args.joiner
+    assert Path(args.tokens).is_file(), args.tokens
+    assert Path(args.wav).is_file(), args.wav
+
+    print(vars(args))
+
+    model = OnnxModel(args.encoder, args.decoder, args.joiner)
+
+    id2token = dict()
+    with open(args.tokens, encoding="utf-8") as f:
+        for line in f:
+            t, idx = line.split()
+            id2token[int(idx)] = t
+
+    start = time.time()
+    fbank = create_fbank()
+    audio, sample_rate = sf.read(args.wav, dtype="float32", always_2d=True)
+    audio = audio[:, 0]  # only use the first channel
+    if sample_rate != 16000:
+        audio = librosa.resample(
+            audio,
+            orig_sr=sample_rate,
+            target_sr=16000,
+        )
+        sample_rate = 16000
+
+    tail_padding = np.zeros(sample_rate * 2)
+
+    audio = np.concatenate([audio, tail_padding])
+
+    # The blank ID is taken to be the largest ID listed in tokens.txt.
+    blank = len(id2token) - 1
+    ans = [blank]
+    state0, state1 = model.get_decoder_state()
+    decoder_out, state0_next, state1_next = model.run_decoder(ans[-1], state0, state1)
+
+    features = compute_features(audio, fbank)
+    if model.normalize_type != "":
+        assert model.normalize_type == "per_feature", model.normalize_type
+        features = torch.from_numpy(features)
+        # NOTE(review): features is (T, C), so dim=1 reduces over the feature
+        # axis of each frame -- confirm this is the intended "per_feature" axis.
+        mean = features.mean(dim=1, keepdims=True)
+        stddev = features.std(dim=1, keepdims=True) + 1e-5
+        features = (features - mean) / stddev
+        features = features.numpy()
+    print(audio.shape)
+    print("features.shape", features.shape)
+
+    encoder_out = model.run_encoder(features)
+    # encoder_out: [batch_size, dim, T]
+    for t in range(encoder_out.shape[2]):
+        encoder_out_t = encoder_out[:, :, t : t + 1]
+        logits = model.run_joiner(encoder_out_t, decoder_out)
+        logits = torch.from_numpy(logits)
+        logits = logits.squeeze()
+        idx = torch.argmax(logits, dim=-1).item()
+        if idx != blank:
+            ans.append(idx)
+            state0 = state0_next
+            state1 = state1_next
+            decoder_out, state0_next, state1_next = model.run_decoder(
+                ans[-1], state0, state1
+            )
+
+    end = time.time()
+
+    elapsed_seconds = end - start
+    audio_duration = audio.shape[0] / 16000
+    real_time_factor = elapsed_seconds / audio_duration
+
+    ans = ans[1:]  # remove the first blank
+    tokens = [id2token[i] for i in ans]
+    underline = "▁"
+    # underline = b"\xe2\x96\x81".decode()
+    text = "".join(tokens).replace(underline, " ").strip()
+
+    print(ans)
+    print(args.wav)
+    print(text)
+    print(f"RTF: {real_time_factor}")
+
+
+if __name__ == "__main__":
+    main()