# Export silero-vad v4 to ONNX, convert it to RKNN for several Rockchip
# platforms, then publish the *.rknn files to a GitHub release and to a
# Hugging Face repository.
name: export-silero-vad-to-rknn

on:
  # Manual trigger only.
  workflow_dispatch:

concurrency:
  group: export-silero-vad-to-rknn-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-silero-vad-to-rknn:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export silero-vad to rknn
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        # The rknn-toolkit2 wheel downloaded below is a cp310 build.
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          python3 -m pip install --upgrade \
            pip \
            "numpy<2" \
            torch==2.0.0+cpu -f https://download.pytorch.org/whl/torch \
            onnx \
            onnxruntime==1.17.1 \
            librosa \
            soundfile \
            onnxsim

          curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.1.0%2B708089d1-cp310-cp310-linux_x86_64.whl
          pip install ./*.whl "numpy<=1.26.4"

      - name: Run
        shell: bash
        run: |
          cd scripts/silero_vad/v4
          curl -SL -O https://github.com/snakers4/silero-vad/raw/refs/tags/v4.0/files/silero_vad.jit
          ./export-onnx.py
          ./show.py

          ls -lh m.onnx

          # Sanity-check the exported ONNX model before converting it.
          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
          ./test-onnx.py --model ./m.onnx --wav ./lei-jun-test.wav

          # One RKNN model per target platform.
          for platform in rk3588 rk3576 rk3568 rk3566 rk3562; do
            echo "Platform: $platform"
            ./export-rknn.py --in-model ./m.onnx --out-model silero-vad-v4-$platform.rknn --target-platform $platform
            ls -lh silero-vad-v4-$platform.rknn
          done

      - name: Collect files
        shell: bash
        run: |
          cd scripts/silero_vad/v4
          ls -lh
          # Move the models to the repository root, where the next steps
          # look for ./*.rknn.
          mv *.rknn ../../..

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.rknn
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: asr-models

      - name: Upload model to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            # Start every attempt from a fresh clone.
            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://huggingface.co/csukuangfj/sherpa-onnx-rknn-models huggingface
            cd huggingface

            git fetch
            git pull
            git lfs track "*.rknn"
            git merge -m "merge remote" --ff origin main
            dst=vad
            mkdir -p $dst
            cp ../*.rknn $dst/ || true

            ls -lh $dst
            git add .
            git status
            git commit -m "update models"
            git status

            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-rknn-models main || true

            # Leave the clone before deleting it: from inside the clone,
            # "rm -rf huggingface" does not match the clone directory.
            cd ..
            rm -rf huggingface
@torch.no_grad()
def main():
    """Export ``./silero_vad.jit`` to a simplified ONNX model at ``./m.onnx``.

    Steps:
      1. Load the TorchScript model and export its ``_model`` submodule to
         ONNX with inputs ``x``, ``h``, ``c`` and outputs ``prob``,
         ``next_h``, ``next_c``.
      2. Replace the ONNX metadata with the entries in ``meta_data``.
      3. Simplify the graph with onnxsim and overwrite ``m.onnx``.

    Raises:
      RuntimeError: If onnxsim reports that the simplified model could not
        be validated.
    """
    m = torch.jit.load("./silero_vad.jit")

    # Example inputs handed to the exporter; their shapes match the
    # h_shape/c_shape values recorded in the metadata below.
    x = torch.rand((1, 512), dtype=torch.float32)
    h = torch.rand((2, 1, 64), dtype=torch.float32)
    c = torch.rand((2, 1, 64), dtype=torch.float32)
    torch.onnx.export(
        m._model,
        (x, h, c),
        "m.onnx",
        input_names=["x", "h", "c"],
        output_names=["prob", "next_h", "next_c"],
    )

    print("simplifying ...")
    model = onnx.load("m.onnx")

    meta_data = {
        "model_type": "silero-vad-v4",
        "sample_rate": 16000,
        "version": 4,
        "h_shape": "2,1,64",
        "c_shape": "2,1,64",
    }

    # Drop any metadata written by the exporter so that only the entries
    # above end up in the file.
    while model.metadata_props:
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        # ONNX metadata values are strings.
        meta.value = str(value)
    print("--------------------")
    print(model.metadata_props)

    model_simp, check = simplify(model)
    # Do not overwrite m.onnx with a model that onnxsim itself could not
    # validate against the original.
    if not check:
        raise RuntimeError("The simplified ONNX model could not be validated")
    onnx.save(model_simp, "m.onnx")


if __name__ == "__main__":
    main()
# Platforms advertised in the --target-platform help text.
g_platforms = [
    # "rv1103",
    # "rv1103b",
    # "rv1106",
    # "rk2118",
    "rk3562",
    "rk3566",
    "rk3568",
    "rk3576",
    "rk3588",
]


def get_parser():
    """Build the command-line parser.

    All three options (``--target-platform``, ``--in-model``,
    ``--out-model``) are required.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--target-platform",
        type=str,
        required=True,
        help=f"Supported values are: {','.join(g_platforms)}",
    )

    parser.add_argument(
        "--in-model",
        type=str,
        required=True,
        help="Path to the input onnx model",
    )

    parser.add_argument(
        "--out-model",
        type=str,
        required=True,
        help="Path to the output rknn model",
    )

    return parser


def get_meta_data(model: str) -> str:
    """Return the custom metadata of an ONNX model as ``k1=v1;k2=v2;...``.

    The model's inputs and outputs are printed as a side effect.

    Args:
      model: Path to the ONNX model.

    Returns:
      The metadata entries joined with ``;``; empty if there are none.

    Raises:
      ValueError: If the resulting string has 1024 or more characters.
    """
    import onnxruntime

    session_opts = onnxruntime.SessionOptions()
    session_opts.inter_op_num_threads = 1
    session_opts.intra_op_num_threads = 1

    m = onnxruntime.InferenceSession(
        model,
        sess_options=session_opts,
        providers=["CPUExecutionProvider"],
    )

    for i in m.get_inputs():
        print(i)

    print("-----")

    for i in m.get_outputs():
        print(i)
    print()

    meta = m.get_modelmeta().custom_metadata_map
    s = ";".join(f"{key}={value}" for key, value in meta.items())

    # NOTE(review): the 1024 limit presumably comes from the RKNN
    # custom_string field -- confirm against the toolkit documentation.
    if len(s) >= 1024:
        raise ValueError(
            f"Metadata string has {len(s)} characters; it must be shorter than 1024"
        )

    return s


def export_rknn(rknn, filename):
    """Write the built model held by ``rknn`` to ``filename``.

    Args:
      rknn: An object whose ``export_rknn(filename)`` returns 0 on success.
      filename: Path of the output rknn file.

    Raises:
      SystemExit: If the export reports a non-zero status.
    """
    ret = rknn.export_rknn(filename)
    if ret != 0:
        raise SystemExit(f"Export rknn model to {filename} failed!")


def init_model(filename: str, target_platform: str, custom_string=None):
    """Load an ONNX model into a new RKNN object and build it.

    The model is built without quantization and with optimization level 0.

    Args:
      filename: Path to the input ONNX model.
      target_platform: Platform name passed to ``RKNN.config``.
      custom_string: Optional string passed to ``RKNN.config``.

    Returns:
      The built RKNN object. The caller is responsible for releasing it.

    Raises:
      SystemExit: If the file does not exist, or loading/building fails.
    """
    # Check the input first so a bad path fails before any RKNN state exists.
    if not Path(filename).is_file():
        raise SystemExit(f"{filename} does not exist")

    rknn = RKNN(verbose=False)

    rknn.config(
        optimization_level=0,
        target_platform=target_platform,
        custom_string=custom_string,
    )

    ret = rknn.load_onnx(model=filename)
    if ret != 0:
        raise SystemExit(f"Load model {filename} failed!")

    ret = rknn.build(do_quantization=False)
    if ret != 0:
        raise SystemExit(f"Build model {filename} failed!")

    return rknn


class RKNNModel:
    """An ONNX model built for one RKNN target platform.

    The ONNX custom metadata is forwarded to the RKNN model as its
    ``custom_string``.
    """

    def __init__(
        self,
        model: str,
        target_platform: str,
    ):
        """
        Args:
          model: Path to the input ONNX model.
          target_platform: Platform name to build for.
        """
        meta = get_meta_data(model)
        print(meta)

        self.model = init_model(
            model,
            target_platform=target_platform,
            custom_string=meta,
        )

    def export_rknn(self, model):
        """Write the built model to the path ``model``."""
        export_rknn(self.model, model)

    def release(self):
        """Release the underlying RKNN object."""
        self.model.release()


def main():
    """Convert ``--in-model`` (ONNX) to ``--out-model`` (RKNN)."""
    args = get_parser().parse_args()
    print(vars(args))

    model = RKNNModel(
        model=args.in_model,
        target_platform=args.target_platform,
    )

    try:
        model.export_rknn(
            model=args.out_model,
        )
    finally:
        # Release the RKNN object even when the export step exits.
        model.release()


if __name__ == "__main__":
    main()
def show(filename):
    """Print the metadata, inputs and outputs of the ONNX model ``filename``.

    Example output for the model produced by ``export-onnx.py``::

        [key: "model_type"
        value: "silero-vad-v4"
        , key: "sample_rate"
        value: "16000"
        , key: "version"
        value: "4"
        , key: "h_shape"
        value: "2,1,64"
        , key: "c_shape"
        value: "2,1,64"
        ]
        NodeArg(name='x', type='tensor(float)', shape=[1, 512])
        NodeArg(name='h', type='tensor(float)', shape=[2, 1, 64])
        NodeArg(name='c', type='tensor(float)', shape=[2, 1, 64])
        -----
        NodeArg(name='prob', type='tensor(float)', shape=[1, 1])
        NodeArg(name='next_h', type='tensor(float)', shape=[2, 1, 64])
        NodeArg(name='next_c', type='tensor(float)', shape=[2, 1, 64])
    """
    # Metadata is read from the protobuf directly.
    print(onnx.load(filename).metadata_props)

    # Inputs and outputs are read from an onnxruntime session.
    opts = onnxruntime.SessionOptions()
    opts.log_severity_level = 3
    session = onnxruntime.InferenceSession(
        filename,
        opts,
        providers=["CPUExecutionProvider"],
    )

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)


def main():
    """Describe ``./m.onnx`` in the current directory."""
    show("./m.onnx")


if __name__ == "__main__":
    main()
def get_args():
    """Parse the command line: ``--model`` and ``--wav`` are both required."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to the onnx model",
    )

    parser.add_argument(
        "--wav",
        type=str,
        required=True,
        help="Path to the input wav",
    )
    return parser.parse_args()


class OnnxModel:
    """Single-threaded onnxruntime wrapper around the exported VAD model."""

    def __init__(
        self,
        model: str,
    ):
        """
        Args:
          model: Path to the ONNX model.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1
        self.model = ort.InferenceSession(
            model,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )
        # Looked up once here instead of on every call. Only the first three
        # inputs and outputs are used.
        self.input_names = [i.name for i in self.model.get_inputs()[:3]]
        self.output_names = [o.name for o in self.model.get_outputs()[:3]]

    def get_init_states(self):
        """Return zero-initialized ``(h, c)`` states, each float32 (2, 1, 64)."""
        h = np.zeros((2, 1, 64), dtype=np.float32)
        c = np.zeros((2, 1, 64), dtype=np.float32)
        return h, c

    def __call__(self, x, h, c):
        """Run the model on one window of audio.

        Args:
          x: (512,) one window of samples; a leading axis is added here so
            the model receives (1, 512).
          h: (2, 1, 64)
          c: (2, 1, 64)
        Returns:
          prob: (1, 1)
          next_h: (2, 1, 64)
          next_c: (2, 1, 64)
        """
        x = x[None]
        out, next_h, next_c = self.model.run(
            self.output_names,
            dict(zip(self.input_names, (x, h, c))),
        )
        return out, next_h, next_c


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read ``filename`` and return ``(samples, sample_rate)``.

    ``samples`` is a contiguous 1-D float32 array holding the first channel.
    """
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def find_speech_segments(probs, threshold, min_speech_windows):
    """Group consecutive speech windows into ``(start, end)`` segments.

    A window counts as speech when its probability is greater than
    ``threshold``. ``start`` is the index of the first speech window and
    ``end`` is one past the last speech window. Segments that are not longer
    than ``min_speech_windows`` are dropped.

    Args:
      probs: Per-window speech probabilities.
      threshold: Probability above which a window is speech.
      min_speech_windows: Minimum segment length, in windows.

    Returns:
      A list of ``(start, end)`` tuples in window indexes.
    """
    segments = []
    start = None
    for k, p in enumerate(probs):
        if p > threshold:
            if start is None:
                start = k
        elif start is not None:
            if k - start > min_speech_windows:
                segments.append((start, k))
            start = None

    # Audio that ends during speech: close the segment one past the last
    # window so its end is exclusive, like the segments closed in the loop.
    if start is not None and len(probs) - start > min_speech_windows:
        segments.append((start, len(probs)))

    return segments


def merge_close_segments(segments, min_silence_windows):
    """Merge neighbouring segments separated by a short gap.

    Args:
      segments: ``(start, end)`` tuples sorted by start.
      min_silence_windows: A gap shorter than this, in windows, is bridged.

    Returns:
      A new list of ``(start, end)`` tuples.
    """
    merged = []
    for seg in segments:
        if merged and seg[0] - merged[-1][1] < min_silence_windows:
            merged[-1] = (merged[-1][0], seg[1])
        else:
            merged.append(seg)
    return merged


def main():
    """Run the VAD model over a wav file and print the speech intervals.

    The detected segments are printed first in window indexes, then the
    merged segments as ``start -- end`` in seconds.
    """
    args = get_args()

    samples, sample_rate = load_audio(args.wav)
    if sample_rate != 16000:
        import librosa

        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    model = OnnxModel(args.model)
    h, c = model.get_init_states()
    window_size = 512
    # Samples after the last complete window are ignored.
    num_windows = samples.shape[0] // window_size

    probs = []
    for i in range(num_windows):
        start = i * window_size
        end = start + window_size
        p, h, c = model(samples[start:end], h, c)
        probs.append(p[0].item())

    threshold = 0.5
    # 0.25 seconds, expressed as a number of windows.
    min_speech_duration = 0.25 * sample_rate / window_size
    min_silence_duration = 0.25 * sample_rate / window_size

    result = find_speech_segments(probs, threshold, min_speech_duration)

    if not result:
        print(f"Empty for {args.wav}")
        return

    print(result)

    final = merge_close_segments(result, min_silence_duration)

    for f in final:
        start = f[0] * window_size / sample_rate
        end = f[1] * window_size / sample_rate
        print("{:.3f} -- {:.3f}".format(start, end))


if __name__ == "__main__":
    main()