diff --git a/.github/workflows/export-melo-tts-to-onnx.yaml b/.github/workflows/export-melo-tts-to-onnx.yaml index 4e561688..fa82835f 100644 --- a/.github/workflows/export-melo-tts-to-onnx.yaml +++ b/.github/workflows/export-melo-tts-to-onnx.yaml @@ -40,7 +40,7 @@ jobs: name: test.wav path: scripts/melo-tts/test.wav - - name: Publish to huggingface (aishell) + - name: Publish to huggingface env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 diff --git a/.github/workflows/export-sense-voice-to-onnx.yaml b/.github/workflows/export-sense-voice-to-onnx.yaml new file mode 100644 index 00000000..8303dec4 --- /dev/null +++ b/.github/workflows/export-sense-voice-to-onnx.yaml @@ -0,0 +1,116 @@ +name: export-sense-voice-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-sense-voice-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-sense-voice-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: export sense-voice + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Download test_wavs + shell: bash + run: | + sudo apt-get install -y -qq sox libsox-fmt-mp3 + curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/zh.mp3 + curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/en.mp3 + curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ja.mp3 + curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ko.mp3 + curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/yue.mp3 + + soxi *.mp3 + + sox zh.mp3 -r 16k zh.wav + sox en.mp3 -r 16k en.wav + sox ja.mp3 -r 16k ja.wav + sox ko.mp3 
-r 16k ko.wav + sox yue.mp3 -r 16k yue.wav + + - name: Run + shell: bash + run: | + cd scripts/sense-voice + ./run.sh + + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 huggingface + cd huggingface + git fetch + git pull + echo "pwd: $PWD" + ls -lh ../scripts/sense-voice + + rm -rf ./ + + cp -v ../scripts/sense-voice/*.onnx . + cp -v ../scripts/sense-voice/tokens.txt . + cp -v ../scripts/sense-voice/README.md . + cp -v ../scripts/sense-voice/export-onnx.py . + + mkdir test_wavs + cp -v ../*.wav ./test_wavs/ + + curl -SL -O https://raw.githubusercontent.com/FunAudioLLM/SenseVoice/main/LICENSE + + git lfs track "*.onnx" + git add . + + ls -lh + + git status + + git commit -m "add models" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 main || true + + cd .. 
def add_meta_data(filename: str, meta_data: Dict[str, Any]):
    """Replace the custom metadata of an ONNX model. The file is changed in-place.

    Every existing entry in ``metadata_props`` is removed first, so after the
    call the model carries exactly the key-value pairs given in ``meta_data``.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs. Each value is stored as ``str(value)``.
    """
    model = onnx.load(filename)

    # Drop whatever metadata the exporter (or a previous run) left behind.
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    onnx.save(model, filename)


def modified_forward(
    self,
    x: torch.Tensor,
    x_length: torch.Tensor,
    language: torch.Tensor,
    text_norm: torch.Tensor,
):
    """Export-friendly replacement for the model's ``forward``.

    Four embedding frames (language, two fixed ids 1 and 2, text-norm) are
    prepended to ``x`` along the time axis before the encoder is run, so the
    lengths passed to the encoder are ``x_length + 4``.

    Args:
      x:
        A 3-D tensor of shape (N, T, C) with dtype torch.float32
      x_length:
        A 1-D tensor of shape (N,) with dtype torch.int32. It is NOT
        modified by this function.
      language:
        A 1-D tensor of shape (N,) with dtype torch.int32
        See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L640
      text_norm:
        A 1-D tensor of shape (N,) with dtype torch.int32
        See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L642
    Returns:
      The output of ``self.ctc.ctc_lo`` applied to the encoder output, i.e.,
      the CTC logits; its time axis has T + 4 frames when the encoder keeps
      the number of frames.
    """
    language_query = self.embed(language).unsqueeze(1)
    text_norm_query = self.embed(text_norm).unsqueeze(1)

    # NOTE(review): ids 1 and 2 are presumably the event/emotion query
    # tokens of SenseVoice -- confirm against the upstream model.py
    event_emo_query = self.embed(torch.LongTensor([[1, 2]])).repeat(x.size(0), 1, 1)

    x = torch.cat((language_query, event_emo_query, text_norm_query, x), dim=1)

    # Use an out-of-place add: "x_length += 4" would also change the tensor
    # owned by the caller, so calling forward twice with the same tensor
    # would feed wrong lengths to the encoder.
    x_length = x_length + 4

    encoder_out, encoder_out_lens = self.encoder(x, x_length)
    if isinstance(encoder_out, tuple):
        encoder_out = encoder_out[0]

    ctc_logits = self.ctc.ctc_lo(encoder_out)

    return ctc_logits


def load_cmvn(filename) -> Tuple[str, str]:
    """Read the CMVN statistics from a text file.

    Only lines starting with ``<LearnRateCoef>`` are used. Such a line is
    expected to look like::

        <LearnRateCoef> 0 [ v1 v2 ... vn ]

    The first matching line gives the negated mean; a later matching line
    gives the inverse standard deviation (if there are more than two matching
    lines, the last one wins).

    Args:
      filename:
        Path to the CMVN file, e.g., ``am.mvn`` shipped with the model.
    Returns:
      A tuple ``(neg_mean, inv_stddev)``. Each entry is a string containing
      the values joined by ``,``.
    Raises:
      ValueError: If the file does not contain two ``<LearnRateCoef>`` lines.
    """
    neg_mean = None
    inv_stddev = None

    with open(filename) as f:
        for line in f:
            # An empty prefix would match every line and the statistics
            # would be overwritten by unrelated lines of the file.
            if not line.startswith("<LearnRateCoef>"):
                continue

            # Skip "<LearnRateCoef>", the coefficient, and "[" at the front
            # and "]" at the end.
            t = line.split()[3:-1]

            if neg_mean is None:
                neg_mean = ",".join(t)
            else:
                inv_stddev = ",".join(t)

    if neg_mean is None or inv_stddev is None:
        raise ValueError(
            f"Expected two lines starting with <LearnRateCoef> in {filename}"
        )

    return neg_mean, inv_stddev


def generate_tokens(params):
    """Write ``tokens.txt`` to the current directory.

    Each line has the form ``<piece> <id>``, with ids running from 0 to
    ``vocab_size - 1`` of the sentencepiece processor ``params["tokenizer"].sp``.
    The head and the tail of the generated file are printed for inspection.

    Args:
      params:
        The second value returned by ``SenseVoiceSmall.from_pretrained``.
    """
    sp = params["tokenizer"].sp
    with open("tokens.txt", "w", encoding="utf-8") as f:
        for i in range(sp.vocab_size()):
            f.write(f"{sp.id_to_piece(i)} {i}\n")

    os.system("head tokens.txt; tail -n200 tokens.txt")


def display_params(params):
    """Print the model parameters and dump the files they refer to.

    Besides ``params`` itself, it prints ``params["frontend_conf"]`` and
    ``params["config"]``, and uses ``cat`` to show the content of the CMVN
    file and of the config file.

    Args:
      params:
        The second value returned by ``SenseVoiceSmall.from_pretrained``.
    """
    print("----------params----------")
    print(params)

    print("----------frontend_conf----------")
    print(params["frontend_conf"])

    os.system(f"cat {params['frontend_conf']['cmvn_file']}")

    print("----------config----------")
    print(params["config"])

    os.system(f"cat {params['config']}")


def main():
    """Export iic/SenseVoiceSmall to ``model.onnx`` and ``model.int8.onnx``.

    Steps:
      1. Load the pretrained model and print its parameters.
      2. Generate ``tokens.txt``.
      3. Replace ``forward`` with ``modified_forward`` and export to ONNX.
      4. Attach the metadata needed at inference time to ``model.onnx``.
      5. Produce ``model.int8.onnx`` by dynamic quantization of MatMul nodes.
    """
    model, params = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall")
    display_params(params)

    generate_tokens(params)

    # Patch the class so that torch.onnx.export() traces modified_forward().
    model.__class__.forward = modified_forward

    # Dummy inputs used only for tracing; N and T are exported as dynamic axes.
    x = torch.randn(2, 100, 560, dtype=torch.float32)
    x_length = torch.tensor([80, 100], dtype=torch.int32)
    language = torch.tensor([0, 3], dtype=torch.int32)
    text_norm = torch.tensor([14, 15], dtype=torch.int32)

    opset_version = 13
    filename = "model.onnx"
    torch.onnx.export(
        model,
        (x, x_length, language, text_norm),
        filename,
        opset_version=opset_version,
        input_names=["x", "x_length", "language", "text_norm"],
        output_names=["logits"],
        dynamic_axes={
            "x": {0: "N", 1: "T"},
            "x_length": {0: "N"},
            "language": {0: "N"},
            "text_norm": {0: "N"},
            "logits": {0: "N", 1: "T"},
        },
    )

    lfr_window_size = params["frontend_conf"]["lfr_m"]
    lfr_window_shift = params["frontend_conf"]["lfr_n"]

    neg_mean, inv_stddev = load_cmvn(params["frontend_conf"]["cmvn_file"])
    vocab_size = params["tokenizer"].sp.vocab_size()

    # All values are saved as strings inside the ONNX file, see add_meta_data()
    meta_data = {
        "lfr_window_size": lfr_window_size,
        "lfr_window_shift": lfr_window_shift,
        "neg_mean": neg_mean,
        "inv_stddev": inv_stddev,
        "model_type": "sense_voice_ctc",
        "version": "1",
        "model_author": "iic",
        "maintainer": "k2-fsa",
        "vocab_size": vocab_size,
        "comment": "iic/SenseVoiceSmall",
        "lang_auto": model.lid_dict["auto"],
        "lang_zh": model.lid_dict["zh"],
        "lang_en": model.lid_dict["en"],
        "lang_yue": model.lid_dict["yue"],  # cantonese
        "lang_ja": model.lid_dict["ja"],
        "lang_ko": model.lid_dict["ko"],
        "lang_nospeech": model.lid_dict["nospeech"],
        "with_itn": model.textnorm_dict["withitn"],
        "without_itn": model.textnorm_dict["woitn"],
        "url": "https://huggingface.co/FunAudioLLM/SenseVoiceSmall",
    }
    add_meta_data(filename=filename, meta_data=meta_data)

    # Quantize only the weights of MatMul nodes to int8.
    filename_int8 = "model.int8.onnx"
    quantize_dynamic(
        model_input=filename,
        model_output=filename_int8,
        op_types_to_quantize=["MatMul"],
        weight_type=QuantType.QInt8,
    )


if __name__ == "__main__":
    torch.manual_seed(20240717)
    main()
def show(filename):
    """Print the inputs, outputs and custom metadata of an ONNX model.

    Args:
      filename:
        Path to the ONNX model to inspect.
    """
    options = onnxruntime.SessionOptions()
    # NOTE(review): severity 3 is presumably "error", which silences
    # warnings while the model is loaded -- confirm with the onnxruntime doc
    options.log_severity_level = 3
    session = onnxruntime.InferenceSession(filename, options)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)

    # Fetch the metadata before printing the separator, so nothing is
    # printed for this section if the lookup fails.
    custom_meta = session.get_modelmeta().custom_metadata_map
    print("*****************************************")
    print("meta\n", custom_meta)


def main():
    """Show the information of ``./model.onnx`` in the current directory."""
    print("=========model==========")
    show("./model.onnx")


if __name__ == "__main__":
    main()
"""
=========model==========
NodeArg(name='x', type='tensor(float)', shape=['N', 'T', 560])
NodeArg(name='x_length', type='tensor(int32)', shape=['N'])
NodeArg(name='language', type='tensor(int32)', shape=['N'])
NodeArg(name='text_norm', type='tensor(int32)', shape=['N'])
-----
NodeArg(name='logits', type='tensor(float)', shape=['N', 'T', 25055])
*****************************************
"""