export sense-voice to onnx (#1144)

2024-07-18 00:18:38 +08:00
parent 4198d9a166
commit 346f419f39
7 changed files with 391 additions and 3 deletions
--- a/.github/workflows/export-melo-tts-to-onnx.yaml
+++ b/.github/workflows/export-melo-tts-to-onnx.yaml
@@ -40,7 +40,7 @@ jobs:
          name: test.wav
          path: scripts/melo-tts/test.wav

-      - name: Publish to huggingface (aishell)
+      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
--- a/.github/workflows/export-sense-voice-to-onnx.yaml
+++ b/.github/workflows/export-sense-voice-to-onnx.yaml
@@ -0,0 +1,116 @@
+name: export-sense-voice-to-onnx
+# Manual trigger only: workflow_dispatch is the sole event declared under "on".
+on:
+  workflow_dispatch:
+# Runs are grouped per ref; starting a new run cancels the one still in progress.
+concurrency:
+  group: export-sense-voice-to-onnx-${{ github.ref }}
+  cancel-in-progress: true
+# The job below is skipped unless the repository owner is k2-fsa or csukuangfj.
+jobs:
+  export-sense-voice-to-onnx:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
+    name: export sense-voice
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      # Fetch five sample MP3s and convert each one to a 16 kHz WAV in the step's working directory.
+      - name: Download test_wavs
+        shell: bash
+        run: |
+          sudo apt-get install -y -qq sox libsox-fmt-mp3
+          curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/zh.mp3
+          curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/en.mp3
+          curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ja.mp3
+          curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/ko.mp3
+          curl -SL -O https://huggingface.co/FunAudioLLM/SenseVoiceSmall/resolve/main/example/yue.mp3
+
+          soxi *.mp3
+
+          sox zh.mp3 -r 16k zh.wav
+          sox en.mp3 -r 16k en.wav
+          sox ja.mp3 -r 16k ja.wav
+          sox ko.mp3 -r 16k ko.wav
+          sox yue.mp3 -r 16k yue.wav
+      # run.sh installs the dependencies, then runs export-onnx.py and show-info.py.
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/sense-voice
+          ./run.sh
+
+      - name: Publish to huggingface
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+
+            git clone https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 huggingface
+            cd huggingface
+            git fetch
+            git pull
+            echo "pwd: $PWD"
+            ls -lh ../scripts/sense-voice
+            # Empty the checkout before copying ("rm -rf ./" is refused by rm); the glob skips dotfiles, so .git* stays.
+            rm -rf ./*
+
+            cp -v ../scripts/sense-voice/*.onnx .
+            cp -v ../scripts/sense-voice/tokens.txt .
+            cp -v ../scripts/sense-voice/README.md .
+            cp -v ../scripts/sense-voice/export-onnx.py .
+
+            mkdir test_wavs
+            cp -v ../*.wav ./test_wavs/
+
+            curl -SL -O https://raw.githubusercontent.com/FunAudioLLM/SenseVoice/main/LICENSE
+
+            git lfs track "*.onnx"
+            git add .
+
+            ls -lh
+
+            git status
+
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17 main || true
+            # Strip the git metadata and pack the published files into $dst.tar.bz2 for the Release step.
+            cd ..
+
+            rm -rf huggingface/.git*
+            dst=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
+
+            mv huggingface $dst
+
+            tar cjvf $dst.tar.bz2 $dst
+            rm -rf $dst
+
+      - name: Release  # uploads the .tar.bz2 archive(s) found in the workspace root
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true  # "file" below is a glob pattern, not a single path
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx  # assets go to this repository, under the tag given below
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: asr-models
--- a/scripts/melo-tts/run.sh
+++ b/scripts/melo-tts/run.sh
@@ -2,8 +2,6 @@

 set -ex

-
-
 function install() {
  pip install torch==2.3.1+cpu torchaudio==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

--- a/scripts/sense-voice/README.md
+++ b/scripts/sense-voice/README.md
@@ -0,0 +1,4 @@
+# Introduction
+
+This directory contains models converted from
+https://github.com/FunAudioLLM/SenseVoice
--- a/scripts/sense-voice/export-onnx.py
+++ b/scripts/sense-voice/export-onnx.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# Copyright      2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+"""
+We use
+https://hf-mirror.com/yuekai/model_repo_sense_voice_small/blob/main/export_onnx.py
+as a reference while writing this file.
+
+Thanks to https://github.com/yuekaizhang for making the file public.
+"""
+
+import os
+from typing import Any, Dict, Tuple
+
+import onnx
+import torch
+from model import SenseVoiceSmall
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, Any]):
+    """Replace the metadata of an ONNX file with the given key-value pairs.
+
+    Existing metadata entries are dropped and the file is rewritten in place.
+
+    Args:
+      filename:
+        Path of the ONNX model; it is overwritten.
+      meta_data:
+        Entries to store. Each value is converted with ``str``.
+    """
+    onnx_model = onnx.load(filename)
+    props = onnx_model.metadata_props
+    del props[:]
+    for name, content in meta_data.items():
+        entry = props.add()
+        entry.key = name
+        entry.value = str(content)
+    onnx.save(onnx_model, filename)
+
+
+def modified_forward(
+    self,
+    x: torch.Tensor,
+    x_length: torch.Tensor,
+    language: torch.Tensor,
+    text_norm: torch.Tensor,
+):
+    """Export-time forward: prepend 4 query embeddings to x and return CTC logits.
+
+    Args:
+      x:
+        A 3-D tensor of shape (N, T, C) with dtype torch.float32
+      x_length:
+        A 1-D tensor of shape (N,) with dtype torch.int32. It is not modified.
+      language:
+        A 1-D tensor of shape (N,) with dtype torch.int32
+        See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L640
+      text_norm:
+        A 1-D tensor of shape (N,) with dtype torch.int32
+        See also https://github.com/FunAudioLLM/SenseVoice/blob/a80e676461b24419cf1130a33d4dd2f04053e5cc/model.py#L642
+
+    Returns:
+      The output of self.ctc.ctc_lo applied to the encoder output.
+    """
+    language_query = self.embed(language).unsqueeze(1)
+    text_norm_query = self.embed(text_norm).unsqueeze(1)
+    event_emo_query = self.embed(torch.LongTensor([[1, 2]])).repeat(x.size(0), 1, 1)
+    x = torch.cat((language_query, event_emo_query, text_norm_query, x), dim=1)
+    # 4 frames were prepended above (1 language + 2 event/emotion + 1 text-norm).
+    # Add out-of-place: "+=" would overwrite the tensor passed in by the caller.
+    x_length = x_length + 4
+    encoder_out, _ = self.encoder(x, x_length)
+    if isinstance(encoder_out, tuple):
+        encoder_out = encoder_out[0]
+    return self.ctc.ctc_lo(encoder_out)
+
+
+def load_cmvn(filename) -> Tuple[str, str]:
+    """Read (neg_mean, inv_stddev) as comma-joined strings from a CMVN file.
+
+    Only lines starting with ``<LearnRateCoef>`` are used: the first one gives
+    neg_mean and the last one gives inv_stddev.
+
+    Raises:
+      ValueError: if fewer than two such lines exist (one value would be None).
+    """
+    with open(filename) as f:
+        # From each matching line, drop the first three whitespace-separated
+        # fields and the final one; join what is left with commas.
+        rows = [",".join(s.split()[3:-1]) for s in f if s.startswith("<LearnRateCoef>")]
+    if len(rows) < 2:
+        raise ValueError(f"{filename}: need 2 <LearnRateCoef> lines, got {len(rows)}")
+    return rows[0], rows[-1]
+
+
+def generate_tokens(params):
+    """Write "<piece> <id>" for every tokenizer id to tokens.txt, then preview the file."""
+    vocab = params["tokenizer"].sp
+    with open("tokens.txt", "w", encoding="utf-8") as out:
+        out.writelines(f"{vocab.id_to_piece(k)} {k}\n" for k in range(vocab.vocab_size()))
+
+    os.system("head tokens.txt; tail -n200 tokens.txt")
+
+
+def display_params(params):
+    """Print params plus its frontend_conf/config entries and cat the files they name."""
+    print("----------params----------")
+    print(params)
+    print("----------frontend_conf----------")
+    frontend_conf = params["frontend_conf"]
+    print(frontend_conf)
+    os.system(f"cat {frontend_conf['cmvn_file']}")
+
+    print("----------config----------")
+    config_file = params["config"]
+    print(config_file)
+    os.system(f"cat {config_file}")
+
+
+def main():  # exports iic/SenseVoiceSmall to model.onnx (with metadata) and model.int8.onnx
+    model, params = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall")
+    display_params(params)
+    # Writes tokens.txt into the current directory.
+    generate_tokens(params)
+    # Patch forward on the class so that the export below traces modified_forward.
+    model.__class__.forward = modified_forward
+    # Example inputs used for tracing: N=2, T=100, C=560 (N and T are dynamic axes below).
+    x = torch.randn(2, 100, 560, dtype=torch.float32)
+    x_length = torch.tensor([80, 100], dtype=torch.int32)
+    language = torch.tensor([0, 3], dtype=torch.int32)
+    text_norm = torch.tensor([14, 15], dtype=torch.int32)
+    # NOTE(review): no explicit model.eval() before export - confirm from_pretrained returns an eval-mode model.
+    opset_version = 13
+    filename = "model.onnx"
+    torch.onnx.export(
+        model,
+        (x, x_length, language, text_norm),
+        filename,
+        opset_version=opset_version,
+        input_names=["x", "x_length", "language", "text_norm"],
+        output_names=["logits"],
+        dynamic_axes={
+            "x": {0: "N", 1: "T"},
+            "x_length": {0: "N"},
+            "language": {0: "N"},
+            "text_norm": {0: "N"},
+            "logits": {0: "N", 1: "T"},
+        },
+    )
+    # Front-end settings, CMVN values and the language/text-norm ids are stored as model metadata.
+    lfr_window_size = params["frontend_conf"]["lfr_m"]
+    lfr_window_shift = params["frontend_conf"]["lfr_n"]
+
+    neg_mean, inv_stddev = load_cmvn(params["frontend_conf"]["cmvn_file"])
+    vocab_size = params["tokenizer"].sp.vocab_size()
+
+    meta_data = {
+        "lfr_window_size": lfr_window_size,
+        "lfr_window_shift": lfr_window_shift,
+        "neg_mean": neg_mean,
+        "inv_stddev": inv_stddev,
+        "model_type": "sense_voice_ctc",
+        "version": "1",
+        "model_author": "iic",
+        "maintainer": "k2-fsa",
+        "vocab_size": vocab_size,
+        "comment": "iic/SenseVoiceSmall",
+        "lang_auto": model.lid_dict["auto"],
+        "lang_zh": model.lid_dict["zh"],
+        "lang_en": model.lid_dict["en"],
+        "lang_yue": model.lid_dict["yue"],  # cantonese
+        "lang_ja": model.lid_dict["ja"],
+        "lang_ko": model.lid_dict["ko"],
+        "lang_nospeech": model.lid_dict["nospeech"],
+        "with_itn": model.textnorm_dict["withitn"],
+        "without_itn": model.textnorm_dict["woitn"],
+        "url": "https://huggingface.co/FunAudioLLM/SenseVoiceSmall",
+    }
+    add_meta_data(filename=filename, meta_data=meta_data)
+    # Also write a variant whose MatMul ops are dynamically quantized with int8 weights.
+    filename_int8 = "model.int8.onnx"
+    quantize_dynamic(
+        model_input=filename,
+        model_output=filename_int8,
+        op_types_to_quantize=["MatMul"],
+        weight_type=QuantType.QInt8,
+    )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(20240717)  # fixed seed: main() draws its example input with torch.randn
+    main()
--- a/scripts/sense-voice/run.sh
+++ b/scripts/sense-voice/run.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+set -ex
+
+
+function install() {
+  pip install torch==2.3.1+cpu torchaudio==2.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+  # Both repositories are cloned under /tmp; popd restores the caller's directory.
+  pushd /tmp
+  # FunASR is installed in editable mode from its checkout (same pip as the torch install).
+  git clone https://github.com/alibaba/FunASR.git
+  cd FunASR
+  pip install -qq -e ./
+  cd ..
+  # Only SenseVoice's requirements are installed; the script adds /tmp/SenseVoice to PYTHONPATH later.
+  git clone https://github.com/FunAudioLLM/SenseVoice
+  cd SenseVoice
+  pip install -qq -r ./requirements.txt
+  cd ..
+
+  pip install soundfile onnx onnxruntime kaldi-native-fbank librosa
+
+  popd
+}
+
+install
+# Put the two checkouts that install() made under /tmp on the Python import path.
+export PYTHONPATH=/tmp/FunASR:$PYTHONPATH
+export PYTHONPATH=/tmp/SenseVoice:$PYTHONPATH
+
+echo "pwd: $PWD"
+# Writes tokens.txt, model.onnx and model.int8.onnx into the current directory.
+./export-onnx.py
+# Prints the inputs, outputs and metadata of ./model.onnx.
+./show-info.py
+
+ls -lh
--- a/scripts/sense-voice/show-info.py
+++ b/scripts/sense-voice/show-info.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# Copyright      2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import onnxruntime
+
+
+def show(filename):
+    """Print every input, every output and the custom metadata of an ONNX model."""
+    options = onnxruntime.SessionOptions()
+    options.log_severity_level = 3
+    session = onnxruntime.InferenceSession(filename, options)
+
+    for node in session.get_inputs():
+        print(node)
+    print("-----")
+    for node in session.get_outputs():
+        print(node)
+
+    custom_meta = session.get_modelmeta().custom_metadata_map
+    print("*****************************************")
+    print("meta\n", custom_meta)
+
+
+def main():  # prints a banner, then the description of ./model.onnx produced by show()
+    print("=========model==========")
+    show("./model.onnx")
+
+
+if __name__ == "__main__":
+    main()
+"""
+=========model==========
+NodeArg(name='x', type='tensor(float)', shape=['N', 'T', 560])
+NodeArg(name='x_length', type='tensor(int32)', shape=['N'])
+NodeArg(name='language', type='tensor(int32)', shape=['N'])
+NodeArg(name='text_norm', type='tensor(int32)', shape=['N'])
+-----
+NodeArg(name='logits', type='tensor(float)', shape=['N', 'T', 25055])
+*****************************************
+"""  # bare string, never used; it looks like sample output of this script