Export kokoro 1.0 int8 models (#2137)

2025-04-20 14:35:02 +08:00
parent be0f382a54
commit 6cabaa11bf
7 changed files with 212 additions and 99 deletions
--- a/.github/workflows/export-kokoro.yaml
+++ b/.github/workflows/export-kokoro.yaml
@@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
 on:
  push:
    branches:
-      - export-kokoro-2
+      - fix-export-kokoro-1.0-2

  workflow_dispatch:

@@ -111,6 +111,26 @@ jobs:

          ls -lh $d.tar.bz2

+          d=kokoro-int8-multi-lang-v1_0
+          mkdir $d
+          cp -v LICENSE $d/LICENSE
+          cp -a espeak-ng-data $d/
+          cp -v $src/kokoro.int8.onnx $d/model.int8.onnx
+          cp -v $src/voices.bin $d/
+          cp -v $src/tokens.txt $d/
+          cp -v $src/lexicon*.txt $d/
+          cp -v $src/README.md $d/README.md
+          cp -av dict $d/
+          cp -v ./*.fst $d/
+          ls -lh $d/
+          echo "---"
+          ls -lh $d/dict
+
+          tar cjfv $d.tar.bz2 $d
+          rm -rf $d
+
+          ls -lh $d.tar.bz2
+
      - name: Collect results 1.1-zh
        if: matrix.version == '1.1-zh'
        shell: bash
@@ -166,6 +186,25 @@ jobs:
          echo "---"
          ls -lh *.tar.bz2

+      # Upload step used when the workflow runs in the csukuangfj fork.
+      # Every ./*.tar.bz2 in the workspace is uploaded to the `tts-models`
+      # release of k2-fsa/sherpa-onnx, authenticating with the
+      # UPLOAD_GH_SHERPA_ONNX_TOKEN secret. There is no matrix.version
+      # condition, so this runs for every matrix entry.
+      - name: Release
+        if: github.repository_owner == 'csukuangfj'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: tts-models
+
+      # Upload step used when the workflow runs under the k2-fsa owner.
+      # Every ./*.tar.bz2 in the workspace is uploaded to the `tts-models`
+      # release tag. No repo_name/repo_token is given here, so the action
+      # presumably falls back to its defaults (the current repository).
+      - name: Release
+        if: github.repository_owner == 'k2-fsa'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./*.tar.bz2
+          overwrite: true
+          tag: tts-models

      - name: Publish to huggingface 0.19
        if: matrix.version == '0.19'
@@ -216,7 +255,7 @@ jobs:
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true

-      - name: Publish to huggingface 1.0
+      - name: Publish to huggingface 1.0 float32
        if: matrix.version == '1.0'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -267,6 +306,69 @@ jobs:
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true

+      # Publishes the int8 build of the 1.0 model to the Hugging Face repo
+      # csukuangfj/kokoro-int8-multi-lang-v1_0 (matrix.version '1.0' only).
+      # The command clones that repo, clears its working tree, registers the
+      # espeak-ng dictionaries, *.wav, lexicon*.txt and *.onnx with git-lfs,
+      # copies in espeak-ng-data, the quantized model (renamed to
+      # model.int8.onnx), tokens.txt, voices.bin, lexicon*.txt, README.md,
+      # LICENSE, dict/ and *.fst, then commits and pushes to `main`.
+      # NOTE(review): the trailing `|| true` makes a failed push exit 0, so the
+      # retry wrapper presumably never re-runs on push errors - confirm that
+      # this best-effort behavior is intended.
+      - name: Publish to huggingface 1.0 int8
+        if: matrix.version == '1.0'
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+            export GIT_CLONE_PROTECTION_ACTIVE=false
+
+            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_0 huggingface
+            cd huggingface
+            rm -rf ./*
+            git fetch
+            git pull
+
+            git lfs track "cmn_dict"
+            git lfs track "ru_dict"
+            git lfs track "af_dict"
+            git lfs track "ar_dict"
+            git lfs track "da_dict"
+            git lfs track "en_dict"
+            git lfs track "fa_dict"
+            git lfs track "hu_dict"
+            git lfs track "ia_dict"
+            git lfs track "it_dict"
+            git lfs track "lb_dict"
+            git lfs track "phondata"
+            git lfs track "ta_dict"
+            git lfs track "ur_dict"
+            git lfs track "yue_dict"
+            git lfs track "*.wav"
+            git lfs track "lexicon*.txt"
+
+            cp -a ../espeak-ng-data ./
+
+            cp -v ../scripts/kokoro/v1.0/kokoro.int8.onnx ./model.int8.onnx
+
+            cp -v ../scripts/kokoro/v1.0/tokens.txt .
+            cp -v ../scripts/kokoro/v1.0/voices.bin .
+            cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
+            cp -v ../scripts/kokoro/v1.0/README.md ./README.md
+            cp -v ../LICENSE ./
+            cp -av ../dict ./
+            cp -v ../*.fst ./
+
+            git lfs track "*.onnx"
+            git add .
+
+            ls -lh
+
+            git status
+
+            git commit -m "add models"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_0 main || true
+
      - name: Publish to huggingface 1.1-zh
        if: matrix.version == '1.1-zh'
        env:
@@ -299,7 +401,6 @@ jobs:

            cp -v ../scripts/kokoro/v1.1-zh/kokoro.onnx ./model.onnx

-
            cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
            cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
            cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
@@ -350,7 +451,6 @@ jobs:

            cp -v ../scripts/kokoro/v1.1-zh/kokoro.int8.onnx ./model.int8.onnx

-
            cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
            cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
            cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
@@ -368,23 +468,3 @@ jobs:

            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 main || true
-
-      - name: Release
-        if: github.repository_owner == 'csukuangfj'
-        uses: svenstaro/upload-release-action@v2
-        with:
-          file_glob: true
-          file: ./*.tar.bz2
-          overwrite: true
-          repo_name: k2-fsa/sherpa-onnx
-          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
-          tag: tts-models
-
-      - name: Release
-        if: github.repository_owner == 'k2-fsa'
-        uses: svenstaro/upload-release-action@v2
-        with:
-          file_glob: true
-          file: ./*.tar.bz2
-          overwrite: true
-          tag: tts-models
--- a/scripts/kokoro/v1.0/add_meta_data.py
+++ b/scripts/kokoro/v1.0/add_meta_data.py
@@ -10,7 +10,9 @@ from generate_voices_bin import speaker2id

 def main():
    model = onnx.load("./kokoro.onnx")
-    style = torch.load("./voices/af_alloy.pt", weights_only=True, map_location="cpu")
+    style = torch.load(
+        "./Kokoro-82M/voices/af_alloy.pt", weights_only=True, map_location="cpu"
+    )

    id2speaker_str = ""
    speaker2id_str = ""
--- a/scripts/kokoro/v1.0/dynamic_quantization.py
+++ b/scripts/kokoro/v1.0/dynamic_quantization.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import argparse
+
+import onnxruntime
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def show(filename):
+    """Print every input and output of the ONNX model stored at ``filename``.
+
+    Inputs are printed first, one per line, followed by a ``-----`` separator
+    line and then the outputs.
+    """
+    options = onnxruntime.SessionOptions()
+    # NOTE(review): severity 3 presumably restricts onnxruntime logging to
+    # errors - confirm against the onnxruntime API docs.
+    options.log_severity_level = 3
+    session = onnxruntime.InferenceSession(filename, options)
+
+    for node_arg in session.get_inputs():
+        print(node_arg)
+
+    print("-----")
+
+    for node_arg in session.get_outputs():
+        print(node_arg)
+
+
+"""
+NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
+NodeArg(name='style', type='tensor(float)', shape=[1, 256])
+NodeArg(name='speed', type='tensor(float)', shape=[1])
+-----
+NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
+"""
+
+
+def main():
+    """Print the I/O signature of kokoro.onnx, then write a quantized copy.
+
+    Both paths are relative to the current working directory; the dynamically
+    quantized model is written to kokoro.int8.onnx with QUInt8 weights.
+    """
+    show("./kokoro.onnx")
+
+    quantize_args = {
+        "model_input": "kokoro.onnx",
+        "model_output": "kokoro.int8.onnx",
+        #  "op_types_to_quantize": ["MatMul"],
+        "weight_type": QuantType.QUInt8,
+    }
+    quantize_dynamic(**quantize_args)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.0/export_onnx.py
+++ b/scripts/kokoro/v1.0/export_onnx.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import json
+
+import torch
+from kokoro import KModel
+from kokoro.model import KModelForONNX
+
+
+@torch.no_grad()
+def main():
+    """Export the Kokoro-82M v1.0 PyTorch checkpoint to ``kokoro.onnx``.
+
+    Reads ``Kokoro-82M/config.json`` and ``Kokoro-82M/kokoro-v1_0.pth`` relative
+    to the current working directory, wraps the loaded model in
+    ``KModelForONNX`` and exports it using dummy inputs. The exported graph has
+    the inputs ``tokens``, ``style`` and ``speed`` and a single output
+    ``audio``; axis 1 of ``tokens`` and axis 0 of ``audio`` are declared
+    dynamic.
+    """
+    # Read the JSON config as UTF-8 explicitly so the result does not depend
+    # on the platform's locale-specific default text encoding.
+    with open("Kokoro-82M/config.json", encoding="utf-8") as f:
+        config = json.load(f)
+
+    model = (
+        KModel(
+            repo_id="not-used-any-value-is-ok",
+            model="Kokoro-82M/kokoro-v1_0.pth",
+            config=config,
+            disable_complex=True,
+        )
+        .to("cpu")
+        .eval()
+    )
+
+    # Dummy inputs, used only to drive the export: 48 random token ids drawn
+    # from [1, 100) with a 0 placed before and after them, giving shape (1, 50).
+    x = torch.randint(1, 100, (48,)).numpy()
+    x = torch.LongTensor([[0, *x, 0]])
+
+    style = torch.rand(1, 256, dtype=torch.float32)
+    speed = torch.rand(1)
+
+    print(x.shape, x.dtype)
+    print(style.shape, style.dtype)
+    print(speed, speed.dtype)
+
+    model2 = KModelForONNX(model)
+
+    torch.onnx.export(
+        model2,
+        (x, style, speed),
+        "kokoro.onnx",
+        input_names=["tokens", "style", "speed"],
+        output_names=["audio"],
+        dynamic_axes={
+            "tokens": {1: "sequence_length"},
+            "audio": {0: "audio_length"},
+        },
+        opset_version=14,  # minimum working version for this kokoro model is 14
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.0/generate_tokens.py
+++ b/scripts/kokoro/v1.0/generate_tokens.py
@@ -6,7 +6,7 @@ import json


 def main():
-    with open("config.json") as f:
+    with open("Kokoro-82M/config.json") as f:
        config = json.load(f)
    vocab = config["vocab"]

--- a/scripts/kokoro/v1.0/generate_voices_bin.py
+++ b/scripts/kokoro/v1.0/generate_voices_bin.py
@@ -71,7 +71,7 @@ def main():
    with open("voices.bin", "wb") as f:
        for _, speaker in id2speaker.items():
            m = torch.load(
-                f"voices/{speaker}.pt",
+                f"Kokoro-82M/voices/{speaker}.pt",
                weights_only=True,
                map_location="cpu",
            ).numpy()
--- a/scripts/kokoro/v1.0/run.sh
+++ b/scripts/kokoro/v1.0/run.sh
@@ -3,93 +3,29 @@

 set -ex

-if [ ! -f kokoro.onnx ]; then
-  # see https://github.com/taylorchu/kokoro-onnx/releases
-  curl -SL -O https://github.com/taylorchu/kokoro-onnx/releases/download/v0.2.0/kokoro.onnx
-fi
+# Clone the upstream repository only once. An unconditional `git clone` fails
+# when ./Kokoro-82M already exists, and with `set -e` that would abort every
+# re-run of this script before the guarded steps below get a chance to run.
+if [ ! -d Kokoro-82M ]; then
+  git clone https://huggingface.co/hexgrad/Kokoro-82M
+fi

-if [ ! -f config.json ]; then
-  # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
-  curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/config.json
-fi
-
-# see https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L83
-# and
 # https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
 #
 # af -> American female
 # am -> American male
 # bf -> British female
 # bm -> British male
-voices=(
-af_alloy
-af_aoede
-af_bella
-af_heart
-af_jessica
-af_kore
-af_nicole
-af_nova
-af_river
-af_sarah
-af_sky
-am_adam
-am_echo
-am_eric
-am_fenrir
-am_liam
-am_michael
-am_onyx
-am_puck
-am_santa
-bf_alice
-bf_emma
-bf_isabella
-bf_lily
-bm_daniel
-bm_fable
-bm_george
-bm_lewis
-ef_dora
-em_alex
-ff_siwis
-hf_alpha
-hf_beta
-hm_omega
-hm_psi
-if_sara
-im_nicola
-jf_alpha
-jf_gongitsune
-jf_nezumi
-jf_tebukuro
-jm_kumo
-pf_dora
-pm_alex
-pm_santa
-zf_xiaobei # 东北话
-zf_xiaoni
-zf_xiaoxiao
-zf_xiaoyi
-zm_yunjian
-zm_yunxi
-zm_yunxia
-zm_yunyang
-)

-mkdir -p voices
+if [ ! -f ./kokoro.onnx ]; then
+  python3 ./export_onnx.py
+fi

-for v in ${voices[@]}; do
-  if [ ! -f voices/$v.pt ]; then
-    curl -SL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/$v.pt
-  fi
-done

 if [ ! -f ./.add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch ./.add-meta-data.done
 fi

+if [ ! -f ./kokoro.int8.onnx ]; then
+  python3 ./dynamic_quantization.py
+fi
+
 if [ ! -f us_gold.json ]; then
  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json
 fi