Export Kokoro 1.0 to sherpa-onnx (#1788)

2025-02-05 08:24:43 +08:00
parent 8677d83efc
commit 08cefe8488
13 changed files with 706 additions and 14 deletions
--- a/.github/workflows/export-kokoro.yaml
+++ b/.github/workflows/export-kokoro.yaml
@@ -4,6 +4,7 @@ on:
  push:
    branches:
      - export-kokoro
      - kokoro-1.0-2
  workflow_dispatch:
@@ -14,12 +15,13 @@ concurrency:
 jobs:
  export-kokoro-to-onnx:
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
-    name: export kokoro
+    name: export kokoro ${{ matrix.version }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        version: ["0.19", "1.0"]
        python-version: ["3.10"]
    steps:
@@ -33,7 +35,7 @@ jobs:
      - name: Install Python dependencies
        shell: bash
        run: |
-          pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+          pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
      - name: Run
        shell: bash
@@ -42,9 +44,16 @@ jobs:
          tar xf espeak-ng-data.tar.bz2
          rm espeak-ng-data.tar.bz2
          cd scripts/kokoro
-          ./run.sh
+          v=${{ matrix.version }}
          if [[ $v = "0.19" ]]; then
            ./run.sh
          elif [[ $v == "1.0" ]]; then
            cd v1.0
            ./run.sh
          fi
-      - name: Collect results
+      - name: Collect results ${{ matrix.version }}
        if: matrix.version == '0.19'
        shell: bash
        run: |
          src=scripts/kokoro
@@ -53,7 +62,7 @@ jobs:
          mkdir $d
          cp -a LICENSE $d/LICENSE
          cp -a espeak-ng-data $d/
-          cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
+          cp -v $src/kokoro-v0_19.onnx $d/model.onnx
          cp -v $src/voices.bin $d/
          cp -v $src/tokens.txt $d/
          cp -v $src/README-new.md $d/README.md
@@ -61,9 +70,31 @@ jobs:
          tar cjfv $d.tar.bz2 $d
          rm -rf $d
-          ls -h $.tar.bz2
+          ls -lh $d.tar.bz2
-      - name: Publish to huggingface
+      - name: Collect results ${{ matrix.version }}
        if: matrix.version == '1.0'
        shell: bash
        run: |
          src=scripts/kokoro/v1.0
          d=kokoro-multi-lang-v1_0
          mkdir $d
          cp -a LICENSE $d/LICENSE
          cp -a espeak-ng-data $d/
          cp -v $src/kokoro.onnx $d/model.onnx
          cp -v $src/voices.bin $d/
          cp -v $src/tokens.txt $d/
          cp -v $src/lexicon*.txt $d/
          cp -v $src/README.md $d/README.md
          ls -lh $d/
          tar cjfv $d.tar.bz2 $d
          rm -rf $d
          ls -lh $d.tar.bz2
      - name: Publish to huggingface ${{ matrix.version }}
        if: matrix.version == '0.19'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
@@ -92,9 +123,9 @@ jobs:
            cp -a ../espeak-ng-data ./
            mkdir -p test_wavs
-            cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx
+            cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx
-            cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/
+            cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/
            cp -v ../scripts/kokoro/tokens.txt .
            cp -v ../scripts/kokoro/voices.bin .
@@ -111,6 +142,55 @@ jobs:
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
      - name: Publish to huggingface ${{ matrix.version }}
        if: matrix.version == '1.0'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"
            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false
            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull
            git lfs track "cmn_dict"
            git lfs track "ru_dict"
            git lfs track "*.wav"
            git lfs track "lexicon*.txt"
            cp -a ../espeak-ng-data ./
            cp -v ../scripts/kokoro/v1.0/kokoro.onnx ./model.onnx
            cp -v ../scripts/kokoro/v1.0/tokens.txt .
            cp -v ../scripts/kokoro/v1.0/voices.bin .
            cp -v ../scripts/kokoro/v1.0/lexicon*.txt .
            cp -v ../scripts/kokoro/v1.0/README.md ./README.md
            cp -v ../LICENSE ./
            git lfs track "*.onnx"
            git add .
            ls -lh
            git status
            git commit -m "add models"
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
--- a/.gitignore
+++ b/.gitignore
@@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md
 matcha-icefall-zh-baker
 matcha-icefall-en_US-ljspeech
 kokoro-en-v0_19
 *.pt
 lexicon.txt
 us_gold.json
 us_silver.json
--- a/scripts/kokoro/add_meta_data.py
+++ b/scripts/kokoro/add_meta_data.py
@@ -69,6 +69,14 @@ def main():
            for k in keys:
                f.write(voices[k].tobytes())
    speaker2id_str = ""
    id2speaker_str = ""
    sep = ""
    for i, s in enumerate(keys):
        speaker2id_str += f"{sep}{s}->{i}"
        id2speaker_str += f"{sep}{i}->{s}"
        sep = ","
    meta_data = {
        "model_type": "kokoro",
        "language": "English",
@@ -78,6 +86,8 @@ def main():
        "voice": "en-us",
        "style_dim": ",".join(map(str, voices[keys[0]].shape)),
        "n_speakers": len(keys),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(keys),
        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
--- a/scripts/kokoro/run.sh
+++ b/scripts/kokoro/run.sh
@@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
 EOF
 files=(
-kokoro-v0_19_hf.onnx
+# kokoro-v0_19_hf.onnx
-# kokoro-v0_19.onnx
+kokoro-v0_19.onnx
 # kokoro-quant.onnx
 # kokoro-quant-convinteger.onnx
 voices.json
@@ -30,14 +30,14 @@ for f in ${files[@]}; do
 done
 models=(
-# kokoro-v0_19
+kokoro-v0_19
 # kokoro-quant
 # kokoro-quant-convinteger
-kokoro-v0_19_hf
+# kokoro-v0_19_hf
 )
 for m in ${models[@]}; do
-  ./add-meta-data.py --model $m.onnx --voices ./voices.json
+  ./add_meta_data.py --model $m.onnx --voices ./voices.json
 done
 ls -l
--- a/scripts/kokoro/v1.0/.gitignore
+++ b/scripts/kokoro/v1.0/.gitignore
@@ -0,0 +1,5 @@
 config.json
 *.json
 *.txt
 .add-meta-data.done
 voices
--- a/scripts/kokoro/v1.0/README.md
+++ b/scripts/kokoro/v1.0/README.md
@@ -0,0 +1,3 @@
 # Introduction
 This directory is for kokoro v1.0
--- a/scripts/kokoro/v1.0/init.py
+++ b/scripts/kokoro/v1.0/init.py
--- a/scripts/kokoro/v1.0/add_meta_data.py
+++ b/scripts/kokoro/v1.0/add_meta_data.py
@@ -0,0 +1,64 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 import argparse
 import json
 from pathlib import Path
 import numpy as np
 import onnx
 import torch
 from generate_voices_bin import speaker2id
 def main():
    """Rewrite the metadata of ``./kokoro.onnx`` in place.

    Any metadata already stored in the model is printed and discarded, then
    replaced with the entries sherpa-onnx reads at runtime (speaker tables,
    sample rate, style dimension, ...). The updated model is saved back to
    the same path.
    """
    onnx_model = onnx.load("./kokoro.onnx")

    # Only the shape of this tensor is used (for the "style_dim" entry).
    reference_style = torch.load(
        "./voices/af_alloy.pt", weights_only=True, map_location="cpu"
    )

    # Comma separated "name->id" / "id->name" pairs, in speaker2id order.
    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in speaker2id.items())
    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in speaker2id.items())

    meta_data = {
        "model_type": "kokoro",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 2,
        "voice": "en-us",
        "style_dim": ",".join(map(str, reference_style.shape)),
        "n_speakers": len(speaker2id),
        "id2speaker": id2speaker_str,
        "speaker2id": speaker2id_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
        "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
        "maintainer": "k2-fsa",
        "comment": "This is Kokoro v1.0, a multilingual TTS model, supporting English, Chinese, French, Japanese etc.",
    }

    print(onnx_model.metadata_props)

    # Drop whatever metadata the upstream export left behind.
    while len(onnx_model.metadata_props) > 0:
        onnx_model.metadata_props.pop()

    # ONNX metadata values are strings, so everything is stringified.
    for key, value in meta_data.items():
        entry = onnx_model.metadata_props.add()
        entry.key = key
        entry.value = str(value)

    print("--------------------")
    print(onnx_model.metadata_props)

    onnx.save(onnx_model, "./kokoro.onnx")
 if __name__ == "__main__":
    main()
--- a/scripts/kokoro/v1.0/generate_lexicon.py
+++ b/scripts/kokoro/v1.0/generate_lexicon.py
@@ -0,0 +1,89 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 import json
 from pypinyin import phrases_dict, pinyin_dict
 from misaki import zh
 from typing import List, Tuple
 def generate_english_lexicon(kind: str):
    """Build the English lexicon for one accent from the misaki dictionaries.

    Reads ``./{kind}_gold.json`` and ``./{kind}_silver.json`` from the current
    directory and merges them case-insensitively (keys are lower-cased).

    Priority, from highest to lowest:
      1. the ``user_defined`` entries below,
      2. the gold dictionary,
      3. the silver dictionary.

    Args:
      kind:
        Either "us" or "gb".

    Returns:
      A list of ``(word, phonemes)`` tuples. User defined words come first,
      followed by the dictionary words.
    """
    assert kind in ("us", "gb"), kind
    # If you want to add new words, please add them to
    # the user_defined dict.
    user_defined = {
        "Kokoro": "kˈOkəɹO",
        "Misaki": "misˈɑki",
    }

    user_defined_lower = {k.lower(): v for k, v in user_defined.items()}

    with open(f"./{kind}_gold.json", encoding="utf-8") as f:
        gold = json.load(f)

    with open(f"./{kind}_silver.json", encoding="utf-8") as f:
        silver = json.load(f)

    lexicon = dict()

    # Words in gold have a higher priority than those in silver, so silver is
    # applied first and gold second. The two dictionaries are processed
    # separately (instead of merging them before lower-casing) because keys
    # differing only in case, e.g. "The" in silver and "the" in gold, would
    # otherwise let the silver entry overwrite the gold one.
    for source in (silver, gold):
        for k, v in source.items():
            k_lower = k.lower()

            if k_lower in user_defined_lower:
                print(f"{k} already exist in the user defined dict. Skip adding")
                continue

            if isinstance(v, str):
                lexicon[k_lower] = v
            else:
                # Entries with several pronunciations are dicts; only the
                # "DEFAULT" pronunciation is exported.
                assert isinstance(v, dict), (k, v)
                assert "DEFAULT" in v, (k, v)
                lexicon[k_lower] = v["DEFAULT"]

    return list(user_defined_lower.items()) + list(lexicon.items())
 def generate_chinese_lexicon():
    """Build the Chinese lexicon from the pypinyin character and phrase tables.

    Every key of pypinyin's single character table that lies in the range
    U+4E00..U+9FFF, and every key of its phrase table, is converted to
    phonemes with misaki's ``ZHG2P``.

    Returns:
      A list of ``(word, phonemes)`` tuples: single characters first, then
      phrases.
    """
    single_chars = pinyin_dict.pinyin_dict
    multi_char_phrases = phrases_dict.phrases_dict

    g2p = zh.ZHG2P()

    entries = []

    # Keys of the character table are code points (they are passed to chr()).
    for code_point in single_chars:
        if code_point < 0x4E00 or code_point > 0x9FFF:
            continue
        char = chr(code_point)
        entries.append((char, g2p(char)))

    entries.extend((phrase, g2p(phrase)) for phrase in multi_char_phrases)

    return entries
 def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write a lexicon to ``filename``, one entry per line.

    Each line has the form ``word p1 p2 ... pn`` where ``p1..pn`` are the
    individual characters of the phoneme string, separated by single spaces.

    Args:
      filename:
        Path of the text file to create (UTF-8, overwritten if it exists).
      lexicon:
        Iterable of ``(word, phonemes)`` pairs.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{word} {' '.join(phones)}\n" for word, phones in lexicon
        )
 def main():
    """Generate the US English, GB English and Chinese lexicon files.

    All three lexicons are built first; the files are then written to the
    current directory as ``lexicon-us-en.txt``, ``lexicon-gb-en.txt`` and
    ``lexicon-zh.txt``.
    """
    us_lexicon = generate_english_lexicon("us")
    gb_lexicon = generate_english_lexicon("gb")
    zh_lexicon = generate_chinese_lexicon()

    outputs = (
        ("lexicon-us-en.txt", us_lexicon),
        ("lexicon-gb-en.txt", gb_lexicon),
        ("lexicon-zh.txt", zh_lexicon),
    )
    for filename, lexicon in outputs:
        save(filename, lexicon)
 if __name__ == "__main__":
    main()
--- a/scripts/kokoro/v1.0/generate_tokens.py
+++ b/scripts/kokoro/v1.0/generate_tokens.py
@@ -0,0 +1,19 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 import json
 def main():
    """Convert the ``vocab`` table of ``config.json`` into ``tokens.txt``.

    ``config.json`` is read from the current directory. Each entry of its
    ``vocab`` mapping is written to ``tokens.txt`` as one ``<token> <id>``
    line, in the order the entries appear in the JSON file.
    """
    # The vocabulary contains non-ASCII phoneme symbols, so the file must be
    # decoded as UTF-8 regardless of the platform's default encoding.
    with open("config.json", encoding="utf-8") as f:
        config = json.load(f)
    vocab = config["vocab"]

    with open("tokens.txt", "w", encoding="utf-8") as f:
        for k, i in vocab.items():
            f.write(f"{k} {i}\n")
 if __name__ == "__main__":
    main()
--- a/scripts/kokoro/v1.0/generate_voices_bin.py
+++ b/scripts/kokoro/v1.0/generate_voices_bin.py
@@ -0,0 +1,84 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 import torch
 from pathlib import Path
 # Integer speaker id -> voice name.
 #
 # main() below iterates this table when it writes voices.bin, so the id of a
 # speaker determines the position of its style tensor inside that file.
 # Do not reorder or renumber existing entries; append new voices at the end.
 id2speaker = {
    0: "af_alloy",
    1: "af_aoede",
    2: "af_bella",
    3: "af_heart",
    4: "af_jessica",
    5: "af_kore",
    6: "af_nicole",
    7: "af_nova",
    8: "af_river",
    9: "af_sarah",
    10: "af_sky",
    11: "am_adam",
    12: "am_echo",
    13: "am_eric",
    14: "am_fenrir",
    15: "am_liam",
    16: "am_michael",
    17: "am_onyx",
    18: "am_puck",
    19: "am_santa",
    20: "bf_alice",
    21: "bf_emma",
    22: "bf_isabella",
    23: "bf_lily",
    24: "bm_daniel",
    25: "bm_fable",
    26: "bm_george",
    27: "bm_lewis",
    28: "ef_dora",
    29: "em_alex",
    30: "ff_siwis",
    31: "hf_alpha",
    32: "hf_beta",
    33: "hm_omega",
    34: "hm_psi",
    35: "if_sara",
    36: "im_nicola",
    37: "jf_alpha",
    38: "jf_gongitsune",
    39: "jf_nezumi",
    40: "jf_tebukuro",
    41: "jm_kumo",
    42: "pf_dora",
    43: "pm_alex",
    44: "pm_santa",
    45: "zf_xiaobei",
    46: "zf_xiaoni",
    47: "zf_xiaoxiao",
    48: "zf_xiaoyi",
    49: "zm_yunjian",
    50: "zm_yunxi",
    51: "zm_yunxia",
    52: "zm_yunyang",
 }
 # Inverse table: voice name -> integer speaker id.
 speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
 def main():
    """Pack the style tensors of all speakers into a single ``voices.bin``.

    The tensors are read from ``./voices/<speaker>.pt`` and their raw bytes
    are concatenated in ascending speaker id order, so the i-th chunk of
    ``voices.bin`` belongs to speaker id ``i``.

    Nothing is done if ``./voices.bin`` already exists.
    """
    if Path("./voices.bin").is_file():
        print("./voices.bin exists - skip")
        return

    # Load every voice before creating the output file. If a voice file is
    # missing or corrupt we fail here, instead of leaving behind a truncated
    # voices.bin that the "exists - skip" check above would accept next time.
    chunks = []
    for idx in sorted(id2speaker):
        speaker = id2speaker[idx]
        m = torch.load(
            f"./voices/{speaker}.pt",
            weights_only=True,
            map_location="cpu",
        ).numpy()
        # m.shape (510, 1, 256)
        chunks.append(m.tobytes())

    with open("voices.bin", "wb") as f:
        for chunk in chunks:
            f.write(chunk)
 if __name__ == "__main__":
    main()
--- a/scripts/kokoro/v1.0/run.sh
+++ b/scripts/kokoro/v1.0/run.sh
@@ -0,0 +1,122 @@
 #!/usr/bin/env bash
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 set -ex
 if [ ! -f kokoro.onnx ]; then
  # see https://github.com/taylorchu/kokoro-onnx/releases
  curl -SL -O https://github.com/taylorchu/kokoro-onnx/releases/download/v0.2.0/kokoro.onnx
 fi
 if [ ! -f config.json ]; then
  # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
  curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/config.json
 fi
 # see https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L83
 # and
 # https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
 #
 # af -> American female
 # am -> American male
 # bf -> British female
 # bm -> British male
 voices=(
 af_alloy
 af_aoede
 af_bella
 af_heart
 af_jessica
 af_kore
 af_nicole
 af_nova
 af_river
 af_sarah
 af_sky
 am_adam
 am_echo
 am_eric
 am_fenrir
 am_liam
 am_michael
 am_onyx
 am_puck
 am_santa
 bf_alice
 bf_emma
 bf_isabella
 bf_lily
 bm_daniel
 bm_fable
 bm_george
 bm_lewis
 ef_dora
 em_alex
 ff_siwis
 hf_alpha
 hf_beta
 hm_omega
 hm_psi
 if_sara
 im_nicola
 jf_alpha
 jf_gongitsune
 jf_nezumi
 jf_tebukuro
 jm_kumo
 pf_dora
 pm_alex
 pm_santa
 zf_xiaobei # 东北话
 zf_xiaoni
 zf_xiaoxiao
 zf_xiaoyi
 zm_yunjian
 zm_yunxi
 zm_yunxia
 zm_yunyang
 )
 mkdir -p voices
 for v in ${voices[@]}; do
  if [ ! -f voices/$v.pt ]; then
    curl -SL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/$v.pt
  fi
 done
 if [ ! -f ./.add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch ./.add-meta-data.done
 fi
 if [ ! -f us_gold.json ]; then
  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json
 fi
 if [ ! -f us_silver.json ]; then
  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_silver.json
 fi
 if [ ! -f gb_gold.json ]; then
  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_gold.json
 fi
 if [ ! -f gb_silver.json ]; then
  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_silver.json
 fi
 if [ ! -f ./tokens.txt ]; then
  ./generate_tokens.py
 fi
 # generate_lexicon.py writes lexicon-us-en.txt, lexicon-gb-en.txt and
 # lexicon-zh.txt (there is no lexicon.txt), so check for those files.
 # Regenerate if any of them is missing.
 if [ ! -f ./lexicon-us-en.txt ] || [ ! -f ./lexicon-gb-en.txt ] || [ ! -f ./lexicon-zh.txt ]; then
  ./generate_lexicon.py
 fi
 if [ ! -f ./voices.bin ]; then
  ./generate_voices_bin.py
 fi
 ./test.py
 ls -lh
--- a/scripts/kokoro/v1.0/test.py
+++ b/scripts/kokoro/v1.0/test.py
@@ -0,0 +1,212 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
 import re
 import time
 from typing import Dict, List
 import jieba
 import numpy as np
 import onnxruntime as ort
 import soundfile as sf
 import torch
 from misaki import zh
 try:
    from piper_phonemize import phonemize_espeak
 except Exception as ex:
    raise RuntimeError(
        f"{ex}\nPlease run\n"
        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
    )
 def show(filename):
    """Print the inputs and the outputs of the ONNX model at ``filename``.

    Inputs are printed first, followed by a ``-----`` separator line and then
    the outputs.
    """
    opts = ort.SessionOptions()
    # Severity 3 is "error" on onnxruntime's logging scale.
    opts.log_severity_level = 3
    session = ort.InferenceSession(filename, opts)

    for node in session.get_inputs():
        print(node)

    print("-----")

    for node in session.get_outputs():
        print(node)
 """
 NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
 NodeArg(name='style', type='tensor(float)', shape=[1, 256])
 NodeArg(name='speed', type='tensor(float)', shape=[1])
 -----
 NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
 """
 def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
    """Load the per-speaker style tensors from a ``voices.bin`` file.

    The file is interpreted as a contiguous float32 array of shape
    ``(len(speaker_names), *dim)``.

    Args:
      speaker_names:
        Speaker names, in the order their tensors are stored in the file.
      dim:
        Shape of the tensor of a single speaker.
      voices_bin:
        Path to the binary file.

    Returns:
      A dict mapping each speaker name to its array of shape ``dim``.
    """
    raw = np.fromfile(voices_bin, dtype="uint8")
    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
    print("embedding.shape", embedding.shape)

    return {name: embedding[i] for i, name in enumerate(speaker_names)}
 def load_tokens(filename: str) -> Dict[str, int]:
    """Read a ``tokens.txt`` file into a token -> id mapping.

    Every line is expected to look like ``<token> <id>``. A line with a single
    field is taken to be the entry of the space token (the token itself is
    lost when the line is split on whitespace), so its only field is the id.

    Args:
      filename:
        Path to the UTF-8 encoded tokens file.

    Returns:
      A dict mapping each token to its integer id.
    """
    token2id = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                # Only the space token may produce a one-field line.
                assert len(parts) == 1, (len(parts), line)
                token2id[" "] = int(parts[0])
                continue

            symbol, index = parts
            token2id[symbol] = int(index)
    return token2id
 def load_lexicon(filename: str) -> Dict[str, str]:
    """Load one or more lexicon files into a word -> phoneme-string mapping.

    Each line of a lexicon file has the form ``word p1 p2 ... pn``. The
    space separated phonemes are joined back into a single string.

    Blank lines and lines that contain a word but no phonemes are skipped, so
    such words are treated as not being in the lexicon.

    Args:
      filename:
        A single path, or several paths separated by commas. When a word
        appears more than once, the last occurrence wins.

    Returns:
      A dict mapping each word to its phoneme string.
    """
    ans = dict()
    for lexicon in filename.split(","):
        print(lexicon)
        with open(lexicon, encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    # Blank line, or a word without any pronunciation.
                    continue
                ans[fields[0]] = "".join(fields[1:])
    return ans
 class OnnxModel:
    """Run the Kokoro ONNX model on mixed Chinese/English text.

    Text is converted to phonemes with the lexicon (falling back to espeak-ng
    for unknown English words), the phonemes are mapped to token ids, and the
    model is run with the style tensor of the requested voice.
    """

    # Characters that are passed through to the model as tokens of their own.
    _PUNCTUATIONS = ';:,.!?-…()"“”'

    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
        """
        Args:
          model_filename:
            Path to the ONNX model. Its metadata must contain ``style_dim``,
            ``speaker_names`` and ``sample_rate``.
          tokens:
            Path to tokens.txt.
          lexicon:
            Path to a lexicon file, or several paths separated by commas.
          voices_bin:
            Path to voices.bin.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts
        self.model = ort.InferenceSession(
            model_filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )
        self.token2id = load_tokens(tokens)
        self.word2tokens = load_lexicon(lexicon)

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)
        dim = list(map(int, meta["style_dim"].split(",")))
        speaker_names = meta["speaker_names"].split(",")

        self.voices = load_voices(
            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
        )

        # The sample rate is taken from the model metadata only; it is not
        # overridden with a hard-coded value afterwards.
        self.sample_rate = int(meta["sample_rate"])

        print(list(self.voices.keys()))

        # The style tensor of a voice is indexed by the number of tokens, so
        # its first dimension bounds the number of tokens we can feed.
        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1

    def _word_to_phonemes(self, word: str) -> str:
        """Return the phonemes of an English word without punctuation.

        The lexicon is consulted first; espeak-ng is the fallback.
        """
        if word in self.word2tokens:
            return self.word2tokens[word]

        print(f"Use espeak-ng for word {word}")
        return "".join(phonemize_espeak(word, "en-us")[0])

    def _english_to_phonemes(self, segment: str) -> str:
        """Convert a run of ASCII text to phonemes, keeping punctuation."""
        out = ""
        for w in segment.split():
            # Leading punctuation: each mark is emitted followed by a space.
            while w and w[0] in self._PUNCTUATIONS:
                out += w[0] + " "
                w = w[1:]

            if not w:
                # The word consisted of punctuation only.
                continue

            # Strip all trailing punctuation before the lookup so that e.g.
            # "ok?!" is looked up as "ok". Unknown words go through espeak-ng
            # whether or not punctuation follows them.
            core = w.rstrip(self._PUNCTUATIONS)
            trailing = w[len(core) :]

            out += self._word_to_phonemes(core) + trailing + " "
        return out

    def _chinese_to_phonemes(self, segment: str) -> str:
        """Convert a run of Chinese characters to phonemes.

        Words produced by jieba are looked up as a whole; if a word is not in
        the lexicon, its characters are looked up one by one.
        """
        out = ""
        for w in jieba.cut(segment):
            if w in self.word2tokens:
                out += self.word2tokens[w]
                continue

            for ch in w:
                if ch in self.word2tokens:
                    out += self.word2tokens[ch]
                else:
                    print(f"skip {ch}")
        return out

    def __call__(self, text: str, voice: str):
        """Synthesize ``text`` with the given voice.

        Args:
          text:
            Input text. Only ASCII and characters in U+4E00..U+9FFF are
            processed; everything else is ignored.
          voice:
            Name of the speaker; must be a key of ``self.voices``.

        Returns:
          The first output of the model (the audio samples).
        """
        text = text.lower()

        tokens = ""

        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
            if ord(t[0]) <= 0x7F:
                tokens += self._english_to_phonemes(t)
            else:
                # Chinese
                tokens += self._chinese_to_phonemes(t)

        # Phonemes that are not in tokens.txt (espeak-ng may produce some)
        # are reported and skipped instead of raising a KeyError.
        token_ids = []
        for symbol in tokens:
            if symbol in self.token2id:
                token_ids.append(self.token2id[symbol])
            else:
                print(f"skip unknown token {symbol!r}")

        token_ids = token_ids[: self.max_len]

        # The style vector depends on the number of tokens (before padding).
        style = self.voices[voice][len(token_ids)]

        # The token sequence is wrapped in id 0 on both sides.
        token_ids = [0, *token_ids, 0]
        token_ids = np.array([token_ids], dtype=np.int64)

        speed = np.array([1.0], dtype=np.float32)

        audio = self.model.run(
            [
                self.model.get_outputs()[0].name,
            ],
            {
                self.model.get_inputs()[0].name: token_ids,
                self.model.get_inputs()[1].name: style,
                self.model.get_inputs()[2].name: speed,
            },
        )[0]
        return audio
 def main():
    """Synthesize a mixed Chinese/English sentence and report the speed.

    The audio is saved as a 16-bit PCM wave file in the current directory and
    the elapsed time, audio duration and real time factor are printed.
    """
    tts = OnnxModel(
        model_filename="./kokoro.onnx",
        tokens="./tokens.txt",
        lexicon="./lexicon-gb-en.txt,./lexicon-zh.txt",
        voices_bin="./voices.bin",
    )

    text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?".lower()
    voice = "bf_alice"

    # Time the call only; model loading above is excluded.
    t0 = time.time()
    audio = tts(text, voice=voice)
    t1 = time.time()

    elapsed_seconds = t1 - t0
    audio_duration = len(audio) / tts.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    filename = f"kokoro_v1.0_{voice}_zh_en.wav"
    sf.write(
        filename,
        audio,
        samplerate=tts.sample_rate,
        subtype="PCM_16",
    )

    for message in (
        f" Saved to {filename}",
        f" Elapsed seconds: {elapsed_seconds:.3f}",
        f" Audio duration in seconds: {audio_duration:.3f}",
        f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}",
    ):
        print(message)
 if __name__ == "__main__":
    main()
		`@@ -0,0 +1,3 @@`
							`# Introduction`

							`This directory is for kokoro v1.0`