Add Kokoro v1.1-zh (#1942)

2025-02-28 15:47:59 +08:00
parent f5dfcf8d2f
commit dfcbc8d40b
20 changed files with 897 additions and 61 deletions
--- a/scripts/kokoro/.gitignore
+++ b/scripts/kokoro/.gitignore
@@ -1,3 +1,5 @@
 voices.json
 voices.bin
 README-new.md
+lexicon-*.txt
+config.json
--- a/scripts/kokoro/v1.0/add_meta_data.py
+++ b/scripts/kokoro/v1.0/add_meta_data.py
@@ -2,11 +2,6 @@
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)


-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
 import onnx
 import torch

--- a/scripts/kokoro/v1.0/generate_lexicon_en.py
+++ b/scripts/kokoro/v1.0/generate_lexicon_en.py
@@ -4,19 +4,6 @@
 import json
 from typing import List, Tuple

-from misaki import zh
-from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
-
-user_dict = {
-    "还田": [["huan2"], ["tian2"]],
-    "行长": [["hang2"], ["zhang3"]],
-    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
-}
-
-load_phrases_dict(user_dict)
-
-phrases_dict.phrases_dict.update(**user_dict)
-

 def generate_english_lexicon(kind: str):
    assert kind in ("us", "gb"), kind
@@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str):
    return list(user_defined_lower.items()) + list(lexicon.items())


-def generate_chinese_lexicon():
-    word_dict = pinyin_dict.pinyin_dict
-    phrases = phrases_dict.phrases_dict
-
-    g2p = zh.ZHG2P()
-    lexicon = []
-
-    for key in word_dict:
-        if not (0x4E00 <= key <= 0x9FFF):
-            continue
-        w = chr(key)
-        tokens: str = g2p.word2ipa(w)
-        tokens = tokens.replace(chr(815), "")
-        lexicon.append((w, tokens))
-
-    for key in phrases:
-        tokens: str = g2p.word2ipa(key)
-        tokens = tokens.replace(chr(815), "")
-        lexicon.append((key, tokens))
-    return lexicon
-
-
 def save(filename: str, lexicon: List[Tuple[str, str]]):
    with open(filename, "w", encoding="utf-8") as f:
        for word, phones in lexicon:
@@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]):
 def main():
    us = generate_english_lexicon("us")
    gb = generate_english_lexicon("gb")
-    zh = generate_chinese_lexicon()

    save("lexicon-us-en.txt", us)
    save("lexicon-gb-en.txt", gb)
-    save("lexicon-zh.txt", zh)


 if __name__ == "__main__":
--- a/scripts/kokoro/v1.0/generate_lexicon_zh.py
+++ b/scripts/kokoro/v1.0/generate_lexicon_zh.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+from typing import List, Tuple
+
+from misaki import zh
+from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
+
+# Hand-written phrase -> pinyin entries. Each phrase maps to one inner list
+# per character, holding that character's pinyin with a trailing tone digit.
+user_dict = {
+    "还田": [["huan2"], ["tian2"]],
+    "行长": [["hang2"], ["zhang3"]],
+    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
+}
+
+# Hand the entries to pypinyin.
+load_phrases_dict(user_dict)
+
+# Also insert them into the phrase table that generate_chinese_lexicon()
+# iterates, so every phrase above gets its own lexicon line.
+phrases_dict.phrases_dict.update(**user_dict)
+
+
+def generate_chinese_lexicon():
+    """Build the Chinese lexicon as a list of ``(word, phonemes)`` pairs.
+
+    Single characters are taken from pypinyin's ``pinyin_dict`` (only code
+    points in U+4E00..U+9FFF are kept) and phrases from ``phrases_dict``.
+    Every entry is converted with ``ZHG2P.word2ipa`` and all occurrences of
+    the character ``chr(815)`` are removed from the result.
+
+    Returns:
+      A list of ``(word, phonemes)`` tuples; single characters come first,
+      followed by phrases.
+    """
+    g2p = zh.ZHG2P()
+
+    def to_phonemes(word: str) -> str:
+        return g2p.word2ipa(word).replace(chr(815), "")
+
+    characters = [
+        chr(code) for code in pinyin_dict.pinyin_dict if 0x4E00 <= code <= 0x9FFF
+    ]
+    words = characters + list(phrases_dict.phrases_dict)
+
+    return [(word, to_phonemes(word)) for word in words]
+
+
+def save(filename: str, lexicon: List[Tuple[str, str]]):
+    """Write ``lexicon`` to ``filename`` (UTF-8), one entry per line.
+
+    Each line holds the word followed by the characters of its phoneme
+    string, all separated by single spaces.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        f.writelines(f"{word} {' '.join(phones)}\n" for word, phones in lexicon)
+
+
+def main():
+    """Generate the Chinese lexicon and write it to ``lexicon-zh.txt``."""
+    lexicon = generate_chinese_lexicon()
+    save("lexicon-zh.txt", lexicon)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.0/run.sh
+++ b/scripts/kokoro/v1.0/run.sh
@@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then
 fi

 if [ ! -f ./lexicon-zh.txt ]; then
-  ./generate_lexicon.py
+  ./generate_lexicon_zh.py
+fi
+
+if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
+  ./generate_lexicon_en.py
 fi

 if [ ! -f ./voices.bin ]; then
--- a/scripts/kokoro/v1.0/test.py
+++ b/scripts/kokoro/v1.0/test.py
@@ -10,8 +10,6 @@ import jieba
 import numpy as np
 import onnxruntime as ort
 import soundfile as sf
-import torch
-from misaki import zh

 try:
    from piper_phonemize import phonemize_espeak
@@ -114,7 +112,6 @@ class OnnxModel:
    def __call__(self, text: str, voice: str):
        punctuations = ';:,.!?-…()"“”'
        text = text.lower()
-        g2p = zh.ZHG2P()

        tokens = ""

--- a/scripts/kokoro/v1.1-zh/README.md
+++ b/scripts/kokoro/v1.1-zh/README.md
@@ -0,0 +1,5 @@
+# Introduction
+
+This directory is for kokoro v1.1-zh.
+
+See also https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh
--- a/scripts/kokoro/v1.1-zh/add_meta_data.py
+++ b/scripts/kokoro/v1.1-zh/add_meta_data.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+
+import onnx
+import torch
+
+from generate_voices_bin import speaker2id
+
+
+def main():
+    """Replace the metadata stored in ``./kokoro.onnx``.
+
+    The model is loaded, its existing metadata entries are printed and
+    removed, the entries defined below are added (values converted with
+    ``str``), and the model is saved back to the same path.
+    """
+    model = onnx.load("./kokoro.onnx")
+    # Only the shape of this tensor is used, for the "style_dim" entry.
+    style = torch.load("./voices/zf_001.pt", weights_only=True, map_location="cpu")
+
+    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in speaker2id.items())
+    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in speaker2id.items())
+
+    meta_data = {
+        "model_type": "kokoro",
+        "language": "multi-lang, e.g., English, Chinese",
+        "has_espeak": 1,
+        "sample_rate": 24000,
+        "version": 2,
+        "voice": "en-us",
+        "style_dim": ",".join(map(str, style.shape)),
+        "n_speakers": len(speaker2id),
+        "id2speaker": id2speaker_str,
+        "speaker2id": speaker2id_str,
+        "speaker_names": ",".join(map(str, speaker2id.keys())),
+        "model_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh",
+        "maintainer": "k2-fsa",
+        "comment": "This is Kokoro v1.1-zh, a multilingual TTS model, supporting English, Chinese.",
+    }
+
+    props = model.metadata_props
+    print(props)
+
+    # Drop every entry the exported model already carries.
+    while len(props) > 0:
+        props.pop()
+
+    for key, value in meta_data.items():
+        entry = props.add()
+        entry.key = key
+        entry.value = str(value)
+    print("--------------------")
+
+    print(props)
+
+    onnx.save(model, "./kokoro.onnx")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.1-zh/dynamic_quantization.py
+++ b/scripts/kokoro/v1.1-zh/dynamic_quantization.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import argparse
+
+import onnxruntime
+from onnxruntime.quantization import QuantType, quantize_dynamic
+
+
+def show(filename):
+    """Print the input and output nodes of the ONNX model at ``filename``."""
+    opts = onnxruntime.SessionOptions()
+    opts.log_severity_level = 3
+    session = onnxruntime.InferenceSession(filename, opts)
+
+    for node in session.get_inputs():
+        print(node)
+
+    print("-----")
+
+    for node in session.get_outputs():
+        print(node)
+
+
+"""
+NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
+NodeArg(name='style', type='tensor(float)', shape=[1, 256])
+NodeArg(name='speed', type='tensor(float)', shape=[1])
+-----
+NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
+"""
+
+
+def main():
+    """Print the I/O of ``./kokoro.onnx`` and write ``kokoro.int8.onnx``.
+
+    The quantized model is produced by ``quantize_dynamic`` with
+    ``QuantType.QUInt8`` weights.
+    """
+    show("./kokoro.onnx")
+
+    # op_types_to_quantize is deliberately not passed, so the library's
+    # default selection of operator types applies.
+    options = dict(
+        model_input="kokoro.onnx",
+        model_output="kokoro.int8.onnx",
+        weight_type=QuantType.QUInt8,
+    )
+    quantize_dynamic(**options)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.1-zh/export_onnx.py
+++ b/scripts/kokoro/v1.1-zh/export_onnx.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import json
+
+import torch
+from kokoro import KModel
+from kokoro.model import KModelForONNX
+
+
+@torch.no_grad()
+def main():
+    """Export the Kokoro v1.1-zh checkpoint to ``kokoro.onnx``.
+
+    Reads ``config.json`` and ``kokoro-v1_1-zh.pth`` from the current
+    directory, builds the model on CPU in eval mode, wraps it in
+    ``KModelForONNX`` and exports it with three inputs (``tokens``,
+    ``style``, ``speed``) and one output (``audio``).
+    """
+    # JSON is UTF-8; do not depend on the platform's default encoding.
+    with open("config.json", encoding="utf-8") as f:
+        config = json.load(f)
+
+    model = (
+        KModel(
+            repo_id="not-used-any-value-is-ok",
+            model="kokoro-v1_1-zh.pth",
+            config=config,
+            disable_complex=True,
+        )
+        .to("cpu")
+        .eval()
+    )
+
+    # Dummy inputs used only for tracing: 48 random ids in [1, 100),
+    # padded with id 0 on both sides, giving a (1, 50) int64 tensor.
+    x = torch.randint(1, 100, (48,)).numpy()
+    x = torch.LongTensor([[0, *x, 0]])
+
+    style = torch.rand(1, 256, dtype=torch.float32)
+    speed = torch.rand(1)
+
+    print(x.shape, x.dtype)
+    print(style.shape, style.dtype)
+    print(speed, speed.dtype)
+
+    model2 = KModelForONNX(model)
+
+    torch.onnx.export(
+        model2,
+        (x, style, speed),
+        "kokoro.onnx",
+        input_names=["tokens", "style", "speed"],
+        output_names=["audio"],
+        # Only the token sequence length and the audio length vary.
+        dynamic_axes={
+            "tokens": {1: "sequence_length"},
+            "audio": {0: "audio_length"},
+        },
+        opset_version=14,  # minimum working version for this kokoro model is 14
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.1-zh/generate_lexicon_en.py
+++ b/scripts/kokoro/v1.1-zh/generate_lexicon_en.py
@@ -0,0 +1 @@
+../v1.0/generate_lexicon_en.py
--- a/scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
+++ b/scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+import re
+from typing import List, Tuple
+
+from misaki import zh
+from misaki.token import MToken
+from misaki.zh_frontend import ZH_MAP
+from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
+
+# Hand-written phrase -> pinyin entries. Each phrase maps to one inner list
+# per character, holding that character's pinyin with a trailing tone digit.
+user_dict = {
+    "还田": [["huan2"], ["tian2"]],
+    "行长": [["hang2"], ["zhang3"]],
+    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
+}
+
+# Hand the entries to pypinyin.
+load_phrases_dict(user_dict)
+
+# Also insert them into the phrase table that generate_chinese_lexicon()
+# iterates, so every phrase above gets its own lexicon line.
+phrases_dict.phrases_dict.update(**user_dict)
+
+
+def process_text(self, text, with_erhua=True):
+    """
+    Convert ``text`` to a phoneme string, treating it as one single word.
+
+    This function is modified from
+    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155
+
+    Note that we have removed jieba.posseg.lcut().
+
+    Args:
+      self: The frontend object whose helpers are used here; it is called
+        with ``g2p.frontend`` in generate_chinese_lexicon().
+      text: The word or phrase to convert. It is not segmented; the whole
+        string becomes one segment with the tag "v".
+      with_erhua: If True, ``self._merge_erhua`` is applied to the
+        initials/finals of each segment.
+
+    Returns:
+      A tuple ``(result, tokens)``. ``tokens`` is the list of ``MToken``
+      objects that were built and ``result`` is the concatenation of each
+      token's phonemes (``self.unk`` if it has none) and its whitespace.
+    """
+    # The whole input is a single segment instead of the output of a
+    # word segmenter.
+    seg_cut = [(text, "v")]
+    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
+    tokens = []
+    # NOTE(review): pre_merge_for_modify() is applied a second time here;
+    # this looks like an accidental duplicate of the call above - confirm.
+    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
+    # NOTE(review): initials and finals are only appended to below and are
+    # never read in this function.
+    initials = []
+    finals = []
+    # pypinyin, g2pM
+    for word, pos in seg_cut:
+        # Normalize the tag: "X" for an "x" segment consisting only of
+        # characters in U+4E00..U+9FFF, "x" for a known punctuation mark.
+        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
+            pos = "X"
+        elif pos != "x" and word in self.punc:
+            pos = "x"
+        tk = MToken(text=word, tag=pos, whitespace="")
+        if pos in ("x", "eng"):
+            # These segments are not converted: punctuation keeps itself as
+            # its phonemes, and whitespace is attached to the previous token.
+            if not word.isspace():
+                if pos == "x" and word in self.punc:
+                    tk.phonemes = word
+                tokens.append(tk)
+            elif tokens:
+                tokens[-1].whitespace += word
+            continue
+        elif (
+            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
+        ):
+            # Two adjacent converted words are separated by "/".
+            tokens[-1].whitespace = "/"
+
+        # g2p
+        sub_initials, sub_finals = self._get_initials_finals(word)
+        # tone sandhi
+        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
+        # er hua
+        if with_erhua:
+            sub_initials, sub_finals = self._merge_erhua(
+                sub_initials, sub_finals, word, pos
+            )
+
+        initials.append(sub_initials)
+        finals.append(sub_finals)
+        # assert len(sub_initials) == len(sub_finals) == len(word)
+
+        # sum(iterable[, start])
+        # initials = sum(initials, [])
+        # finals = sum(finals, [])
+
+        # Interleave initials and finals into one flat list of units.
+        phones = []
+        for c, v in zip(sub_initials, sub_finals):
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            if c:
+                phones.append(c)
+            # replace punctuation by ` `
+            # if c and c in self.punc:
+            #     phones.append(c)
+            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
+                phones.append(v)
+        # Join with "_", rewrite "_eR" to "_er" and put "_" before any other
+        # "R", then put "_" before every digit so that each digit becomes a
+        # unit of its own after the split.
+        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
+        phones = re.sub(r"(?=\d)", "_", phones).split("_")
+        # Map every unit through ZH_MAP; units missing from it become self.unk.
+        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
+        tokens.append(tk)
+
+    result = "".join(
+        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
+        for tk in tokens
+    )
+
+    return result, tokens
+
+
+def generate_chinese_lexicon():
+    """Build the Chinese lexicon as a list of ``(word, phonemes)`` pairs.
+
+    Single characters are taken from pypinyin's ``pinyin_dict`` (only code
+    points in U+4E00..U+9FFF are kept) and phrases from ``phrases_dict``.
+    Every entry is converted by ``process_text`` using the frontend of
+    ``ZHG2P(version="1.1")``; only the phoneme string it returns is kept.
+
+    Returns:
+      A list of ``(word, phonemes)`` tuples; single characters come first,
+      followed by phrases.
+    """
+    g2p = zh.ZHG2P(version="1.1")
+
+    def to_phonemes(word: str) -> str:
+        phonemes, _ = process_text(g2p.frontend, word)
+        return phonemes
+
+    characters = [
+        chr(code) for code in pinyin_dict.pinyin_dict if 0x4E00 <= code <= 0x9FFF
+    ]
+    words = characters + list(phrases_dict.phrases_dict)
+
+    return [(word, to_phonemes(word)) for word in words]
+
+
+def save(filename: str, lexicon: List[Tuple[str, str]]):
+    """Write ``lexicon`` to ``filename`` (UTF-8), one entry per line.
+
+    A line consists of the word, a space, and the characters of the
+    phoneme string separated by single spaces.
+    """
+    with open(filename, "w", encoding="utf-8") as out:
+        for word, phones in lexicon:
+            spaced = " ".join(phones)
+            out.write(word + " " + spaced + "\n")
+
+
+def main():
+    """Generate the Chinese lexicon and write it to ``lexicon-zh.txt``."""
+    lexicon = generate_chinese_lexicon()
+    save("lexicon-zh.txt", lexicon)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.1-zh/generate_tokens.py
+++ b/scripts/kokoro/v1.1-zh/generate_tokens.py
@@ -0,0 +1 @@
+../v1.0/generate_tokens.py
--- a/scripts/kokoro/v1.1-zh/generate_voices_bin.py
+++ b/scripts/kokoro/v1.1-zh/generate_voices_bin.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+import torch
+from pathlib import Path
+
+
+# Speakers that are always included.
+speakers = ["af_maple", "af_sol", "bf_vale"]
+
+# Numbered speakers are included only if their style file exists under
+# voices/. Each entry is (prefix, first index, last index), ends inclusive.
+for prefix, first, last in (("zf", 1, 99), ("zm", 9, 100)):
+    for i in range(first, last + 1):
+        name = f"{prefix}_{i:03d}"
+        if Path(f"voices/{name}.pt").is_file():
+            speakers.append(name)
+
+
+# A speaker's id is its position in ``speakers``.
+id2speaker = dict(enumerate(speakers))
+
+speaker2id = {speaker: idx for idx, speaker in enumerate(speakers)}
+
+
+def main():
+    """Concatenate the style tensors of all speakers into ``voices.bin``.
+
+    Does nothing except print a message if ``./voices.bin`` already exists.
+    Otherwise the raw bytes of each speaker's tensor are written in
+    speaker-id order.
+    """
+    if Path("./voices.bin").is_file():
+        print("./voices.bin exists - skip")
+        return
+
+    with open("voices.bin", "wb") as out:
+        for speaker in id2speaker.values():
+            style = torch.load(
+                f"voices/{speaker}.pt",
+                weights_only=True,
+                map_location="cpu",
+            )
+            # Per the original author's note, the shape is (510, 1, 256).
+            out.write(style.numpy().tobytes())
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.1-zh/run.sh
+++ b/scripts/kokoro/v1.1-zh/run.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# Download Kokoro v1.1-zh, export it to ONNX, add metadata, quantize it,
+# and generate tokens.txt, the lexicons and voices.bin. Every step is
+# skipped when the file it produces already exists.
+#
+# -e: stop at the first failing command; -x: echo each command.
+set -ex
+
+# Model checkpoint
+if [ ! -f kokoro-v1_1-zh.pth ]; then
+  curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/kokoro-v1_1-zh.pth
+fi
+
+
+# Model configuration, read by export_onnx.py
+if [ ! -f config.json ]; then
+  # see https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/blob/main/config.json
+  curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json
+fi
+
+# Candidate voice names; the same list is built in generate_voices_bin.py.
+voices=(
+af_maple
+af_sol
+bf_vale
+)
+# zf_001-zf_099
+for i in $(seq 1 99); do
+  a=$(printf "zf_%03d" $i)
+  voices+=($a)
+done
+
+# zm_009-zm_100
+for i in $(seq 9 100); do
+  a=$(printf "zm_%03d" $i)
+  voices+=($a)
+done
+
+echo ${voices[@]} # all elements
+echo ${#voices[@]} # length
+
+mkdir -p voices
+
+# Try to download every candidate voice that is not present yet.
+for v in ${voices[@]}; do
+  if [ ! -f voices/$v.pt ]; then
+    curl -SL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/voices/$v.pt
+  fi
+done
+pushd voices
+# Delete files smaller than about 10 KiB - presumably responses saved for
+# voice names that do not exist upstream (verify). generate_voices_bin.py
+# only uses the .pt files that remain.
+find . -type f -size -10k -exec rm -v {} +
+ls -lh
+du -h -d1 .
+popd
+
+if [ ! -f ./kokoro.onnx ]; then
+  python3 ./export_onnx.py
+fi
+
+# add_meta_data.py rewrites kokoro.onnx in place, so a stamp file is used
+# to record that it has already been run.
+if [ ! -f ./.add-meta-data.done ]; then
+  python3 ./add_meta_data.py
+  touch ./.add-meta-data.done
+fi
+
+if [ ! -f ./kokoro.int8.onnx ]; then
+  python3 ./dynamic_quantization.py
+fi
+
+# Dictionaries from misaki - presumably the input of generate_lexicon_en.py
+# (confirm against that script).
+if [ ! -f us_gold.json ]; then
+  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json
+fi
+
+if [ ! -f us_silver.json ]; then
+  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_silver.json
+fi
+
+if [ ! -f gb_gold.json ]; then
+  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_gold.json
+fi
+
+if [ ! -f gb_silver.json ]; then
+  curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_silver.json
+fi
+
+if [ ! -f ./tokens.txt ]; then
+  ./generate_tokens.py
+fi
+
+if [ ! -f ./lexicon-zh.txt ]; then
+  ./generate_lexicon_zh.py
+fi
+
+# Regenerate when either of the two English lexicons is missing.
+if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
+  ./generate_lexicon_en.py
+fi
+
+if [ ! -f ./voices.bin ]; then
+  ./generate_voices_bin.py
+fi
+
+# Synthesize a test sentence with the exported model.
+./test.py
+ls -lh
--- a/scripts/kokoro/v1.1-zh/test.py
+++ b/scripts/kokoro/v1.1-zh/test.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+
+import re
+import time
+from typing import Dict, List
+
+import jieba
+import numpy as np
+import onnxruntime as ort
+import soundfile as sf
+
+try:
+    from piper_phonemize import phonemize_espeak
+except Exception as ex:
+    raise RuntimeError(
+        f"{ex}\nPlease run\n"
+        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
+    )
+
+
+def show(filename):
+    """Print the input and output nodes of the ONNX model at ``filename``."""
+    opts = ort.SessionOptions()
+    opts.log_severity_level = 3
+    session = ort.InferenceSession(filename, opts)
+
+    for node in session.get_inputs():
+        print(node)
+
+    print("-----")
+
+    for node in session.get_outputs():
+        print(node)
+
+
+"""
+NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
+NodeArg(name='style', type='tensor(float)', shape=[1, 256])
+NodeArg(name='speed', type='tensor(float)', shape=[1])
+-----
+NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
+"""
+
+
+def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
+    """Load the style embeddings of all speakers from a raw binary file.
+
+    The bytes of ``voices_bin`` are interpreted as float32 values and
+    reshaped to ``(len(speaker_names), *dim)``.
+
+    Args:
+      speaker_names: Speaker names, in the order they are stored in the file.
+      dim: Shape of the embedding of a single speaker.
+      voices_bin: Path to the binary file.
+
+    Returns:
+      A dict mapping each speaker name to its embedding of shape ``dim``.
+    """
+    raw = np.fromfile(voices_bin, dtype="uint8")
+    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
+    print("embedding.shape", embedding.shape)
+
+    return dict(zip(speaker_names, embedding))
+
+
+def load_tokens(filename: str) -> Dict[str, int]:
+    """Read a ``token id`` table and return it as a dict.
+
+    A line with two fields maps the first field to the integer in the
+    second. A line with a single field is taken to be the id of the space
+    token, since the token itself is lost when the line is split on
+    whitespace. Any other number of fields fails an assertion.
+    """
+    token2id = {}
+    with open(filename, encoding="utf-8") as f:
+        for line in f:
+            fields = line.split()
+            if len(fields) != 2:
+                assert len(fields) == 1, (len(fields), line)
+                fields = [" ", fields[0]]
+            token, idx = fields
+            token2id[token] = int(idx)
+    return token2id
+
+
+def load_lexicon(filename: str) -> Dict[str, str]:
+    """Load one or more lexicon files into a single dict.
+
+    Args:
+      filename: A comma-separated list of lexicon paths. Each line of a
+        file is a word followed by its phonemes, separated by whitespace.
+        When a word occurs more than once, the last occurrence wins.
+
+    Returns:
+      A dict mapping each word to its phonemes concatenated into one
+      string without separators.
+    """
+    ans = dict()
+    for lexicon in filename.split(","):
+        print(lexicon)
+        with open(lexicon, encoding="utf-8") as f:
+            for line in f:
+                fields = line.split()
+                if len(fields) < 2:
+                    # A blank line or a word without phonemes carries no
+                    # pronunciation; skip it instead of failing to unpack.
+                    continue
+                ans[fields[0]] = "".join(fields[1:])
+    return ans
+
+
+class OnnxModel:
+    """A Kokoro ONNX model together with its tokens, lexicons and voices."""
+
+    # Characters split off the beginning/end of an English word and emitted
+    # as tokens of their own.
+    punctuations = ';:,.!?-…()"“”'
+
+    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
+        """
+        Args:
+          model_filename: Path to the ONNX model.
+          tokens: Path to tokens.txt.
+          lexicon: Comma-separated list of lexicon files.
+          voices_bin: Path to voices.bin.
+        """
+        session_opts = ort.SessionOptions()
+        session_opts.inter_op_num_threads = 3
+        session_opts.intra_op_num_threads = 3
+
+        self.session_opts = session_opts
+        self.model = ort.InferenceSession(
+            model_filename,
+            sess_options=self.session_opts,
+            providers=["CPUExecutionProvider"],
+        )
+        self.token2id = load_tokens(tokens)
+        self.word2tokens = load_lexicon(lexicon)
+
+        meta = self.model.get_modelmeta().custom_metadata_map
+        print(meta)
+        dim = list(map(int, meta["style_dim"].split(",")))
+        speaker_names = meta["speaker_names"].split(",")
+        self.voices = load_voices(
+            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
+        )
+        # Use the value stored in the model instead of a hard-coded one.
+        self.sample_rate = int(meta["sample_rate"])
+        print(list(self.voices.keys()))
+
+        # A voice has one style vector per token count, so the longest
+        # usable sequence is one less than the number of style vectors.
+        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1
+
+    def _english_tokens(self, chunk: str) -> str:
+        """Convert a run of ASCII text to phoneme tokens, word by word."""
+        parts = []
+        for w in chunk.split():
+            # Leading punctuation: each mark is followed by a space.
+            while w and w[0] in self.punctuations:
+                parts.append(w[0] + " ")
+                w = w[1:]
+            if not w:
+                continue
+
+            # Strip all trailing punctuation before the lookup so that a
+            # word missing from the lexicon still reaches espeak-ng.
+            core = w.rstrip(self.punctuations)
+            trailing = w[len(core) :]
+
+            if core in self.word2tokens:
+                parts.append(self.word2tokens[core])
+            else:
+                print(f"Use espeak-ng for word {core}")
+                parts.append("".join(phonemize_espeak(core, "en-us")[0]))
+
+            parts.append(trailing + " ")
+        return "".join(parts)
+
+    def _chinese_tokens(self, chunk: str) -> str:
+        """Convert a run of Chinese characters to phoneme tokens."""
+        parts = []
+        for w in jieba.cut(chunk):
+            if w in self.word2tokens:
+                parts.append(self.word2tokens[w])
+                continue
+            # Unknown word: fall back to its individual characters.
+            for i in w:
+                if i in self.word2tokens:
+                    parts.append(self.word2tokens[i])
+                else:
+                    print(f"skip {i}")
+        return "".join(parts)
+
+    def _text_to_tokens(self, text: str) -> str:
+        """Convert ``text`` to a string of phoneme tokens.
+
+        Only runs of ASCII characters and runs of characters in
+        U+4E00..U+9FFF are processed; everything else is ignored.
+        """
+        parts = []
+        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
+            if ord(t[0]) <= 0x7F:
+                parts.append(self._english_tokens(t))
+            else:
+                parts.append(self._chinese_tokens(t))
+        return "".join(parts)
+
+    def __call__(self, text: str, voice: str):
+        """Synthesize ``text`` with the given voice.
+
+        Args:
+          text: Input text; it is lower-cased before processing.
+          voice: Name of the speaker. It must be a key of ``self.voices``.
+
+        Returns:
+          The first output of the model (``audio``).
+        """
+        text = text.lower()
+
+        tokens = self._text_to_tokens(text)
+
+        token_ids = []
+        for t in tokens:
+            if t in self.token2id:
+                token_ids.append(self.token2id[t])
+            else:
+                # e.g. a phoneme produced by espeak-ng that is not in tokens.txt
+                print(f"skip unknown token {t!r}")
+        token_ids = token_ids[: self.max_len]
+
+        # The style vector is selected by the number of tokens, counted
+        # before the two padding ids are added.
+        style = self.voices[voice][len(token_ids)]
+
+        token_ids = [0, *token_ids, 0]
+        token_ids = np.array([token_ids], dtype=np.int64)
+
+        speed = np.array([1.0], dtype=np.float32)
+
+        audio = self.model.run(
+            [
+                self.model.get_outputs()[0].name,
+            ],
+            {
+                self.model.get_inputs()[0].name: token_ids,
+                self.model.get_inputs()[1].name: style,
+                self.model.get_inputs()[2].name: speed,
+            },
+        )[0]
+        return audio
+
+
+def main():
+    """Synthesize a mixed Chinese/English sentence and save it as a wave file.
+
+    Also prints the elapsed time, the audio duration and the real-time
+    factor (elapsed time divided by audio duration).
+    """
+    m = OnnxModel(
+        model_filename="./kokoro.onnx",
+        tokens="./tokens.txt",
+        lexicon="./lexicon-us-en.txt,./lexicon-zh.txt",
+        voices_bin="./voices.bin",
+    )
+    voice = "zf_001"
+    text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?".lower()
+
+    start = time.time()
+    audio = m(text, voice=voice)
+    elapsed_seconds = time.time() - start
+
+    audio_duration = len(audio) / m.sample_rate
+    real_time_factor = elapsed_seconds / audio_duration
+
+    filename = f"kokoro_v1.1_{voice}_zh_en.wav"
+    sf.write(
+        filename,
+        audio,
+        samplerate=m.sample_rate,
+        subtype="PCM_16",
+    )
+
+    for message in (
+        f" Saved to {filename}",
+        f" Elapsed seconds: {elapsed_seconds:.3f}",
+        f" Audio duration in seconds: {audio_duration:.3f}",
+        f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}",
+    ):
+        print(message)
+
+
+if __name__ == "__main__":
+    main()