Add Kokoro v1.1-zh (#1942)

This commit is contained in:
Fangjun Kuang
2025-02-28 15:47:59 +08:00
committed by GitHub
parent f5dfcf8d2f
commit dfcbc8d40b
20 changed files with 897 additions and 61 deletions

View File

@@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
on: on:
push: push:
branches: branches:
- export-kokoro - export-kokoro-2
workflow_dispatch: workflow_dispatch:
@@ -20,7 +20,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: [ubuntu-latest] os: [ubuntu-latest]
version: ["0.19", "1.0"] version: ["0.19", "1.0", "1.1-zh"]
python-version: ["3.10"] python-version: ["3.10"]
steps: steps:
@@ -34,7 +34,7 @@ jobs:
- name: Install Python dependencies - name: Install Python dependencies
shell: bash shell: bash
run: | run: |
pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
- name: Run - name: Run
shell: bash shell: bash
@@ -49,9 +49,15 @@ jobs:
elif [[ $v == "1.0" ]]; then elif [[ $v == "1.0" ]]; then
cd v1.0 cd v1.0
./run.sh ./run.sh
elif [[ $v == "1.1-zh" ]]; then
cd v1.1-zh
./run.sh
else
echo "Unknown version $v"
exit 1
fi fi
- name: Collect results ${{ matrix.version }} - name: Collect results 0.19
if: matrix.version == '0.19' if: matrix.version == '0.19'
shell: bash shell: bash
run: | run: |
@@ -71,7 +77,7 @@ jobs:
ls -lh $d.tar.bz2 ls -lh $d.tar.bz2
- name: Collect results ${{ matrix.version }} - name: Collect results 1.0
if: matrix.version == '1.0' if: matrix.version == '1.0'
shell: bash shell: bash
run: | run: |
@@ -87,7 +93,7 @@ jobs:
d=kokoro-multi-lang-v1_0 d=kokoro-multi-lang-v1_0
mkdir $d mkdir $d
cp -a LICENSE $d/LICENSE cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/ cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/ cp -v $src/voices.bin $d/
@@ -105,7 +111,63 @@ jobs:
ls -lh $d.tar.bz2 ls -lh $d.tar.bz2
- name: Publish to huggingface ${{ matrix.version }} - name: Collect results 1.1-zh
if: matrix.version == '1.1-zh'
shell: bash
run: |
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
tar xvf dict.tar.bz2
rm dict.tar.bz2
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
src=scripts/kokoro/v1.1-zh
d=kokoro-multi-lang-v1_1
mkdir $d
cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.onnx $d/model.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
cp -av dict $d/
cp -v ./*.fst $d/
ls -lh $d/
echo "---"
ls -lh $d/dict
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
d=kokoro-int8-multi-lang-v1_1
mkdir $d
cp -v LICENSE $d/LICENSE
cp -a espeak-ng-data $d/
cp -v $src/kokoro.int8.onnx $d/model.int8.onnx
cp -v $src/voices.bin $d/
cp -v $src/tokens.txt $d/
cp -v $src/lexicon*.txt $d/
cp -v $src/README.md $d/README.md
cp -av dict $d/
cp -v ./*.fst $d/
ls -lh $d/
echo "---"
ls -lh $d/dict
tar cjfv $d.tar.bz2 $d
rm -rf $d
ls -lh $d.tar.bz2
echo "---"
ls -lh *.tar.bz2
- name: Publish to huggingface 0.19
if: matrix.version == '0.19' if: matrix.version == '0.19'
env: env:
HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -154,7 +216,7 @@ jobs:
git commit -m "add models" git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
- name: Publish to huggingface ${{ matrix.version }} - name: Publish to huggingface 1.0
if: matrix.version == '1.0' if: matrix.version == '1.0'
env: env:
HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -205,6 +267,108 @@ jobs:
git commit -m "add models" git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true
- name: Publish to huggingface 1.1-zh
if: matrix.version == '1.1-zh'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.1-zh/kokoro.onnx ./model.onnx
cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
cp -v ../LICENSE ./
cp -av ../dict ./
cp -v ../*.fst ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 main || true
- name: Publish to huggingface 1.1-zh-int8
if: matrix.version == '1.1-zh'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 huggingface
cd huggingface
rm -rf ./*
git fetch
git pull
git lfs track "cmn_dict"
git lfs track "ru_dict"
git lfs track "*.wav"
git lfs track "lexicon*.txt"
cp -a ../espeak-ng-data ./
cp -v ../scripts/kokoro/v1.1-zh/kokoro.int8.onnx ./model.int8.onnx
cp -v ../scripts/kokoro/v1.1-zh/tokens.txt .
cp -v ../scripts/kokoro/v1.1-zh/voices.bin .
cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt .
cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md
cp -v ../LICENSE ./
cp -av ../dict ./
cp -v ../*.fst ./
git lfs track "*.onnx"
git add .
ls -lh
git status
git commit -m "add models"
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 main || true
- name: Release - name: Release
if: github.repository_owner == 'csukuangfj' if: github.repository_owner == 'csukuangfj'
uses: svenstaro/upload-release-action@v2 uses: svenstaro/upload-release-action@v2

View File

@@ -438,7 +438,17 @@ def get_kokoro_models() -> List[TtsModel]:
model_dir="kokoro-multi-lang-v1_0", model_dir="kokoro-multi-lang-v1_0",
model_name="model.onnx", model_name="model.onnx",
lang="en", lang="en",
) ),
TtsModel(
model_dir="kokoro-multi-lang-v1_1",
model_name="model.onnx",
lang="en",
),
TtsModel(
model_dir="kokoro-int8-multi-lang-v1_1",
model_name="model.int8.onnx",
lang="en",
),
] ]
for m in multi_lingual_models: for m in multi_lingual_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data" m.data_dir = f"{m.model_dir}/espeak-ng-data"

View File

@@ -1,3 +1,5 @@
voices.json voices.json
voices.bin voices.bin
README-new.md README-new.md
lexicon-*.txt
config.json

View File

@@ -2,11 +2,6 @@
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) # Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import argparse
import json
from pathlib import Path
import numpy as np
import onnx import onnx
import torch import torch

View File

@@ -4,19 +4,6 @@
import json import json
from typing import List, Tuple from typing import List, Tuple
from misaki import zh
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
user_dict = {
"还田": [["huan2"], ["tian2"]],
"行长": [["hang2"], ["zhang3"]],
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
load_phrases_dict(user_dict)
phrases_dict.phrases_dict.update(**user_dict)
def generate_english_lexicon(kind: str): def generate_english_lexicon(kind: str):
assert kind in ("us", "gb"), kind assert kind in ("us", "gb"), kind
@@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str):
return list(user_defined_lower.items()) + list(lexicon.items()) return list(user_defined_lower.items()) + list(lexicon.items())
def generate_chinese_lexicon():
word_dict = pinyin_dict.pinyin_dict
phrases = phrases_dict.phrases_dict
g2p = zh.ZHG2P()
lexicon = []
for key in word_dict:
if not (0x4E00 <= key <= 0x9FFF):
continue
w = chr(key)
tokens: str = g2p.word2ipa(w)
tokens = tokens.replace(chr(815), "")
lexicon.append((w, tokens))
for key in phrases:
tokens: str = g2p.word2ipa(key)
tokens = tokens.replace(chr(815), "")
lexicon.append((key, tokens))
return lexicon
def save(filename: str, lexicon: List[Tuple[str, str]]): def save(filename: str, lexicon: List[Tuple[str, str]]):
with open(filename, "w", encoding="utf-8") as f: with open(filename, "w", encoding="utf-8") as f:
for word, phones in lexicon: for word, phones in lexicon:
@@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]):
def main(): def main():
us = generate_english_lexicon("us") us = generate_english_lexicon("us")
gb = generate_english_lexicon("gb") gb = generate_english_lexicon("gb")
zh = generate_chinese_lexicon()
save("lexicon-us-en.txt", us) save("lexicon-us-en.txt", us)
save("lexicon-gb-en.txt", gb) save("lexicon-gb-en.txt", gb)
save("lexicon-zh.txt", zh)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
from typing import List, Tuple
from misaki import zh
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
# Hand-written pinyin overrides: each key is a phrase and each value holds
# one single-element list of tone-numbered pinyin per character.
user_dict = {
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
# Register the overrides through pypinyin's public API ...
load_phrases_dict(user_dict)
# ... and also insert them directly into the module-level phrase table,
# which generate_chinese_lexicon() iterates to build the lexicon.
phrases_dict.phrases_dict.update(**user_dict)
def generate_chinese_lexicon():
    """Build the Chinese lexicon as a list of ``(word, phonemes)`` pairs.

    Single characters come first (every code point of pypinyin's character
    table that lies in U+4E00..U+9FFF), followed by every entry of pypinyin's
    phrase table. Phonemes are produced by ``ZHG2P.word2ipa`` with the
    character ``chr(815)`` removed.
    """
    g2p = zh.ZHG2P()

    def to_ipa(text: str) -> str:
        # chr(815) is stripped from every generated phoneme string.
        return g2p.word2ipa(text).replace(chr(815), "")

    single_chars = [
        (chr(code), to_ipa(chr(code)))
        for code in pinyin_dict.pinyin_dict
        if 0x4E00 <= code <= 0x9FFF
    ]
    multi_char = [(phrase, to_ipa(phrase)) for phrase in phrases_dict.phrases_dict]

    return single_chars + multi_char
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write the lexicon to ``filename`` as UTF-8 text, one entry per line.

    Each line is the word, a space, then the characters of the phoneme
    string separated by single spaces.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            f"{word} {' '.join(phones)}\n" for word, phones in lexicon
        )
def main():
    """Generate the Chinese lexicon and write it to ``lexicon-zh.txt``."""
    # Named ``lexicon`` rather than ``zh`` so the imported ``zh`` module is
    # not shadowed inside this function.
    lexicon = generate_chinese_lexicon()
    save("lexicon-zh.txt", lexicon)


if __name__ == "__main__":
    main()

View File

@@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then
fi fi
if [ ! -f ./lexicon-zh.txt ]; then if [ ! -f ./lexicon-zh.txt ]; then
./generate_lexicon.py ./generate_lexicon_zh.py
fi
if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
./generate_lexicon_en.py
fi fi
if [ ! -f ./voices.bin ]; then if [ ! -f ./voices.bin ]; then

View File

@@ -10,8 +10,6 @@ import jieba
import numpy as np import numpy as np
import onnxruntime as ort import onnxruntime as ort
import soundfile as sf import soundfile as sf
import torch
from misaki import zh
try: try:
from piper_phonemize import phonemize_espeak from piper_phonemize import phonemize_espeak
@@ -114,7 +112,6 @@ class OnnxModel:
def __call__(self, text: str, voice: str): def __call__(self, text: str, voice: str):
punctuations = ';:,.!?-…()"“”' punctuations = ';:,.!?-…()"“”'
text = text.lower() text = text.lower()
g2p = zh.ZHG2P()
tokens = "" tokens = ""

View File

@@ -0,0 +1,5 @@
# Introduction
This directory contains the scripts for exporting Kokoro v1.1-zh to ONNX.
See also https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import onnx
import torch
from generate_voices_bin import speaker2id
def main():
    """Replace the metadata of ``./kokoro.onnx`` with the Kokoro v1.1-zh set.

    Loads the exported model, derives the speaker mappings from
    ``speaker2id`` and the style dimensions from one voice file, clears any
    existing metadata entries and writes the new ones back in place.
    """
    model = onnx.load("./kokoro.onnx")
    style = torch.load("./voices/zf_001.pt", weights_only=True, map_location="cpu")

    # Both mappings are serialized as comma-separated "a->b" pairs.
    pairs = list(speaker2id.items())
    speaker2id_str = ",".join(f"{name}->{idx}" for name, idx in pairs)
    id2speaker_str = ",".join(f"{idx}->{name}" for name, idx in pairs)

    meta_data = {
        "model_type": "kokoro",
        "language": "multi-lang, e.g., English, Chinese",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 2,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style.shape)),
        "n_speakers": len(speaker2id),
        "id2speaker": id2speaker_str,
        "speaker2id": speaker2id_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        "model_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh",
        "maintainer": "k2-fsa",
        "comment": "This is Kokoro v1.1-zh, a multilingual TTS model, supporting English, Chinese.",
    }

    print(model.metadata_props)

    # Drop whatever metadata the model already carries before adding ours.
    while len(model.metadata_props):
        model.metadata_props.pop()

    for name, content in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = name
        entry.value = str(content)

    print("--------------------")
    print(model.metadata_props)

    onnx.save(model, "./kokoro.onnx")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python3
import argparse
import onnxruntime
from onnxruntime.quantization import QuantType, quantize_dynamic
def show(filename):
    """Print every input and output node of the ONNX model at ``filename``."""
    opts = onnxruntime.SessionOptions()
    opts.log_severity_level = 3
    session = onnxruntime.InferenceSession(filename, opts)

    inputs, outputs = session.get_inputs(), session.get_outputs()

    for node in inputs:
        print(node)

    print("-----")

    for node in outputs:
        print(node)
"""
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
NodeArg(name='speed', type='tensor(float)', shape=[1])
-----
NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
"""
def main():
    """Show the float model's signature, then write its int8 variant."""
    source_model = "kokoro.onnx"
    quantized_model = "kokoro.int8.onnx"

    show("./kokoro.onnx")

    quantize_dynamic(
        model_input=source_model,
        model_output=quantized_model,
        # op_types_to_quantize=["MatMul"],
        weight_type=QuantType.QUInt8,
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
import json
import torch
from kokoro import KModel
from kokoro.model import KModelForONNX
@torch.no_grad()
def main():
    """Export the Kokoro v1.1-zh checkpoint to ``kokoro.onnx``.

    Builds the model from ``config.json`` and ``kokoro-v1_1-zh.pth`` on CPU
    in eval mode, creates random example inputs and exports the ONNX wrapper
    with a dynamic token-sequence axis and a dynamic audio-length axis.
    """
    with open("config.json") as f:
        config = json.load(f)

    kmodel = KModel(
        repo_id="not-used-any-value-is-ok",
        model="kokoro-v1_1-zh.pth",
        config=config,
        disable_complex=True,
    )
    kmodel = kmodel.to("cpu").eval()

    # Example inputs: 48 random token ids wrapped in a leading and trailing 0,
    # a random (1, 256) style tensor and a random speed.
    body = torch.randint(1, 100, (48,)).numpy()
    tokens = torch.LongTensor([[0, *body, 0]])
    style = torch.rand(1, 256, dtype=torch.float32)
    speed = torch.rand(1)

    print(tokens.shape, tokens.dtype)
    print(style.shape, style.dtype)
    print(speed, speed.dtype)

    exportable = KModelForONNX(kmodel)

    torch.onnx.export(
        exportable,
        (tokens, style, speed),
        "kokoro.onnx",
        input_names=["tokens", "style", "speed"],
        output_names=["audio"],
        dynamic_axes={
            "tokens": {1: "sequence_length"},
            "audio": {0: "audio_length"},
        },
        opset_version=14,  # minimum working version for this kokoro model is 14
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
../v1.0/generate_lexicon_en.py

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import re
from typing import List, Tuple
from misaki import zh
from misaki.token import MToken
from misaki.zh_frontend import ZH_MAP
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
# Hand-written pinyin overrides: each key is a phrase and each value holds
# one single-element list of tone-numbered pinyin per character.
user_dict = {
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}
# Register the overrides through pypinyin's public API ...
load_phrases_dict(user_dict)
# ... and also insert them directly into the module-level phrase table,
# which generate_chinese_lexicon() iterates to build the lexicon.
phrases_dict.phrases_dict.update(**user_dict)
def process_text(self, text, with_erhua=True):
    """
    Convert ``text`` to a phoneme string without running word segmentation.

    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut().

    Args:
      self: The frontend object to operate on; this module passes
        ``g2p.frontend``. It must provide ``tone_modifier``, ``punc``,
        ``unk``, ``_get_initials_finals`` and ``_merge_erhua``.
      text: The word or phrase to convert. It is treated as one segment
        with the part-of-speech tag "v".
      with_erhua: If true, apply ``self._merge_erhua`` to each segment.

    Returns:
      A tuple ``(result, tokens)``. ``result`` joins the phonemes and
      trailing whitespace of every token; ``tokens`` is the list of
      ``MToken`` objects that were built.
    """
    # The whole input is one pre-segmented word; no jieba segmentation here.
    seg_cut = [(text, "v")]
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    tokens = []
    # NOTE(review): pre_merge_for_modify is applied a second time here;
    # presumably inherited from the upstream function -- confirm it is
    # intended.
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    # NOTE(review): initials/finals are appended to below but never read in
    # this function.
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        # Re-tag: an "x" segment made only of U+4E00..U+9FFF characters
        # becomes "X"; a non-"x" segment that is a known punctuation mark
        # becomes "x".
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
        if pos in ("x", "eng"):
            # These segments are not sent through g2p: a non-space segment
            # becomes a token (with itself as phonemes when it is a known
            # punctuation mark); a whitespace segment is attached to the
            # previous token.
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                tokens.append(tk)
            elif tokens:
                tokens[-1].whitespace += word
            continue
        elif (
            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
        ):
            # Mark the boundary between two adjacent g2p-processed tokens.
            tokens[-1].whitespace = "/"
        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )
        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)
        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])
        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in self.punc:
            #     phones.append(c)
            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
                phones.append(v)
        # Join with "_", rewrite "_eR" to "_er", split a remaining "R" into
        # its own unit, and put a "_" before every digit so that each digit
        # is mapped as a separate unit.
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        # Map every unit through ZH_MAP; unmapped units become self.unk.
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)
    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )
    return result, tokens
def generate_chinese_lexicon():
    """Build the Chinese lexicon as a list of ``(word, phonemes)`` pairs.

    Single characters come first (every code point of pypinyin's character
    table that lies in U+4E00..U+9FFF), followed by every entry of pypinyin's
    phrase table. Phonemes are the first element returned by
    ``process_text`` when called with the frontend of a version "1.1" G2P.
    """
    frontend = zh.ZHG2P(version="1.1").frontend

    def to_phonemes(text: str) -> str:
        return process_text(frontend, text)[0]

    single_chars = [
        (chr(code), to_phonemes(chr(code)))
        for code in pinyin_dict.pinyin_dict
        if 0x4E00 <= code <= 0x9FFF
    ]
    multi_char = [
        (phrase, to_phonemes(phrase)) for phrase in phrases_dict.phrases_dict
    ]

    return single_chars + multi_char
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write one ``word p1 p2 ...`` line per lexicon entry, UTF-8 encoded.

    The phoneme string of each entry is split into characters that are
    written separated by single spaces.
    """
    with open(filename, "w", encoding="utf-8") as out:
        for word, phones in lexicon:
            spaced = " ".join(phones)
            print(word, spaced, file=out)
def main():
    """Generate the Chinese lexicon and write it to ``lexicon-zh.txt``."""
    # Named ``lexicon`` rather than ``zh`` so the imported ``zh`` module is
    # not shadowed inside this function.
    lexicon = generate_chinese_lexicon()
    save("lexicon-zh.txt", lexicon)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
../v1.0/generate_tokens.py

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import torch
from pathlib import Path
# Speakers that are always included, in this order.
speakers = ["af_maple", "af_sol", "bf_vale"]

# Numbered voices are included only when their file exists under voices/:
# zf_001..zf_099 first, then zm_009..zm_100.
for prefix, first, last in (("zf", 1, 99), ("zm", 9, 100)):
    for i in range(first, last + 1):
        name = f"{prefix}_{i:03d}"
        if Path(f"voices/{name}.pt").is_file():
            speakers.append(name)

# Speaker ids are the positions in ``speakers``.
id2speaker = dict(enumerate(speakers))
speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
def main():
    """Concatenate every speaker's style tensor into ``voices.bin``.

    Does nothing except print a message when ``./voices.bin`` already
    exists. Otherwise the tensors are written as raw bytes in speaker-id
    order.
    """
    if Path("./voices.bin").is_file():
        print("./voices.bin exists - skip")
        return

    with open("voices.bin", "wb") as out:
        for speaker in id2speaker.values():
            style = torch.load(
                f"voices/{speaker}.pt",
                weights_only=True,
                map_location="cpu",
            )
            # Original note: each tensor has shape (510, 1, 256)
            out.write(style.numpy().tobytes())


if __name__ == "__main__":
    main()

95
scripts/kokoro/v1.1-zh/run.sh Executable file
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
#
set -ex

# Download, export, quantize and test Kokoro v1.1-zh. Every step is skipped
# when its output file already exists, so the script can be re-run.

model_url=https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main
misaki_data_url=https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data

if [ ! -f kokoro-v1_1-zh.pth ]; then
  curl -SL -O "$model_url/kokoro-v1_1-zh.pth"
fi

if [ ! -f config.json ]; then
  # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
  curl -SL -O "$model_url/config.json"
fi

voices=(
  af_maple
  af_sol
  bf_vale
)

# zf_001-zf_099
for i in $(seq 1 99); do
  voices+=("$(printf "zf_%03d" "$i")")
done

# zm_009-zm_100
for i in $(seq 9 100); do
  voices+=("$(printf "zm_%03d" "$i")")
done

echo "${voices[@]}"  # all elements
echo "${#voices[@]}" # length

mkdir -p voices

for v in "${voices[@]}"; do
  if [ ! -f "voices/$v.pt" ]; then
    curl -SL --output "voices/$v.pt" "$model_url/voices/$v.pt"
  fi
done

pushd voices
# Remove every downloaded file smaller than 10k; presumably these are error
# responses for voice ids that do not exist upstream (generate_voices_bin.py
# only uses the voice files that are present).
find . -type f -size -10k -exec rm -v {} +
ls -lh
du -h -d1 .
popd

if [ ! -f ./kokoro.onnx ]; then
  python3 ./export_onnx.py
fi

# add_meta_data.py rewrites kokoro.onnx in place, so a marker file records
# that it has already run.
if [ ! -f ./.add-meta-data.done ]; then
  python3 ./add_meta_data.py
  touch ./.add-meta-data.done
fi

if [ ! -f ./kokoro.int8.onnx ]; then
  python3 ./dynamic_quantization.py
fi

# English pronunciation dictionaries from misaki.
for f in us_gold.json us_silver.json gb_gold.json gb_silver.json; do
  if [ ! -f "$f" ]; then
    curl -SL -O "$misaki_data_url/$f"
  fi
done

if [ ! -f ./tokens.txt ]; then
  ./generate_tokens.py
fi

if [ ! -f ./lexicon-zh.txt ]; then
  ./generate_lexicon_zh.py
fi

if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
  ./generate_lexicon_en.py
fi

if [ ! -f ./voices.bin ]; then
  ./generate_voices_bin.py
fi

./test.py
ls -lh

209
scripts/kokoro/v1.1-zh/test.py Executable file
View File

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import re
import time
from typing import Dict, List
import jieba
import numpy as np
import onnxruntime as ort
import soundfile as sf
try:
from piper_phonemize import phonemize_espeak
except Exception as ex:
raise RuntimeError(
f"{ex}\nPlease run\n"
"pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
)
def show(filename):
    """Print every input and output node of the ONNX model at ``filename``."""
    opts = ort.SessionOptions()
    opts.log_severity_level = 3
    session = ort.InferenceSession(filename, opts)

    inputs, outputs = session.get_inputs(), session.get_outputs()

    for node in inputs:
        print(node)

    print("-----")

    for node in outputs:
        print(node)
"""
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length'])
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
NodeArg(name='speed', type='tensor(float)', shape=[1])
-----
NodeArg(name='audio', type='tensor(float)', shape=['audio_length'])
"""
def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str):
    """Load per-speaker style embeddings from a raw binary file.

    Args:
      speaker_names: Speaker names, in the order their data is stored.
      dim: Shape of a single speaker's embedding.
      voices_bin: Path to the file holding the float32 data of all speakers.

    Returns:
      A dict mapping each speaker name to its embedding array of shape
      ``dim``.
    """
    raw = np.fromfile(voices_bin, dtype="uint8")
    embedding = raw.view(np.float32).reshape(len(speaker_names), *dim)
    print("embedding.shape", embedding.shape)

    return {name: embedding[idx] for idx, name in enumerate(speaker_names)}
def load_tokens(filename: str) -> Dict[str, int]:
    """Read a token table and return a token -> id mapping.

    A line with two whitespace-separated fields is ``token id``. A line with
    a single field is the id of the space token, whose token text is lost
    when the line is split on whitespace.
    """
    token2id = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) != 2:
                assert len(fields) == 1, (len(fields), line)
                token2id[" "] = int(fields[0])
                continue

            token2id[fields[0]] = int(fields[1])
    return token2id
def load_lexicon(filename: str) -> Dict[str, str]:
    """Load one or more lexicon files into a word -> phoneme-string mapping.

    Args:
      filename: A single path, or several paths separated by commas. Files
        are read in order, so an entry in a later file replaces the same
        word from an earlier file.

    Returns:
      A dict mapping each word to its phonemes joined into one string with
      all whitespace removed.

    Raises:
      ValueError: If a non-blank line has a word but no phonemes.
    """
    ans = dict()
    for lexicon in filename.split(","):
        print(lexicon)
        with open(lexicon, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line (e.g. at the end of the file) carries no
                    # entry; without this check the unpacking below raises.
                    continue
                w, tokens = line.split(" ", maxsplit=1)
                ans[w] = "".join(tokens.split())
    return ans
class OnnxModel:
    """Runs an exported Kokoro ONNX model on mixed Chinese/English text.

    Text is converted to phoneme tokens with the lexicon (espeak-ng is used
    for English words the lexicon does not contain), tokens are mapped to
    ids, and the model is run with the style vector of the chosen voice.
    """

    # Characters passed through to the model as tokens of their own.
    _PUNCTUATIONS = ';:,.!?-…()"“”'

    def __init__(self, model_filename: str, tokens: str, lexicon: str, voices_bin: str):
        """Create the inference session and load all auxiliary tables.

        Args:
          model_filename: Path to the ONNX model; its metadata must contain
            ``style_dim``, ``speaker_names`` and ``sample_rate``.
          tokens: Path to the token table read by ``load_tokens``.
          lexicon: Lexicon path(s) read by ``load_lexicon``.
          voices_bin: Path to the voices file read by ``load_voices``.
        """
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 3
        session_opts.intra_op_num_threads = 3

        self.session_opts = session_opts
        self.model = ort.InferenceSession(
            model_filename,
            sess_options=self.session_opts,
            providers=["CPUExecutionProvider"],
        )
        self.token2id = load_tokens(tokens)
        self.word2tokens = load_lexicon(lexicon)

        meta = self.model.get_modelmeta().custom_metadata_map
        print(meta)
        dim = list(map(int, meta["style_dim"].split(",")))
        speaker_names = meta["speaker_names"].split(",")

        self.voices = load_voices(
            speaker_names=speaker_names, dim=dim, voices_bin=voices_bin
        )

        # The sample rate comes from the model metadata only; it is no longer
        # overwritten with a hard-coded 24000 afterwards.
        self.sample_rate = int(meta["sample_rate"])

        print(list(self.voices.keys()))

        # The style table of a voice is indexed by the number of tokens, so
        # the longest usable sequence is one less than its first dimension.
        self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1

    def _english_word_to_tokens(self, word: str) -> str:
        """Return the phoneme string of ``word``: lexicon first, else espeak-ng."""
        if word in self.word2tokens:
            return self.word2tokens[word]

        print(f"Use espeak-ng for word {word}")
        return "".join(phonemize_espeak(word, "en-us")[0])

    def __call__(self, text: str, voice: str):
        """Synthesize ``text`` with the given ``voice``.

        Args:
          text: Input text; it is lower-cased before processing. Only runs
            of U+4E00..U+9FFF characters and runs of ASCII characters are
            processed; anything else is ignored.
          voice: A key of ``self.voices``.

        Returns:
          The first output of the model for the generated token sequence.

        Raises:
          KeyError: If ``voice`` is unknown or a generated phoneme is not in
            the token table.
        """
        punctuations = self._PUNCTUATIONS
        text = text.lower()

        tokens = ""

        for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text):
            if ord(t[0]) < 0x7F:
                for w in t.split():
                    # Leading punctuation: each mark is its own token,
                    # followed by a space.
                    while w and w[0] in punctuations:
                        tokens += w[0] + " "
                        w = w[1:]

                    if not w:
                        continue

                    # Split off ALL trailing punctuation, then look up the
                    # bare word. A word missing from the lexicon falls back
                    # to espeak-ng even when punctuation follows it, instead
                    # of being dropped.
                    stem = w.rstrip(punctuations)
                    tokens += self._english_word_to_tokens(stem)
                    tokens += w[len(stem) :]
                    tokens += " "
            else:
                # Chinese
                for w in jieba.cut(t):
                    if w in self.word2tokens:
                        tokens += self.word2tokens[w]
                    else:
                        # Unknown word: fall back to per-character lookup.
                        for i in w:
                            if i in self.word2tokens:
                                tokens += self.word2tokens[i]
                            else:
                                print(f"skip {i}")

        token_ids = [self.token2id[i] for i in tokens]
        token_ids = token_ids[: self.max_len]

        # The style vector is selected by the token count, taken before the
        # two padding zeros are added.
        style = self.voices[voice][len(token_ids)]

        token_ids = [0, *token_ids, 0]
        token_ids = np.array([token_ids], dtype=np.int64)

        speed = np.array([1.0], dtype=np.float32)

        audio = self.model.run(
            [
                self.model.get_outputs()[0].name,
            ],
            {
                self.model.get_inputs()[0].name: token_ids,
                self.model.get_inputs()[1].name: style,
                self.model.get_inputs()[2].name: speed,
            },
        )[0]
        return audio
def main():
    """Synthesize one mixed Chinese/English sentence and report the timing.

    The audio is written as a 16-bit PCM wave file, and the elapsed time,
    audio duration and real-time factor are printed.
    """
    model = OnnxModel(
        model_filename="./kokoro.onnx",
        tokens="./tokens.txt",
        lexicon="./lexicon-us-en.txt,./lexicon-zh.txt",
        voices_bin="./voices.bin",
    )

    text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?"
    text = text.lower()

    voice = "zf_001"

    started_at = time.time()
    audio = model(text, voice=voice)
    finished_at = time.time()

    elapsed_seconds = finished_at - started_at
    audio_duration = len(audio) / model.sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    filename = f"kokoro_v1.1_{voice}_zh_en.wav"
    sf.write(
        filename,
        audio,
        samplerate=model.sample_rate,
        subtype="PCM_16",
    )

    print(f" Saved to {filename}")
    print(f" Elapsed seconds: {elapsed_seconds:.3f}")
    print(f" Audio duration in seconds: {audio_duration:.3f}")
    print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


if __name__ == "__main__":
    main()

View File

@@ -128,15 +128,19 @@ class KokoroMultiLangLexicon::Impl {
} }
for (const auto &ids : ids_vec) { for (const auto &ids : ids_vec) {
if (ids.size() > 4) { if (ids.size() > 10 + 2) {
ans.emplace_back(ids); ans.emplace_back(ids);
} else { } else {
if (ans.empty()) { if (ans.empty()) {
ans.emplace_back(ids); ans.emplace_back(ids);
} else { } else {
ans.back().tokens.back() = ids[1]; if (ans.back().tokens.size() + ids.size() < 50) {
ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2, ans.back().tokens.back() = ids[1];
ids.end()); ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
ids.end());
} else {
ans.emplace_back(ids);
}
} }
} }
} }

View File

@@ -33,8 +33,8 @@ GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const {
if (scale == 1) { if (scale == 1) {
return *this; return *this;
} }
// if the interval is larger than 0.6 second, then we assume it is a pause // if the interval is larger than 0.2 second, then we assume it is a pause
int32_t threshold = static_cast<int32_t>(sample_rate * 0.6); int32_t threshold = static_cast<int32_t>(sample_rate * 0.2);
std::vector<SilenceInterval> intervals; std::vector<SilenceInterval> intervals;
int32_t num_samples = static_cast<int32_t>(samples.size()); int32_t num_samples = static_cast<int32_t>(samples.size());