diff --git a/.github/workflows/export-kokoro.yaml b/.github/workflows/export-kokoro.yaml index e6aae1da..69b77803 100644 --- a/.github/workflows/export-kokoro.yaml +++ b/.github/workflows/export-kokoro.yaml @@ -3,7 +3,7 @@ name: export-kokoro-to-onnx on: push: branches: - - export-kokoro + - export-kokoro-2 workflow_dispatch: @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - version: ["0.19", "1.0"] + version: ["0.19", "1.0", "1.1-zh"] python-version: ["3.10"] steps: @@ -34,7 +34,7 @@ jobs: - name: Install Python dependencies shell: bash run: | - pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch + pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch - name: Run shell: bash @@ -49,9 +49,15 @@ jobs: elif [[ $v == "1.0" ]]; then cd v1.0 ./run.sh + elif [[ $v == "1.1-zh" ]]; then + cd v1.1-zh + ./run.sh + else + echo "Unknown version $v" + exit 1 fi - - name: Collect results ${{ matrix.version }} + - name: Collect results 0.19 if: matrix.version == '0.19' shell: bash run: | @@ -71,7 +77,7 @@ jobs: ls -lh $d.tar.bz2 - - name: Collect results ${{ matrix.version }} + - name: Collect results 1.0 if: matrix.version == '1.0' shell: bash run: | @@ -87,7 +93,7 @@ jobs: d=kokoro-multi-lang-v1_0 mkdir $d - cp -a LICENSE $d/LICENSE + cp -v LICENSE $d/LICENSE cp -a espeak-ng-data $d/ cp -v $src/kokoro.onnx $d/model.onnx cp -v $src/voices.bin $d/ @@ -105,7 +111,63 @@ jobs: ls -lh $d.tar.bz2 - - name: Publish to huggingface ${{ matrix.version }} + - name: Collect results 1.1-zh + if: matrix.version == '1.1-zh' + shell: bash + run: | + curl -SL -O 
https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 + tar xvf dict.tar.bz2 + rm dict.tar.bz2 + + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst + + src=scripts/kokoro/v1.1-zh + + d=kokoro-multi-lang-v1_1 + mkdir $d + cp -v LICENSE $d/LICENSE + cp -a espeak-ng-data $d/ + cp -v $src/kokoro.onnx $d/model.onnx + cp -v $src/voices.bin $d/ + cp -v $src/tokens.txt $d/ + cp -v $src/lexicon*.txt $d/ + cp -v $src/README.md $d/README.md + cp -av dict $d/ + cp -v ./*.fst $d/ + ls -lh $d/ + echo "---" + ls -lh $d/dict + + tar cjfv $d.tar.bz2 $d + rm -rf $d + ls -lh $d.tar.bz2 + + d=kokoro-int8-multi-lang-v1_1 + mkdir $d + cp -v LICENSE $d/LICENSE + cp -a espeak-ng-data $d/ + cp -v $src/kokoro.int8.onnx $d/model.int8.onnx + cp -v $src/voices.bin $d/ + cp -v $src/tokens.txt $d/ + cp -v $src/lexicon*.txt $d/ + cp -v $src/README.md $d/README.md + cp -av dict $d/ + cp -v ./*.fst $d/ + ls -lh $d/ + echo "---" + ls -lh $d/dict + + tar cjfv $d.tar.bz2 $d + rm -rf $d + ls -lh $d.tar.bz2 + + echo "---" + ls -lh *.tar.bz2 + + + - name: Publish to huggingface 0.19 if: matrix.version == '0.19' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -154,7 +216,7 @@ jobs: git commit -m "add models" git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true - - name: Publish to huggingface ${{ matrix.version }} + - name: Publish to huggingface 1.0 if: matrix.version == '1.0' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -205,6 +267,108 @@ jobs: git commit -m "add models" git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true + - name: Publish to huggingface 
1.1-zh + if: matrix.version == '1.1-zh' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 huggingface + cd huggingface + rm -rf ./* + git fetch + git pull + + git lfs track "cmn_dict" + git lfs track "ru_dict" + git lfs track "*.wav" + git lfs track "lexicon*.txt" + + cp -a ../espeak-ng-data ./ + + cp -v ../scripts/kokoro/v1.1-zh/kokoro.onnx ./model.onnx + + + cp -v ../scripts/kokoro/v1.1-zh/tokens.txt . + cp -v ../scripts/kokoro/v1.1-zh/voices.bin . + cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt . + cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md + cp -v ../LICENSE ./ + cp -av ../dict ./ + cp -v ../*.fst ./ + + git lfs track "*.onnx" + git add . 
+ + ls -lh + + git status + + git commit -m "add models" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_1 main || true + + - name: Publish to huggingface 1.1-zh-int8 + if: matrix.version == '1.1-zh' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 huggingface + cd huggingface + rm -rf ./* + git fetch + git pull + + git lfs track "cmn_dict" + git lfs track "ru_dict" + git lfs track "*.wav" + git lfs track "lexicon*.txt" + + cp -a ../espeak-ng-data ./ + + cp -v ../scripts/kokoro/v1.1-zh/kokoro.int8.onnx ./model.int8.onnx + + + cp -v ../scripts/kokoro/v1.1-zh/tokens.txt . + cp -v ../scripts/kokoro/v1.1-zh/voices.bin . + cp -v ../scripts/kokoro/v1.1-zh/lexicon*.txt . + cp -v ../scripts/kokoro/v1.1-zh/README.md ./README.md + cp -v ../LICENSE ./ + cp -av ../dict ./ + cp -v ../*.fst ./ + + git lfs track "*.onnx" + git add . 
+ + ls -lh + + git status + + git commit -m "add models" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-int8-multi-lang-v1_1 main || true + - name: Release if: github.repository_owner == 'csukuangfj' uses: svenstaro/upload-release-action@v2 diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 696455d3..338c06a1 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -438,7 +438,17 @@ def get_kokoro_models() -> List[TtsModel]: model_dir="kokoro-multi-lang-v1_0", model_name="model.onnx", lang="en", - ) + ), + TtsModel( + model_dir="kokoro-multi-lang-v1_1", + model_name="model.onnx", + lang="en", + ), + TtsModel( + model_dir="kokoro-int8-multi-lang-v1_1", + model_name="model.int8.onnx", + lang="en", + ), ] for m in multi_lingual_models: m.data_dir = f"{m.model_dir}/espeak-ng-data" diff --git a/scripts/kokoro/.gitignore b/scripts/kokoro/.gitignore index 3802b517..a4343f2f 100644 --- a/scripts/kokoro/.gitignore +++ b/scripts/kokoro/.gitignore @@ -1,3 +1,5 @@ voices.json voices.bin README-new.md +lexicon-*.txt +config.json diff --git a/scripts/kokoro/v1.0/add_meta_data.py b/scripts/kokoro/v1.0/add_meta_data.py index eab4c4ff..102dce45 100755 --- a/scripts/kokoro/v1.0/add_meta_data.py +++ b/scripts/kokoro/v1.0/add_meta_data.py @@ -2,11 +2,6 @@ # Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) -import argparse -import json -from pathlib import Path - -import numpy as np import onnx import torch diff --git a/scripts/kokoro/v1.0/generate_lexicon.py b/scripts/kokoro/v1.0/generate_lexicon_en.py similarity index 65% rename from scripts/kokoro/v1.0/generate_lexicon.py rename to scripts/kokoro/v1.0/generate_lexicon_en.py index aa37911a..154dac3a 100755 --- a/scripts/kokoro/v1.0/generate_lexicon.py +++ b/scripts/kokoro/v1.0/generate_lexicon_en.py @@ -4,19 +4,6 @@ import json from typing import List, Tuple -from misaki import zh -from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict - -user_dict = { - "还田": [["huan2"], ["tian2"]], - "行长": [["hang2"], ["zhang3"]], - "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]], -} - -load_phrases_dict(user_dict) - -phrases_dict.phrases_dict.update(**user_dict) - def generate_english_lexicon(kind: str): assert kind in ("us", "gb"), kind @@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str): return list(user_defined_lower.items()) + list(lexicon.items()) -def generate_chinese_lexicon(): - word_dict = pinyin_dict.pinyin_dict - phrases = phrases_dict.phrases_dict - - g2p = zh.ZHG2P() - lexicon = [] - - for key in word_dict: - if not (0x4E00 <= key <= 0x9FFF): - continue - w = chr(key) - tokens: str = g2p.word2ipa(w) - tokens = tokens.replace(chr(815), "") - lexicon.append((w, tokens)) - - for key in phrases: - tokens: str = g2p.word2ipa(key) - tokens = tokens.replace(chr(815), "") - lexicon.append((key, tokens)) - return lexicon - - def save(filename: str, lexicon: List[Tuple[str, str]]): with open(filename, "w", encoding="utf-8") as f: for word, phones in lexicon: @@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]): def main(): us = generate_english_lexicon("us") gb = generate_english_lexicon("gb") - zh = generate_chinese_lexicon() save("lexicon-us-en.txt", us) save("lexicon-gb-en.txt", gb) - save("lexicon-zh.txt", zh) if __name__ == "__main__": diff --git 
a/scripts/kokoro/v1.0/generate_lexicon_zh.py b/scripts/kokoro/v1.0/generate_lexicon_zh.py new file mode 100755 index 00000000..bd85da71 --- /dev/null +++ b/scripts/kokoro/v1.0/generate_lexicon_zh.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +from typing import List, Tuple + +from misaki import zh +from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict + +user_dict = { + "还田": [["huan2"], ["tian2"]], + "行长": [["hang2"], ["zhang3"]], + "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]], +} + +load_phrases_dict(user_dict) + +phrases_dict.phrases_dict.update(**user_dict) + + +def generate_chinese_lexicon(): + word_dict = pinyin_dict.pinyin_dict + phrases = phrases_dict.phrases_dict + + g2p = zh.ZHG2P() + lexicon = [] + + for key in word_dict: + if not (0x4E00 <= key <= 0x9FFF): + continue + w = chr(key) + tokens: str = g2p.word2ipa(w) + tokens = tokens.replace(chr(815), "") + lexicon.append((w, tokens)) + + for key in phrases: + tokens: str = g2p.word2ipa(key) + tokens = tokens.replace(chr(815), "") + lexicon.append((key, tokens)) + return lexicon + + +def save(filename: str, lexicon: List[Tuple[str, str]]): + with open(filename, "w", encoding="utf-8") as f: + for word, phones in lexicon: + tokens = " ".join(list(phones)) + f.write(f"{word} {tokens}\n") + + +def main(): + zh = generate_chinese_lexicon() + + save("lexicon-zh.txt", zh) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.0/run.sh b/scripts/kokoro/v1.0/run.sh index d6b3584d..de8048c7 100755 --- a/scripts/kokoro/v1.0/run.sh +++ b/scripts/kokoro/v1.0/run.sh @@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then fi if [ ! -f ./lexicon-zh.txt ]; then - ./generate_lexicon.py + ./generate_lexicon_zh.py +fi + +if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then + ./generate_lexicon_en.py fi if [ ! 
-f ./voices.bin ]; then diff --git a/scripts/kokoro/v1.0/test.py b/scripts/kokoro/v1.0/test.py index 304a810e..1180193d 100755 --- a/scripts/kokoro/v1.0/test.py +++ b/scripts/kokoro/v1.0/test.py @@ -10,8 +10,6 @@ import jieba import numpy as np import onnxruntime as ort import soundfile as sf -import torch -from misaki import zh try: from piper_phonemize import phonemize_espeak @@ -114,7 +112,6 @@ class OnnxModel: def __call__(self, text: str, voice: str): punctuations = ';:,.!?-…()"“”' text = text.lower() - g2p = zh.ZHG2P() tokens = "" diff --git a/scripts/kokoro/v1.1-zh/README.md b/scripts/kokoro/v1.1-zh/README.md new file mode 100644 index 00000000..9a76d7b7 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/README.md @@ -0,0 +1,5 @@ +# Introduction + +This directory is for kokoro v1.1-zh. + +See also https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh diff --git a/scripts/kokoro/v1.1-zh/add_meta_data.py b/scripts/kokoro/v1.1-zh/add_meta_data.py new file mode 100755 index 00000000..6982d5bd --- /dev/null +++ b/scripts/kokoro/v1.1-zh/add_meta_data.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) + + +import onnx +import torch + +from generate_voices_bin import speaker2id + + +def main(): + model = onnx.load("./kokoro.onnx") + style = torch.load("./voices/zf_001.pt", weights_only=True, map_location="cpu") + + id2speaker_str = "" + speaker2id_str = "" + sep = "" + for s, i in speaker2id.items(): + speaker2id_str += f"{sep}{s}->{i}" + id2speaker_str += f"{sep}{i}->{s}" + sep = "," + + meta_data = { + "model_type": "kokoro", + "language": "multi-lang, e.g., English, Chinese", + "has_espeak": 1, + "sample_rate": 24000, + "version": 2, + "voice": "en-us", + "style_dim": ",".join(map(str, style.shape)), + "n_speakers": len(speaker2id), + "id2speaker": id2speaker_str, + "speaker2id": speaker2id_str, + "speaker_names": ",".join(map(str, speaker2id.keys())), + "model_url": "https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh", + "maintainer": "k2-fsa", + "comment": "This is Kokoro v1.1-zh, a multilingual TTS model, supporting English, Chinese.", + } + + print(model.metadata_props) + + while len(model.metadata_props): + model.metadata_props.pop() + + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + print("--------------------") + + print(model.metadata_props) + + onnx.save(model, "./kokoro.onnx") + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.1-zh/dynamic_quantization.py b/scripts/kokoro/v1.1-zh/dynamic_quantization.py new file mode 100755 index 00000000..1e4c1e53 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/dynamic_quantization.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +import argparse + +import onnxruntime +from onnxruntime.quantization import QuantType, quantize_dynamic + + +def show(filename): + session_opts = onnxruntime.SessionOptions() + session_opts.log_severity_level = 3 + sess = onnxruntime.InferenceSession(filename, session_opts) + for i in sess.get_inputs(): + print(i) + + print("-----") + + for i in sess.get_outputs(): + print(i) + + +""" 
+NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length']) +NodeArg(name='style', type='tensor(float)', shape=[1, 256]) +NodeArg(name='speed', type='tensor(float)', shape=[1]) +----- +NodeArg(name='audio', type='tensor(float)', shape=['audio_length']) +""" + + +def main(): + show("./kokoro.onnx") + + quantize_dynamic( + model_input="kokoro.onnx", + model_output="kokoro.int8.onnx", + # op_types_to_quantize=["MatMul"], + weight_type=QuantType.QUInt8, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.1-zh/export_onnx.py b/scripts/kokoro/v1.1-zh/export_onnx.py new file mode 100644 index 00000000..1c019686 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/export_onnx.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +import json + +import torch +from kokoro import KModel +from kokoro.model import KModelForONNX + + +@torch.no_grad() +def main(): + with open("config.json") as f: + config = json.load(f) + + model = ( + KModel( + repo_id="not-used-any-value-is-ok", + model="kokoro-v1_1-zh.pth", + config=config, + disable_complex=True, + ) + .to("cpu") + .eval() + ) + + x = torch.randint(1, 100, (48,)).numpy() + x = torch.LongTensor([[0, *x, 0]]) + + style = torch.rand(1, 256, dtype=torch.float32) + speed = torch.rand(1) + + print(x.shape, x.dtype) + print(style.shape, style.dtype) + print(speed, speed.dtype) + + model2 = KModelForONNX(model) + + torch.onnx.export( + model2, + (x, style, speed), + "kokoro.onnx", + input_names=["tokens", "style", "speed"], + output_names=["audio"], + dynamic_axes={ + "tokens": {1: "sequence_length"}, + "audio": {0: "audio_length"}, + }, + opset_version=14, # minimum working version for this kokoro model is 14 + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.1-zh/generate_lexicon_en.py b/scripts/kokoro/v1.1-zh/generate_lexicon_en.py new file mode 120000 index 00000000..e5909fdb --- /dev/null +++ b/scripts/kokoro/v1.1-zh/generate_lexicon_en.py @@ -0,0 +1 @@ 
+../v1.0/generate_lexicon_en.py \ No newline at end of file diff --git a/scripts/kokoro/v1.1-zh/generate_lexicon_zh.py b/scripts/kokoro/v1.1-zh/generate_lexicon_zh.py new file mode 100755 index 00000000..c2f222c3 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/generate_lexicon_zh.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +import re +from typing import List, Tuple + +from misaki import zh +from misaki.token import MToken +from misaki.zh_frontend import ZH_MAP +from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict + +user_dict = { + "还田": [["huan2"], ["tian2"]], + "行长": [["hang2"], ["zhang3"]], + "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]], +} + +load_phrases_dict(user_dict) + +phrases_dict.phrases_dict.update(**user_dict) + + +def process_text(self, text, with_erhua=True): + """ + This function is modified from + https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155 + + Note that we have removed jieba.posseg.lcut(). 
+ + Args: + self: The `frontend` attribute of a misaki zh.ZHG2P instance. + text: A single word or phrase; it is processed as one segment. + with_erhua: If True, self._merge_erhua() is applied to the result. + + Returns: + A tuple (phoneme string, list of MToken). + """ + # The whole text is one segment; the pre-merge step runs only once. + seg_cut = [(text, "v")] + seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) + tokens = [] + initials = [] + finals = [] + # pypinyin, g2pM + for word, pos in seg_cut: + if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF": + pos = "X" + elif pos != "x" and word in self.punc: + pos = "x" + tk = MToken(text=word, tag=pos, whitespace="") + if pos in ("x", "eng"): + if not word.isspace(): + if pos == "x" and word in self.punc: + tk.phonemes = word + tokens.append(tk) + elif tokens: + tokens[-1].whitespace += word + continue + elif ( + tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace + ): + tokens[-1].whitespace = "/" + + # g2p + sub_initials, sub_finals = self._get_initials_finals(word) + # tone sandhi + sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua + if with_erhua: + sub_initials, sub_finals = self._merge_erhua( + sub_initials, sub_finals, word, pos + ) + + initials.append(sub_initials) + finals.append(sub_finals) + # assert len(sub_initials) == len(sub_finals) == len(word) + + # sum(iterable[, start]) + # initials = sum(initials, []) + # finals = sum(finals, []) + + phones = [] + for c, v in zip(sub_initials, sub_finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c: + phones.append(c) + # replace punctuation by ` ` + # if c and c in self.punc: + # phones.append(c) + if v and (v not in self.punc or v != c): # and v not in self.rhy_phns: + phones.append(v) + phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R") + phones = re.sub(r"(?=\d)", "_", phones).split("_") + tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones) + tokens.append(tk) + + result = "".join( + (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace + for tk in tokens + ) + + return result, tokens + + +def generate_chinese_lexicon(): + word_dict = pinyin_dict.pinyin_dict + phrases 
= phrases_dict.phrases_dict + + g2p = zh.ZHG2P(version="1.1") + + lexicon = [] + for key in word_dict: + if not (0x4E00 <= key <= 0x9FFF): + continue + w = chr(key) + tokens: str = process_text(g2p.frontend, w)[0] + lexicon.append((w, tokens)) + + for key in phrases: + tokens: str = process_text(g2p.frontend, key)[0] + lexicon.append((key, tokens)) + return lexicon + + +def save(filename: str, lexicon: List[Tuple[str, str]]): + with open(filename, "w", encoding="utf-8") as f: + for word, phones in lexicon: + tokens = " ".join(list(phones)) + f.write(f"{word} {tokens}\n") + + +def main(): + zh = generate_chinese_lexicon() + + save("lexicon-zh.txt", zh) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.1-zh/generate_tokens.py b/scripts/kokoro/v1.1-zh/generate_tokens.py new file mode 120000 index 00000000..cc461cd4 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/generate_tokens.py @@ -0,0 +1 @@ +../v1.0/generate_tokens.py \ No newline at end of file diff --git a/scripts/kokoro/v1.1-zh/generate_voices_bin.py b/scripts/kokoro/v1.1-zh/generate_voices_bin.py new file mode 100755 index 00000000..d347aeed --- /dev/null +++ b/scripts/kokoro/v1.1-zh/generate_voices_bin.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) +import torch +from pathlib import Path + + +speakers = [ + "af_maple", + "af_sol", + "bf_vale", +] +for i in range(1, 99 + 1): + name = "zf_{:03d}".format(i) + if Path(f"voices/{name}.pt").is_file(): + speakers.append(name) + +for i in range(9, 100 + 1): + name = "zm_{:03d}".format(i) + if Path(f"voices/{name}.pt").is_file(): + speakers.append(name) + + +id2speaker = {index: value for index, value in enumerate(speakers)} + +speaker2id = {speaker: idx for idx, speaker in id2speaker.items()} + + +def main(): + if Path("./voices.bin").is_file(): + print("./voices.bin exists - skip") + return + + with open("voices.bin", "wb") as f: + for _, speaker in id2speaker.items(): + m = torch.load( + f"voices/{speaker}.pt", + weights_only=True, + map_location="cpu", + ).numpy() + # m.shape (510, 1, 256) + + f.write(m.tobytes()) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.1-zh/run.sh b/scripts/kokoro/v1.1-zh/run.sh new file mode 100755 index 00000000..09ea2112 --- /dev/null +++ b/scripts/kokoro/v1.1-zh/run.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) +# +set -ex + +if [ ! -f kokoro-v1_1-zh.pth ]; then + curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/kokoro-v1_1-zh.pth +fi + + +if [ ! -f config.json ]; then + # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json + curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json +fi + +voices=( +af_maple +af_sol +bf_vale +) +# zf_001-zf_099 +for i in $(seq 1 99); do + a=$(printf "zf_%03d" $i) + voices+=($a) +done + +# zm_009-zm_100 +for i in $(seq 9 100); do + a=$(printf "zm_%03d" $i) + voices+=($a) +done + +echo ${voices[@]} # all elements +echo ${#voices[@]} # length + +mkdir -p voices + +for v in ${voices[@]}; do + if [ ! 
-f voices/$v.pt ]; then + curl -SL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/voices/$v.pt + fi +done +pushd voices +find . -type f -size -10k -exec rm -v {} + +ls -lh +du -h -d1 . +popd + +if [ ! -f ./kokoro.onnx ]; then + python3 ./export_onnx.py +fi + +if [ ! -f ./.add-meta-data.done ]; then + python3 ./add_meta_data.py + touch ./.add-meta-data.done +fi + +if [ ! -f ./kokoro.int8.onnx ]; then + python3 ./dynamic_quantization.py +fi + +if [ ! -f us_gold.json ]; then + curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json +fi + +if [ ! -f us_silver.json ]; then + curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_silver.json +fi + +if [ ! -f gb_gold.json ]; then + curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_gold.json +fi + +if [ ! -f gb_silver.json ]; then + curl -SL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_silver.json +fi + +if [ ! -f ./tokens.txt ]; then + ./generate_tokens.py +fi + +if [ ! -f ./lexicon-zh.txt ]; then + ./generate_lexicon_zh.py +fi + +if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then + ./generate_lexicon_en.py +fi + +if [ ! -f ./voices.bin ]; then + ./generate_voices_bin.py +fi + +./test.py +ls -lh diff --git a/scripts/kokoro/v1.1-zh/test.py b/scripts/kokoro/v1.1-zh/test.py new file mode 100755 index 00000000..56be9edd --- /dev/null +++ b/scripts/kokoro/v1.1-zh/test.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) + + +import re +import time +from typing import Dict, List + +import jieba +import numpy as np +import onnxruntime as ort +import soundfile as sf + +try: + from piper_phonemize import phonemize_espeak +except Exception as ex: + raise RuntimeError( + f"{ex}\nPlease run\n" + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html" + ) + + +def show(filename): + session_opts = ort.SessionOptions() + session_opts.log_severity_level = 3 + sess = ort.InferenceSession(filename, session_opts) + for i in sess.get_inputs(): + print(i) + + print("-----") + + for i in sess.get_outputs(): + print(i) + + +""" +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length']) +NodeArg(name='style', type='tensor(float)', shape=[1, 256]) +NodeArg(name='speed', type='tensor(float)', shape=[1]) +----- +NodeArg(name='audio', type='tensor(float)', shape=['audio_length']) +""" + + +def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str): + embedding = ( + np.fromfile(voices_bin, dtype="uint8") + .view(np.float32) + .reshape(len(speaker_names), *dim) + ) + print("embedding.shape", embedding.shape) + ans = dict() + for i in range(len(speaker_names)): + ans[speaker_names[i]] = embedding[i] + + return ans + + +def load_tokens(filename: str) -> Dict[str, int]: + ans = dict() + with open(filename, encoding="utf-8") as f: + for line in f: + fields = line.strip().split() + if len(fields) == 2: + token, idx = fields + ans[token] = int(idx) + else: + assert len(fields) == 1, (len(fields), line) + ans[" "] = int(fields[0]) + return ans + + +def load_lexicon(filename: str) -> Dict[str, str]: + ans = dict() + for lexicon in filename.split(","): + print(lexicon) + with open(lexicon, encoding="utf-8") as f: + for line in f: + w, tokens = line.strip().split(" ", maxsplit=1) + ans[w] = "".join(tokens.split()) + return ans + + +class OnnxModel: + def __init__(self, model_filename: str, tokens: str, lexicon: str, 
voices_bin: str): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 3 + session_opts.intra_op_num_threads = 3 + + self.session_opts = session_opts + self.model = ort.InferenceSession( + model_filename, + sess_options=self.session_opts, + providers=["CPUExecutionProvider"], + ) + self.token2id = load_tokens(tokens) + self.word2tokens = load_lexicon(lexicon) + + meta = self.model.get_modelmeta().custom_metadata_map + print(meta) + dim = list(map(int, meta["style_dim"].split(","))) + speaker_names = meta["speaker_names"].split(",") + self.voices = load_voices( + speaker_names=speaker_names, dim=dim, voices_bin=voices_bin + ) + self.sample_rate = int(meta["sample_rate"]) + print(list(self.voices.keys())) + + # Style vectors are indexed by token count, so cap the input length. + self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1 + + def __call__(self, text: str, voice: str): + punctuations = ';:,.!?-…()"“”' + text = text.lower() + + tokens = "" + + for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text): + if ord(t[0]) < 0x7F: + for w in t.split(): + while w: + if w[0] in punctuations: + tokens += w[0] + " " + w = w[1:] + continue + + # Split off trailing punctuation so that the word itself + # is always converted (lexicon first, then espeak-ng) + # instead of being silently dropped. + core = w.rstrip(punctuations) + tail = w[len(core) :] + if core in self.word2tokens: + tokens += self.word2tokens[core] + else: + print(f"Use espeak-ng for word {core}") + tokens += "".join(phonemize_espeak(core, "en-us")[0]) + + tokens += tail + " " + break + else: + # Chinese + for w in jieba.cut(t): + if w in self.word2tokens: + tokens += self.word2tokens[w] + else: + for i in w: + if i in self.word2tokens: + tokens += self.word2tokens[i] + else: + print(f"skip {i}") + + token_ids = [self.token2id[i] for i in tokens] + token_ids = token_ids[: self.max_len] + + style = self.voices[voice][len(token_ids)] + + token_ids = [0, *token_ids, 0] + token_ids = np.array([token_ids], dtype=np.int64) + + speed = np.array([1.0], dtype=np.float32) + + audio = self.model.run( + [ + 
self.model.get_outputs()[0].name, + ], + { + self.model.get_inputs()[0].name: token_ids, + self.model.get_inputs()[1].name: style, + self.model.get_inputs()[2].name: speed, + }, + )[0] + return audio + + +def main(): + m = OnnxModel( + model_filename="./kokoro.onnx", + tokens="./tokens.txt", + lexicon="./lexicon-us-en.txt,./lexicon-zh.txt", + voices_bin="./voices.bin", + ) + text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?" + + text = text.lower() + + voice = "zf_001" + start = time.time() + audio = m(text, voice=voice) + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(audio) / m.sample_rate + real_time_factor = elapsed_seconds / audio_duration + + filename = f"kokoro_v1.1_{voice}_zh_en.wav" + sf.write( + filename, + audio, + samplerate=m.sample_rate, + subtype="PCM_16", + ) + print(f" Saved to {filename}") + print(f" Elapsed seconds: {elapsed_seconds:.3f}") + print(f" Audio duration in seconds: {audio_duration:.3f}") + print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +if __name__ == "__main__": + main() diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc index a26df51a..382cfc58 100644 --- a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc +++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc @@ -128,15 +128,19 @@ class KokoroMultiLangLexicon::Impl { } for (const auto &ids : ids_vec) { - if (ids.size() > 4) { + if (ids.size() > 10 + 2) { ans.emplace_back(ids); } else { if (ans.empty()) { ans.emplace_back(ids); } else { - ans.back().tokens.back() = ids[1]; - ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2, - ids.end()); + if (ans.back().tokens.size() + ids.size() < 50) { + ans.back().tokens.back() = ids[1]; + ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2, + ids.end()); + } else { + ans.emplace_back(ids); + } } } } diff --git a/sherpa-onnx/csrc/offline-tts.cc 
b/sherpa-onnx/csrc/offline-tts.cc index d3fa14b5..a5c3a262 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -33,8 +33,8 @@ GeneratedAudio GeneratedAudio::ScaleSilence(float scale) const { if (scale == 1) { return *this; } - // if the interval is larger than 0.6 second, then we assume it is a pause - int32_t threshold = static_cast(sample_rate * 0.6); + // if the interval is larger than 0.2 second, then we assume it is a pause + int32_t threshold = static_cast(sample_rate * 0.2); std::vector intervals; int32_t num_samples = static_cast(samples.size());