diff --git a/.github/workflows/export-kokoro.yaml b/.github/workflows/export-kokoro.yaml index 6429c7b0..adbc8bb1 100644 --- a/.github/workflows/export-kokoro.yaml +++ b/.github/workflows/export-kokoro.yaml @@ -4,6 +4,7 @@ on: push: branches: - export-kokoro + - kokoro-1.0-2 workflow_dispatch: @@ -14,12 +15,13 @@ concurrency: jobs: export-kokoro-to-onnx: if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' - name: export kokoro + name: export kokoro ${{ matrix.version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [ubuntu-latest] + version: ["0.19", "1.0"] python-version: ["3.10"] steps: @@ -33,7 +35,7 @@ jobs: - name: Install Python dependencies shell: bash run: | - pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html + pip install "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch - name: Run shell: bash @@ -42,9 +44,16 @@ jobs: tar xf espeak-ng-data.tar.bz2 rm espeak-ng-data.tar.bz2 cd scripts/kokoro - ./run.sh + v=${{ matrix.version }} + if [[ $v = "0.19" ]]; then + ./run.sh + elif [[ $v == "1.0" ]]; then + cd v1.0 + ./run.sh + fi - - name: Collect results + - name: Collect results ${{ matrix.version }} + if: matrix.version == '0.19' shell: bash run: | src=scripts/kokoro @@ -53,7 +62,7 @@ jobs: mkdir $d cp -a LICENSE $d/LICENSE cp -a espeak-ng-data $d/ - cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx + cp -v $src/kokoro-v0_19.onnx $d/model.onnx cp -v $src/voices.bin $d/ cp -v $src/tokens.txt $d/ cp -v $src/README-new.md $d/README.md @@ -61,9 +70,31 @@ jobs: tar cjfv $d.tar.bz2 $d rm -rf $d - ls -h $.tar.bz2 + ls -lh $d.tar.bz2 - - name: Publish to huggingface + - name: Collect results ${{ matrix.version }} + if: matrix.version == '1.0' + shell: 
bash + run: | + src=scripts/kokoro/v1.0 + + d=kokoro-multi-lang-v1_0 + mkdir $d + cp -a LICENSE $d/LICENSE + cp -a espeak-ng-data $d/ + cp -v $src/kokoro.onnx $d/model.onnx + cp -v $src/voices.bin $d/ + cp -v $src/tokens.txt $d/ + cp -v $src/lexicon*.txt $d/ + cp -v $src/README.md $d/README.md + ls -lh $d/ + tar cjfv $d.tar.bz2 $d + rm -rf $d + + ls -lh $d.tar.bz2 + + - name: Publish to huggingface ${{ matrix.version }} + if: matrix.version == '0.19' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 @@ -92,9 +123,9 @@ jobs: cp -a ../espeak-ng-data ./ mkdir -p test_wavs - cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx + cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx - cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/ + cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/ cp -v ../scripts/kokoro/tokens.txt . cp -v ../scripts/kokoro/voices.bin . @@ -111,6 +142,55 @@ jobs: git commit -m "add models" git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true + - name: Publish to huggingface ${{ matrix.version }} + if: matrix.version == '1.0' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 huggingface + cd huggingface + rm -rf ./* + git fetch + git pull + + git lfs track "cmn_dict" + git lfs track "ru_dict" + git lfs track "*.wav" + git lfs track "lexicon*.txt" + + cp -a ../espeak-ng-data ./ + + cp -v ../scripts/kokoro/v1.0/kokoro.onnx ./model.onnx + + + cp -v ../scripts/kokoro/v1.0/tokens.txt . + cp -v ../scripts/kokoro/v1.0/voices.bin . + cp -v ../scripts/kokoro/v1.0/lexicon*.txt . 
+ cp -v ../scripts/kokoro/v1.0/README.md ./README.md + cp -v ../LICENSE ./ + + git lfs track "*.onnx" + git add . + + ls -lh + + git status + + git commit -m "add models" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-multi-lang-v1_0 main || true + - name: Release uses: svenstaro/upload-release-action@v2 with: diff --git a/.gitignore b/.gitignore index da3399c5..2840d8ff 100644 --- a/.gitignore +++ b/.gitignore @@ -128,3 +128,7 @@ harmony-os/SherpaOnnxHar/sherpa_onnx/CHANGELOG.md matcha-icefall-zh-baker matcha-icefall-en_US-ljspeech kokoro-en-v0_19 +*.pt +lexicon.txt +us_gold.json +us_silver.json diff --git a/scripts/kokoro/add-meta-data.py b/scripts/kokoro/add_meta_data.py similarity index 91% rename from scripts/kokoro/add-meta-data.py rename to scripts/kokoro/add_meta_data.py index cf42f13f..5dfa74d8 100755 --- a/scripts/kokoro/add-meta-data.py +++ b/scripts/kokoro/add_meta_data.py @@ -69,6 +69,14 @@ def main(): for k in keys: f.write(voices[k].tobytes()) + speaker2id_str = "" + id2speaker_str = "" + sep = "" + for i, s in enumerate(keys): + speaker2id_str += f"{sep}{s}->{i}" + id2speaker_str += f"{sep}{i}->{s}" + sep = "," + meta_data = { "model_type": "kokoro", "language": "English", @@ -78,6 +86,8 @@ def main(): "voice": "en-us", "style_dim": ",".join(map(str, voices[keys[0]].shape)), "n_speakers": len(keys), + "speaker2id": speaker2id_str, + "id2speaker": id2speaker_str, "speaker_names": ",".join(keys), "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files", "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS", diff --git a/scripts/kokoro/run.sh b/scripts/kokoro/run.sh index 422472ca..e1fae471 100755 --- a/scripts/kokoro/run.sh +++ b/scripts/kokoro/run.sh @@ -16,8 +16,8 @@ https://huggingface.co/hexgrad/Kokoro-82M/discussions/14 EOF files=( -kokoro-v0_19_hf.onnx -# kokoro-v0_19.onnx +# kokoro-v0_19_hf.onnx +kokoro-v0_19.onnx # kokoro-quant.onnx # kokoro-quant-convinteger.onnx voices.json @@ 
-30,14 +30,14 @@ for f in ${files[@]}; do done models=( -# kokoro-v0_19 +kokoro-v0_19 # kokoro-quant # kokoro-quant-convinteger -kokoro-v0_19_hf +# kokoro-v0_19_hf ) for m in ${models[@]}; do - ./add-meta-data.py --model $m.onnx --voices ./voices.json + ./add_meta_data.py --model $m.onnx --voices ./voices.json done ls -l diff --git a/scripts/kokoro/v1.0/.gitignore b/scripts/kokoro/v1.0/.gitignore new file mode 100644 index 00000000..ddbf883b --- /dev/null +++ b/scripts/kokoro/v1.0/.gitignore @@ -0,0 +1,5 @@ +config.json +*.json +*.txt +.add-meta-data.done +voices diff --git a/scripts/kokoro/v1.0/README.md b/scripts/kokoro/v1.0/README.md new file mode 100644 index 00000000..e7e3465c --- /dev/null +++ b/scripts/kokoro/v1.0/README.md @@ -0,0 +1,3 @@ +# Introduction + +This directory is for kokoro v1.0 diff --git a/scripts/kokoro/v1.0/__init__.py b/scripts/kokoro/v1.0/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/kokoro/v1.0/add_meta_data.py b/scripts/kokoro/v1.0/add_meta_data.py new file mode 100755 index 00000000..114dfe7b --- /dev/null +++ b/scripts/kokoro/v1.0/add_meta_data.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) + + +import argparse +import json +from pathlib import Path + +import numpy as np +import onnx +import torch + +from generate_voices_bin import speaker2id + + +def main(): + model = onnx.load("./kokoro.onnx") + style = torch.load("./voices/af_alloy.pt", weights_only=True, map_location="cpu") + + id2speaker_str = "" + speaker2id_str = "" + sep = "" + for s, i in speaker2id.items(): + speaker2id_str += f"{sep}{s}->{i}" + id2speaker_str += f"{sep}{i}->{s}" + sep = "," + + meta_data = { + "model_type": "kokoro", + "language": "English", + "has_espeak": 1, + "sample_rate": 24000, + "version": 2, + "voice": "en-us", + "style_dim": ",".join(map(str, style.shape)), + "n_speakers": len(speaker2id), + "id2speaker": id2speaker_str, + "speaker2id": speaker2id_str, + "speaker_names": ",".join(map(str, speaker2id.keys())), + "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files", + "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS", + "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M", + "maintainer": "k2-fsa", + "comment": "This is Kokoro v1.0, a multilingual TTS model, supporting English, Chinese, French, Japanese etc.", + } + + print(model.metadata_props) + + while len(model.metadata_props): + model.metadata_props.pop() + + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = str(value) + print("--------------------") + + print(model.metadata_props) + + onnx.save(model, "./kokoro.onnx") + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.0/generate_lexicon.py b/scripts/kokoro/v1.0/generate_lexicon.py new file mode 100755 index 00000000..a7ad46f5 --- /dev/null +++ b/scripts/kokoro/v1.0/generate_lexicon.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) + +import json +from pypinyin import phrases_dict, pinyin_dict +from misaki import zh +from typing import List, Tuple + + +def generate_english_lexicon(kind: str): + assert kind in ("us", "gb"), kind + # If you want to add new words, please add them to + # the user_defined dict. + user_defined = { + "Kokoro": "kˈOkəɹO", + "Misaki": "misˈɑki", + } + + user_defined_lower = dict() + for k, v in user_defined.items(): + user_defined_lower[k.lower()] = v + + with open(f"./{kind}_gold.json", encoding="utf-8") as f: + gold = json.load(f) + + with open(f"./{kind}_silver.json", encoding="utf-8") as f: + silver = json.load(f) + + # words in us_gold has a higher priority than those in s_silver, so + # we put us_gold after us_silver below + english = {**silver, **gold} + + lexicon = dict() + for k, v in english.items(): + k_lower = k.lower() + + if k_lower in user_defined_lower: + print(f"{k} already exist in the user defined dict. Skip adding") + continue + + if isinstance(v, str): + lexicon[k_lower] = v + else: + assert isinstance(v, dict), (k, v) + assert "DEFAULT" in v, (k, v) + lexicon[k_lower] = v["DEFAULT"] + + return list(user_defined_lower.items()) + list(lexicon.items()) + + +def generate_chinese_lexicon(): + word_dict = pinyin_dict.pinyin_dict + phrases = phrases_dict.phrases_dict + + g2p = zh.ZHG2P() + lexicon = [] + + for key in word_dict: + if not (0x4E00 <= key <= 0x9FFF): + continue + w = chr(key) + tokens: str = g2p(w) + lexicon.append((w, tokens)) + + for key in phrases: + tokens: str = g2p(key) + lexicon.append((key, tokens)) + return lexicon + + +def save(filename: str, lexicon: List[Tuple[str, str]]): + with open(filename, "w", encoding="utf-8") as f: + for word, phones in lexicon: + tokens = " ".join(list(phones)) + f.write(f"{word} {tokens}\n") + + +def main(): + us = generate_english_lexicon("us") + gb = generate_english_lexicon("gb") + zh = generate_chinese_lexicon() + + save("lexicon-us-en.txt", us) + save("lexicon-gb-en.txt", 
gb) + save("lexicon-zh.txt", zh) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.0/generate_tokens.py b/scripts/kokoro/v1.0/generate_tokens.py new file mode 100755 index 00000000..5c93ef5d --- /dev/null +++ b/scripts/kokoro/v1.0/generate_tokens.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + + +import json + + +def main(): + with open("config.json") as f: + config = json.load(f) + vocab = config["vocab"] + + with open("tokens.txt", "w", encoding="utf-8") as f: + for k, i in vocab.items(): + f.write(f"{k} {i}\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.0/generate_voices_bin.py b/scripts/kokoro/v1.0/generate_voices_bin.py new file mode 100755 index 00000000..84d1d20d --- /dev/null +++ b/scripts/kokoro/v1.0/generate_voices_bin.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) +import torch +from pathlib import Path + + +id2speaker = { + 0: "af_alloy", + 1: "af_aoede", + 2: "af_bella", + 3: "af_heart", + 4: "af_jessica", + 5: "af_kore", + 6: "af_nicole", + 7: "af_nova", + 8: "af_river", + 9: "af_sarah", + 10: "af_sky", + 11: "am_adam", + 12: "am_echo", + 13: "am_eric", + 14: "am_fenrir", + 15: "am_liam", + 16: "am_michael", + 17: "am_onyx", + 18: "am_puck", + 19: "am_santa", + 20: "bf_alice", + 21: "bf_emma", + 22: "bf_isabella", + 23: "bf_lily", + 24: "bm_daniel", + 25: "bm_fable", + 26: "bm_george", + 27: "bm_lewis", + 28: "ef_dora", + 29: "em_alex", + 30: "ff_siwis", + 31: "hf_alpha", + 32: "hf_beta", + 33: "hm_omega", + 34: "hm_psi", + 35: "if_sara", + 36: "im_nicola", + 37: "jf_alpha", + 38: "jf_gongitsune", + 39: "jf_nezumi", + 40: "jf_tebukuro", + 41: "jm_kumo", + 42: "pf_dora", + 43: "pm_alex", + 44: "pm_santa", + 45: "zf_xiaobei", + 46: "zf_xiaoni", + 47: "zf_xiaoxiao", + 48: "zf_xiaoyi", + 49: "zm_yunjian", + 50: "zm_yunxi", + 51: "zm_yunxia", + 52: "zm_yunyang", +} + +speaker2id = {speaker: idx for 
idx, speaker in id2speaker.items()} + + +def main(): + if Path("./voices.bin").is_file(): + print("./voices.bin exists - skip") + return + + with open("voices.bin", "wb") as f: + for _, speaker in id2speaker.items(): + m = torch.load( + f"voices/{speaker}.pt",  # run.sh downloads the .pt files into ./voices/ + weights_only=True, + map_location="cpu", + ).numpy() + # m.shape (510, 1, 256) + + f.write(m.tobytes()) + + +if __name__ == "__main__": + main() diff --git a/scripts/kokoro/v1.0/run.sh b/scripts/kokoro/v1.0/run.sh new file mode 100755 index 00000000..47aefdbc --- /dev/null +++ b/scripts/kokoro/v1.0/run.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang) + +set -ex + +if [ ! -f kokoro.onnx ]; then + # see https://github.com/taylorchu/kokoro-onnx/releases (-f: fail on HTTP errors instead of saving the error page) + curl -fSL -O https://github.com/taylorchu/kokoro-onnx/releases/download/v0.2.0/kokoro.onnx +fi + +if [ ! -f config.json ]; then + # see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json + curl -fSL -O https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/config.json +fi + +# see https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py#L83 +# and +# https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices +# +# af -> American female +# am -> American male +# bf -> British female +# bm -> British male +voices=( +af_alloy +af_aoede +af_bella +af_heart +af_jessica +af_kore +af_nicole +af_nova +af_river +af_sarah +af_sky +am_adam +am_echo +am_eric +am_fenrir +am_liam +am_michael +am_onyx +am_puck +am_santa +bf_alice +bf_emma +bf_isabella +bf_lily +bm_daniel +bm_fable +bm_george +bm_lewis +ef_dora +em_alex +ff_siwis +hf_alpha +hf_beta +hm_omega +hm_psi +if_sara +im_nicola +jf_alpha +jf_gongitsune +jf_nezumi +jf_tebukuro +jm_kumo +pf_dora +pm_alex +pm_santa +zf_xiaobei # Northeastern Mandarin dialect +zf_xiaoni +zf_xiaoxiao +zf_xiaoyi +zm_yunjian +zm_yunxi +zm_yunxia +zm_yunyang +) + +mkdir -p voices + +for v in ${voices[@]}; do + if [ ! 
-f voices/$v.pt ]; then + curl -fSL --output voices/$v.pt https://huggingface.co/hexgrad/Kokoro-82M/resolve/main/voices/$v.pt + fi +done + +if [ ! -f ./.add-meta-data.done ]; then + python3 ./add_meta_data.py + touch ./.add-meta-data.done +fi + +if [ ! -f us_gold.json ]; then + curl -fSL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_gold.json +fi + +if [ ! -f us_silver.json ]; then + curl -fSL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/us_silver.json +fi + +if [ ! -f gb_gold.json ]; then + curl -fSL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_gold.json +fi + +if [ ! -f gb_silver.json ]; then + curl -fSL -O https://raw.githubusercontent.com/hexgrad/misaki/refs/heads/main/misaki/data/gb_silver.json +fi + +if [ ! -f ./tokens.txt ]; then + ./generate_tokens.py +fi + +if [ ! -f ./lexicon-zh.txt ]; then  # last file written by generate_lexicon.py + ./generate_lexicon.py +fi + +if [ ! -f ./voices.bin ]; then + ./generate_voices_bin.py +fi + +./test.py +ls -lh diff --git a/scripts/kokoro/v1.0/test.py b/scripts/kokoro/v1.0/test.py new file mode 100755 index 00000000..304a810e --- /dev/null +++ b/scripts/kokoro/v1.0/test.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. 
(authors: Fangjun Kuang) + + +import re +import time +from typing import Dict, List + +import jieba +import numpy as np +import onnxruntime as ort +import soundfile as sf +import torch +from misaki import zh + +try: + from piper_phonemize import phonemize_espeak +except Exception as ex: + raise RuntimeError( + f"{ex}\nPlease run\n" + "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html" + ) + + +def show(filename): + session_opts = ort.SessionOptions() + session_opts.log_severity_level = 3 + sess = ort.InferenceSession(filename, session_opts) + for i in sess.get_inputs(): + print(i) + + print("-----") + + for i in sess.get_outputs(): + print(i) + + +""" +NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'sequence_length']) +NodeArg(name='style', type='tensor(float)', shape=[1, 256]) +NodeArg(name='speed', type='tensor(float)', shape=[1]) +----- +NodeArg(name='audio', type='tensor(float)', shape=['audio_length']) +""" + + +def load_voices(speaker_names: List[str], dim: List[int], voices_bin: str): + embedding = ( + np.fromfile(voices_bin, dtype="uint8") + .view(np.float32) + .reshape(len(speaker_names), *dim) + ) + print("embedding.shape", embedding.shape) + ans = dict() + for i in range(len(speaker_names)): + ans[speaker_names[i]] = embedding[i] + + return ans + + +def load_tokens(filename: str) -> Dict[str, int]: + ans = dict() + with open(filename, encoding="utf-8") as f: + for line in f: + fields = line.strip().split() + if len(fields) == 2: + token, idx = fields + ans[token] = int(idx) + else: + assert len(fields) == 1, (len(fields), line) + ans[" "] = int(fields[0]) + return ans + + +def load_lexicon(filename: str) -> Dict[str, List[str]]: + ans = dict() + for lexicon in filename.split(","): + print(lexicon) + with open(lexicon, encoding="utf-8") as f: + for line in f: + w, tokens = line.strip().split(" ", maxsplit=1) + ans[w] = "".join(tokens.split()) + return ans + + +class OnnxModel: + def __init__(self, model_filename: 
str, tokens: str, lexicon: str, voices_bin: str): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 1 + session_opts.intra_op_num_threads = 1 + + self.session_opts = session_opts + self.model = ort.InferenceSession( + model_filename, + sess_options=self.session_opts, + providers=["CPUExecutionProvider"], + ) + self.token2id = load_tokens(tokens) + self.word2tokens = load_lexicon(lexicon) + + meta = self.model.get_modelmeta().custom_metadata_map + print(meta) + dim = list(map(int, meta["style_dim"].split(","))) + speaker_names = meta["speaker_names"].split(",") + self.voices = load_voices( + speaker_names=speaker_names, dim=dim, voices_bin=voices_bin + ) + self.sample_rate = int(meta["sample_rate"]) + print(list(self.voices.keys())) + + self.sample_rate = 24000 + self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1 + + def __call__(self, text: str, voice: str): + punctuations = ';:,.!?-…()"“”' + text = text.lower() + g2p = zh.ZHG2P() + + tokens = "" + + for t in re.findall("[\u4E00-\u9FFF]+|[\u0000-\u007f]+", text): + if ord(t[0]) < 0x7F: + for w in t.split(): + while w: + if w[0] in punctuations: + tokens += w[0] + " " + w = w[1:] + continue + + if w[-1] in punctuations: + if w[:-1] in self.word2tokens: + tokens += self.word2tokens[w[:-1]] + tokens += w[-1] + else: + if w in self.word2tokens: + tokens += self.word2tokens[w] + else: + print(f"Use espeak-ng for word {w}") + tokens += "".join(phonemize_espeak(w, "en-us")[0]) + + tokens += " " + break + else: + # Chinese + for w in jieba.cut(t): + if w in self.word2tokens: + tokens += self.word2tokens[w] + else: + for i in w: + if i in self.word2tokens: + tokens += self.word2tokens[i] + else: + print(f"skip {i}") + + token_ids = [self.token2id[i] for i in tokens] + token_ids = token_ids[: self.max_len] + + style = self.voices[voice][len(token_ids)] + + token_ids = [0, *token_ids, 0] + token_ids = np.array([token_ids], dtype=np.int64) + + speed = np.array([1.0], dtype=np.float32) + 
+ audio = self.model.run( + [ + self.model.get_outputs()[0].name, + ], + { + self.model.get_inputs()[0].name: token_ids, + self.model.get_inputs()[1].name: style, + self.model.get_inputs()[2].name: speed, + }, + )[0] + return audio + + +def main(): + m = OnnxModel( + model_filename="./kokoro.onnx", + tokens="./tokens.txt", + lexicon="./lexicon-gb-en.txt,./lexicon-zh.txt", + voices_bin="./voices.bin", + ) + text = "来听一听, 这个是什么口音? How are you doing? Are you ok? Thank you! 你觉得中英文说得如何呢?" + + text = text.lower() + + voice = "bf_alice" + start = time.time() + audio = m(text, voice=voice) + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(audio) / m.sample_rate + real_time_factor = elapsed_seconds / audio_duration + + filename = f"kokoro_v1.0_{voice}_zh_en.wav" + sf.write( + filename, + audio, + samplerate=m.sample_rate, + subtype="PCM_16", + ) + print(f" Saved to {filename}") + print(f" Elapsed seconds: {elapsed_seconds:.3f}") + print(f" Audio duration in seconds: {audio_duration:.3f}") + print(f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}") + + +if __name__ == "__main__": + main()