# .github/workflows/export-kokoro.yaml
#
# Runs scripts/kokoro/run.sh, packs the resulting model files into
# kokoro-en-v0_19.tar.bz2, pushes them to a Hugging Face repository, and
# uploads the archive to the `tts-models` release of k2-fsa/sherpa-onnx.
name: export-kokoro-to-onnx

on:
  push:
    branches:
      - export-kokoro

  workflow_dispatch:

concurrency:
  group: export-kokoro-to-onnx-${{ github.ref }}
  cancel-in-progress: true

jobs:
  export-kokoro-to-onnx:
    # Only run in the two owning accounts, not in arbitrary forks.
    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj'
    name: export kokoro
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        shell: bash
        run: |
          pip install -q "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html

      - name: Run
        shell: bash
        run: |
          # espeak-ng-data is unpacked at the repository root; the later steps
          # copy it from there.
          curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
          tar xf espeak-ng-data.tar.bz2
          rm espeak-ng-data.tar.bz2
          cd scripts/kokoro
          ./run.sh

      - name: Collect results
        shell: bash
        run: |
          src=scripts/kokoro

          d=kokoro-en-v0_19
          mkdir $d
          cp -a LICENSE $d/LICENSE
          cp -a espeak-ng-data $d/
          cp -v $src/kokoro-v0_19_hf.onnx $d/model.onnx
          cp -v $src/voices.bin $d/
          cp -v $src/tokens.txt $d/
          cp -v $src/README-new.md $d/README.md
          ls -lh $d/
          tar cjfv $d.tar.bz2 $d
          rm -rf $d

          # Only the directory was removed above; $d still holds the archive's
          # base name. (The previous `$.tar.bz2` is not a variable expansion
          # and named a file that does not exist.)
          ls -h $d.tar.bz2

      - name: Publish to huggingface
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1
            export GIT_CLONE_PROTECTION_ACTIVE=false

            git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
            cd huggingface
            rm -rf ./*
            git fetch
            git pull

            git lfs track "cmn_dict"
            git lfs track "ru_dict"
            git lfs track "*.wav"

            cp -a ../espeak-ng-data ./
            mkdir -p test_wavs

            cp -v ../scripts/kokoro/kokoro-v0_19_hf.onnx ./model.onnx

            cp -v ../scripts/kokoro/kokoro-v0_19_hf-*.wav ./test_wavs/

            cp -v ../scripts/kokoro/tokens.txt .
            cp -v ../scripts/kokoro/voices.bin .
            cp -v ../scripts/kokoro/README-new.md ./README.md
            cp -v ../LICENSE ./

            git lfs track "*.onnx"
            git add .

            ls -lh

            git status

            git commit -m "add models"
            # A failed push is tolerated (`|| true`) so it does not fail the step.
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true

      - name: Release
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          file: ./*.tar.bz2
          overwrite: true
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: tts-models
# scripts/kokoro/add-meta-data.py (authors: Fangjun Kuang)
"""Add sherpa-onnx metadata to a Kokoro ONNX model.

The script rewrites the given ONNX file in place with new metadata and, unless
they already exist in the current directory, generates ``tokens.txt`` and
``voices.bin``.
"""

import argparse
import json
from pathlib import Path
from typing import Dict, Tuple

import numpy as np


def get_args():
    """Parse the command line.

    Returns:
      An ``argparse.Namespace`` with ``model`` (the ONNX file that is read and
      then overwritten) and ``voices`` (path to voices.json).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", type=str, required=True, help="input and output onnx model"
    )

    parser.add_argument("--voices", type=str, required=True, help="Path to voices.json")
    return parser.parse_args()


def load_voices(filename) -> Dict[str, np.ndarray]:
    """Load voices.json and convert every entry to a float32 array.

    Args:
      filename: Path to a JSON file holding an object that maps a voice name
        to (nested) lists of numbers.

    Returns:
      A dict with the same keys, in file order, whose values are
      ``np.float32`` arrays.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        raw = json.load(f)
    return {name: np.array(values, dtype=np.float32) for name, values in raw.items()}


def get_vocab() -> Dict[str, int]:
    """Return the symbol -> id table.

    Ids are positions in the concatenation pad + punctuation + ASCII letters +
    IPA symbols. A symbol that occurs more than once keeps the id of its LAST
    occurrence.
    """
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
    # Later duplicates overwrite earlier ones, exactly like the index loop
    # this replaces.
    return {symbol: index for index, symbol in enumerate(symbols)}


def generate_tokens():
    """Write ``tokens.txt`` (one ``"<symbol> <id>"`` per line) to the cwd."""
    token2id = get_vocab()
    with open("tokens.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{s} {i}\n" for s, i in token2id.items())


def _common_style_shape(voices: Dict[str, np.ndarray]) -> Tuple[int, ...]:
    """Return the single array shape shared by all voices.

    Raises:
      ValueError: If there are no voices or their shapes differ. Only one
        ``style_dim`` is stored in the metadata, so voices.bin cannot describe
        voices of different shapes.
    """
    if not voices:
        raise ValueError("No voices found; cannot derive style_dim")

    names = iter(voices)
    first = next(names)
    expected = voices[first].shape
    for name in names:
        if voices[name].shape != expected:
            raise ValueError(
                f"Voice {name!r} has shape {voices[name].shape}, "
                f"but {first!r} has shape {expected}"
            )
    return expected


def main():
    """Generate tokens.txt / voices.bin and rewrite the model's metadata."""
    # Imported here so the helpers above can be used without onnx installed.
    import onnx

    args = get_args()
    print(args.model, args.voices)

    model = onnx.load(args.model)
    voices = load_voices(args.voices)
    style_shape = _common_style_shape(voices)

    if Path("./tokens.txt").is_file():
        print("./tokens.txt exist, skip generating it")
    else:
        generate_tokens()

    keys = list(voices.keys())
    print(",".join(keys))

    # voices.bin is the raw float32 bytes of every voice, concatenated in the
    # same order as "speaker_names" below.
    if Path("./voices.bin").is_file():
        print("./voices.bin exists, skip generating it")
    else:
        with open("voices.bin", "wb") as f:
            for k in keys:
                f.write(voices[k].tobytes())

    meta_data = {
        "model_type": "kokoro",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style_shape)),
        "n_speakers": len(keys),
        "speaker_names": ",".join(keys),
        "model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
        "see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
        "see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
        "maintainer": "k2-fsa",
    }

    print(model.metadata_props)

    # Drop whatever metadata the model already carries before adding ours.
    while len(model.metadata_props):
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    onnx.save(model, args.model)

    print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")


if __name__ == "__main__":
    main()
meta = self.model.get_modelmeta().custom_metadata_map + print(meta) + dim = list(map(int, meta["style_dim"].split(","))) + speaker_names = meta["speaker_names"].split(",") + + self.voices = load_voices( + speaker_names=speaker_names, dim=dim, voices_bin=voices_bin + ) + + self.sample_rate = int(meta["sample_rate"]) + + print(list(self.voices.keys())) + # ['af', 'af_bella', 'af_nicole', 'af_sarah', 'af_sky', 'am_adam', + # 'am_michael', 'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis'] + # af -> (511, 1, 256) + self.max_len = self.voices[next(iter(self.voices))].shape[0] - 1 + + def __call__(self, text: str, voice): + tokens = phonemize_espeak(text, "en-us") + # tokens is List[List[str]] + # Each sentence is a List[str] + # len(tokens) == number of sentences + + tokens = sum(tokens, []) # flatten + tokens = "".join(tokens) + + tokens = tokens.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace( + "kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ" + ) + + tokens = list(tokens) + + token_ids = [self.token2id[i] for i in tokens] + token_ids = token_ids[: self.max_len] + + style = self.voices[voice][len(token_ids)] + + token_ids = [0, *token_ids, 0] + token_ids = np.array([token_ids], dtype=np.int64) + + speed = np.array([1.0], dtype=np.float32) + + audio = self.model.run( + [ + self.model.get_outputs()[0].name, + ], + { + self.model.get_inputs()[0].name: token_ids, + self.model.get_inputs()[1].name: style, + self.model.get_inputs()[2].name: speed, + }, + )[0] + return audio + + +def test(model, voice, text) -> np.ndarray: + pass + + +def main(): + args = get_args() + print(vars(args)) + show(args.model) + + # tokens = phonemize_espeak("how are you doing?", "en-us") + # [['h', 'ˌ', 'a', 'ʊ', ' ', 'ɑ', 'ː', 'ɹ', ' ', 'j', 'u', 'ː', ' ', 'd', 'ˈ', 'u', 'ː', 'ɪ', 'ŋ', '?']] + m = OnnxModel( + model_filename=args.model, voices_bin=args.voices_bin, tokens=args.tokens + ) + + text = ( + "Today as always, men fall into two groups: slaves and free men." 
+ + " Whoever does not have two-thirds of his day for himself, " + + "is a slave, whatever he may be: a statesman, a businessman, " + + "an official, or a scholar." + ) + + for i, voice in enumerate(m.voices.keys(), 1): + print(f"Testing {i}/{len(m.voices)} - {voice}/{args.model}") + + start = time.time() + audio = m(text, voice=voice) + end = time.time() + + elapsed_seconds = end - start + audio_duration = len(audio) / m.sample_rate + real_time_factor = elapsed_seconds / audio_duration + + filename = f"{Path(args.model).stem}-{voice}.wav" + sf.write( + filename, + audio, + samplerate=m.sample_rate, + subtype="PCM_16", + ) + print(f" Saved to {filename}") + print(f" Elapsed seconds: {elapsed_seconds:.3f}") + print(f" Audio duration in seconds: {audio_duration:.3f}") + print( + f" RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/melo-tts/export-onnx.py b/scripts/melo-tts/export-onnx.py index 69d9066e..84e7eaf1 100755 --- a/scripts/melo-tts/export-onnx.py +++ b/scripts/melo-tts/export-onnx.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# This script export ZH_EN TTS model, which supports both Chinese and English. +# This script exports ZH_EN TTS model, which supports both Chinese and English. # This model has only 1 speaker. from typing import Any, Dict