Refactor kokoro export (#2302)
- generate samples for https://k2-fsa.github.io/sherpa/onnx/tts/all/
- provide an int8 model for kokoro v0.19: kokoro-int8-en-v0_19.tar.bz2
This commit is contained in:
138
.github/workflows/export-kokoro.yaml
vendored
138
.github/workflows/export-kokoro.yaml
vendored
@@ -3,7 +3,7 @@ name: export-kokoro-to-onnx
|
|||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- fix-export-kokoro-1.0-2
|
- refactor-kokoro-2
|
||||||
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
@@ -34,24 +34,94 @@ jobs:
|
|||||||
- name: Install Python dependencies
|
- name: Install Python dependencies
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch
|
pip install kokoro "numpy<=1.26.4" onnx==1.16.0 onnxruntime==1.17.1 librosa soundfile piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html misaki[en] misaki[zh] torch==2.6.0+cpu -f https://download.pytorch.org/whl/torch sherpa-onnx
|
||||||
|
|
||||||
- name: Run
|
- name: Run
|
||||||
|
env:
|
||||||
|
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
|
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
|
||||||
tar xf espeak-ng-data.tar.bz2
|
tar xf espeak-ng-data.tar.bz2
|
||||||
rm espeak-ng-data.tar.bz2
|
rm espeak-ng-data.tar.bz2
|
||||||
|
cp -a ./espeak-ng-data ./scripts/kokoro/v0.19
|
||||||
|
cp -a ./espeak-ng-data ./scripts/kokoro/v1.0
|
||||||
|
cp -a ./espeak-ng-data ./scripts/kokoro/v1.1-zh
|
||||||
|
|
||||||
|
git config --global user.email "csukuangfj@gmail.com"
|
||||||
|
git config --global user.name "Fangjun Kuang"
|
||||||
|
|
||||||
cd scripts/kokoro
|
cd scripts/kokoro
|
||||||
v=${{ matrix.version }}
|
v=${{ matrix.version }}
|
||||||
if [[ $v = "0.19" ]]; then
|
if [[ $v = "0.19" ]]; then
|
||||||
|
cd v0.19
|
||||||
./run.sh
|
./run.sh
|
||||||
|
|
||||||
|
if false; then
|
||||||
|
# generate samples
|
||||||
|
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
|
||||||
|
mkdir -p hf/kokoro/v0.19/mp3
|
||||||
|
./generate_samples.py
|
||||||
|
pushd hf
|
||||||
|
git pull
|
||||||
|
git add .
|
||||||
|
git commit -m 'add kokoro samples for v0.19'
|
||||||
|
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
|
||||||
|
popd
|
||||||
|
rm -rf hf
|
||||||
|
fi
|
||||||
|
|
||||||
elif [[ $v == "1.0" ]]; then
|
elif [[ $v == "1.0" ]]; then
|
||||||
cd v1.0
|
cd v1.0
|
||||||
./run.sh
|
./run.sh
|
||||||
|
|
||||||
|
if false; then
|
||||||
|
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
|
||||||
|
mkdir -p hf/kokoro/v1.0/mp3
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
|
||||||
|
tar xvf dict.tar.bz2
|
||||||
|
rm dict.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
|
||||||
|
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
|
||||||
|
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
|
||||||
|
|
||||||
|
./generate_samples.py
|
||||||
|
pushd hf
|
||||||
|
git pull
|
||||||
|
git add .
|
||||||
|
git commit -m 'add kokoro samples for v1.0'
|
||||||
|
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
|
||||||
|
popd
|
||||||
|
rm -rf hf
|
||||||
|
fi
|
||||||
|
|
||||||
elif [[ $v == "1.1-zh" ]]; then
|
elif [[ $v == "1.1-zh" ]]; then
|
||||||
cd v1.1-zh
|
cd v1.1-zh
|
||||||
./run.sh
|
./run.sh
|
||||||
|
|
||||||
|
if false; then
|
||||||
|
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
|
||||||
|
mkdir -p hf/kokoro/v1.1-zh/mp3
|
||||||
|
|
||||||
|
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
|
||||||
|
tar xvf dict.tar.bz2
|
||||||
|
rm dict.tar.bz2
|
||||||
|
|
||||||
|
curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
|
||||||
|
curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
|
||||||
|
curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst
|
||||||
|
|
||||||
|
./generate_samples.py
|
||||||
|
pushd hf
|
||||||
|
git pull
|
||||||
|
git add .
|
||||||
|
git commit -m 'add kokoro samples for v1.1-zh'
|
||||||
|
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples main
|
||||||
|
popd
|
||||||
|
rm -rf hf
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "Unknown version $v"
|
echo "Unknown version $v"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -61,19 +131,39 @@ jobs:
|
|||||||
if: matrix.version == '0.19'
|
if: matrix.version == '0.19'
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
src=scripts/kokoro
|
src=scripts/kokoro/v0.19
|
||||||
|
|
||||||
d=kokoro-en-v0_19
|
d=kokoro-en-v0_19
|
||||||
|
|
||||||
mkdir $d
|
mkdir $d
|
||||||
cp -a LICENSE $d/LICENSE
|
cp -a LICENSE $d/LICENSE
|
||||||
cp -a espeak-ng-data $d/
|
cp -a espeak-ng-data $d/
|
||||||
cp -v $src/kokoro-v0_19.onnx $d/model.onnx
|
cp -v $src/model.onnx $d/model.onnx
|
||||||
cp -v $src/voices.bin $d/
|
cp -v $src/voices.bin $d/
|
||||||
cp -v $src/tokens.txt $d/
|
cp -v $src/tokens.txt $d/
|
||||||
cp -v $src/README-new.md $d/README.md
|
cp -v $src/../README.md $d/README.md
|
||||||
|
ls -lh $d/
|
||||||
|
tar cjfv $d.tar.bz2 $d
|
||||||
|
|
||||||
|
ls -lh $d.tar.bz2
|
||||||
|
|
||||||
|
- name: Collect results 0.19 (int8)
|
||||||
|
if: matrix.version == '0.19'
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
src=scripts/kokoro/v0.19
|
||||||
|
|
||||||
|
d=kokoro-int8-en-v0_19
|
||||||
|
|
||||||
|
mkdir $d
|
||||||
|
cp -a LICENSE $d/LICENSE
|
||||||
|
cp -a espeak-ng-data $d/
|
||||||
|
cp -v $src/model.int8.onnx $d/model.int8.onnx
|
||||||
|
cp -v $src/voices.bin $d/
|
||||||
|
cp -v $src/tokens.txt $d/
|
||||||
|
cp -v $src/../README.md $d/README.md
|
||||||
ls -lh $d/
|
ls -lh $d/
|
||||||
tar cjfv $d.tar.bz2 $d
|
tar cjfv $d.tar.bz2 $d
|
||||||
rm -rf $d
|
|
||||||
|
|
||||||
ls -lh $d.tar.bz2
|
ls -lh $d.tar.bz2
|
||||||
|
|
||||||
@@ -219,33 +309,30 @@ jobs:
|
|||||||
git config --global user.email "csukuangfj@gmail.com"
|
git config --global user.email "csukuangfj@gmail.com"
|
||||||
git config --global user.name "Fangjun Kuang"
|
git config --global user.name "Fangjun Kuang"
|
||||||
|
|
||||||
rm -rf huggingface
|
dirs=(
|
||||||
|
kokoro-en-v0_19
|
||||||
|
# kokoro-int8-en-v0_19
|
||||||
|
)
|
||||||
|
|
||||||
export GIT_LFS_SKIP_SMUDGE=1
|
export GIT_LFS_SKIP_SMUDGE=1
|
||||||
export GIT_CLONE_PROTECTION_ACTIVE=false
|
export GIT_CLONE_PROTECTION_ACTIVE=false
|
||||||
|
for d in ${dirs[@]}; do
|
||||||
|
rm -rf huggingface
|
||||||
|
|
||||||
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
|
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 huggingface
|
||||||
cd huggingface
|
cd huggingface
|
||||||
rm -rf ./*
|
rm -rf ./*
|
||||||
git fetch
|
|
||||||
git pull
|
|
||||||
|
|
||||||
git lfs track "cmn_dict"
|
|
||||||
git lfs track "ru_dict"
|
|
||||||
git lfs track "*.wav"
|
|
||||||
|
|
||||||
cp -a ../espeak-ng-data ./
|
|
||||||
mkdir -p test_wavs
|
|
||||||
|
|
||||||
cp -v ../scripts/kokoro/kokoro-v0_19.onnx ./model.onnx
|
|
||||||
|
|
||||||
cp -v ../scripts/kokoro/kokoro-v0_19-*.wav ./test_wavs/
|
|
||||||
|
|
||||||
cp -v ../scripts/kokoro/tokens.txt .
|
|
||||||
cp -v ../scripts/kokoro/voices.bin .
|
|
||||||
cp -v ../scripts/kokoro/README-new.md ./README.md
|
|
||||||
cp -v ../LICENSE ./
|
|
||||||
|
|
||||||
git lfs track "*.onnx"
|
git lfs track "*.onnx"
|
||||||
|
git lfs track af_dict
|
||||||
|
git lfs track ar_dict
|
||||||
|
git lfs track cmn_dict
|
||||||
|
git lfs track da_dict en_dict fa_dict hu_dict ia_dict it_dict lb_dict phondata ru_dict ta_dict
|
||||||
|
git lfs track ur_dict yue_dict
|
||||||
|
|
||||||
|
|
||||||
|
cp -a ../$d ./
|
||||||
|
|
||||||
git add .
|
git add .
|
||||||
|
|
||||||
ls -lh
|
ls -lh
|
||||||
@@ -254,6 +341,7 @@ jobs:
|
|||||||
|
|
||||||
git commit -m "add models"
|
git commit -m "add models"
|
||||||
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
|
git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/kokoro-en-v0_19 main || true
|
||||||
|
done
|
||||||
|
|
||||||
- name: Publish to huggingface 1.0 float32
|
- name: Publish to huggingface 1.0 float32
|
||||||
if: matrix.version == '1.0'
|
if: matrix.version == '1.0'
|
||||||
|
|||||||
1
scripts/kokoro/.gitignore
vendored
1
scripts/kokoro/.gitignore
vendored
@@ -1,3 +1,4 @@
|
|||||||
|
espeak-ng-data
|
||||||
voices.json
|
voices.json
|
||||||
voices.bin
|
voices.bin
|
||||||
README-new.md
|
README-new.md
|
||||||
|
|||||||
@@ -1,10 +1,6 @@
|
|||||||
# Introduction
|
# Introduction
|
||||||
|
|
||||||
This folder contains scripts for adding meta data to models
|
Please see also
|
||||||
from https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
|
https://huggingface.co/hexgrad/Kokoro-82M
|
||||||
|
|
||||||
See also
|
|
||||||
https://huggingface.co/hexgrad/Kokoro-82M/tree/main
|
|
||||||
and
|
and
|
||||||
https://huggingface.co/spaces/hexgrad/Kokoro-TTS
|
https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
|
||||||
|
|
||||||
|
|||||||
@@ -1,117 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import onnx
|
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--model", type=str, required=True, help="input and output onnx model"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument("--voices", type=str, required=True, help="Path to voices.json")
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def load_voices(filename):
|
|
||||||
with open(filename) as f:
|
|
||||||
voices = json.load(f)
|
|
||||||
for key in voices:
|
|
||||||
voices[key] = np.array(voices[key], dtype=np.float32)
|
|
||||||
return voices
|
|
||||||
|
|
||||||
|
|
||||||
def get_vocab():
|
|
||||||
_pad = "$"
|
|
||||||
_punctuation = ';:,.!?¡¿—…"«»“” '
|
|
||||||
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
|
||||||
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
|
||||||
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
|
|
||||||
dicts = {}
|
|
||||||
for i in range(len((symbols))):
|
|
||||||
dicts[symbols[i]] = i
|
|
||||||
return dicts
|
|
||||||
|
|
||||||
|
|
||||||
def generate_tokens():
|
|
||||||
token2id = get_vocab()
|
|
||||||
with open("tokens.txt", "w", encoding="utf-8") as f:
|
|
||||||
for s, i in token2id.items():
|
|
||||||
f.write(f"{s} {i}\n")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
args = get_args()
|
|
||||||
print(args.model, args.voices)
|
|
||||||
|
|
||||||
model = onnx.load(args.model)
|
|
||||||
voices = load_voices(args.voices)
|
|
||||||
|
|
||||||
if Path("./tokens.txt").is_file():
|
|
||||||
print("./tokens.txt exist, skip generating it")
|
|
||||||
else:
|
|
||||||
generate_tokens()
|
|
||||||
|
|
||||||
keys = list(voices.keys())
|
|
||||||
print(",".join(keys))
|
|
||||||
|
|
||||||
if Path("./voices.bin").is_file():
|
|
||||||
print("./voices.bin exists, skip generating it")
|
|
||||||
else:
|
|
||||||
with open("voices.bin", "wb") as f:
|
|
||||||
for k in keys:
|
|
||||||
f.write(voices[k].tobytes())
|
|
||||||
|
|
||||||
speaker2id_str = ""
|
|
||||||
id2speaker_str = ""
|
|
||||||
sep = ""
|
|
||||||
for i, s in enumerate(keys):
|
|
||||||
speaker2id_str += f"{sep}{s}->{i}"
|
|
||||||
id2speaker_str += f"{sep}{i}->{s}"
|
|
||||||
sep = ","
|
|
||||||
|
|
||||||
meta_data = {
|
|
||||||
"model_type": "kokoro",
|
|
||||||
"language": "English",
|
|
||||||
"has_espeak": 1,
|
|
||||||
"sample_rate": 24000,
|
|
||||||
"version": 1,
|
|
||||||
"voice": "en-us",
|
|
||||||
"style_dim": ",".join(map(str, voices[keys[0]].shape)),
|
|
||||||
"n_speakers": len(keys),
|
|
||||||
"speaker2id": speaker2id_str,
|
|
||||||
"id2speaker": id2speaker_str,
|
|
||||||
"speaker_names": ",".join(keys),
|
|
||||||
"model_url": "https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files",
|
|
||||||
"see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
|
|
||||||
"see_also_2": "https://huggingface.co/hexgrad/Kokoro-82M",
|
|
||||||
"maintainer": "k2-fsa",
|
|
||||||
}
|
|
||||||
|
|
||||||
print(model.metadata_props)
|
|
||||||
|
|
||||||
while len(model.metadata_props):
|
|
||||||
model.metadata_props.pop()
|
|
||||||
|
|
||||||
for key, value in meta_data.items():
|
|
||||||
meta = model.metadata_props.add()
|
|
||||||
meta.key = key
|
|
||||||
meta.value = str(value)
|
|
||||||
print("--------------------")
|
|
||||||
|
|
||||||
print(model.metadata_props)
|
|
||||||
|
|
||||||
onnx.save(model, args.model)
|
|
||||||
|
|
||||||
print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
cat > README-new.md <<EOF
|
|
||||||
# Introduction
|
|
||||||
|
|
||||||
Files in this folder are from
|
|
||||||
https://github.com/thewh1teagle/kokoro-onnx/releases/tag/model-files
|
|
||||||
|
|
||||||
Please see also
|
|
||||||
https://huggingface.co/hexgrad/Kokoro-82M
|
|
||||||
and
|
|
||||||
https://huggingface.co/hexgrad/Kokoro-82M/discussions/14
|
|
||||||
EOF
|
|
||||||
|
|
||||||
files=(
|
|
||||||
# kokoro-v0_19_hf.onnx
|
|
||||||
kokoro-v0_19.onnx
|
|
||||||
# kokoro-quant.onnx
|
|
||||||
# kokoro-quant-convinteger.onnx
|
|
||||||
voices.json
|
|
||||||
)
|
|
||||||
|
|
||||||
for f in ${files[@]}; do
|
|
||||||
if [ ! -f ./$f ]; then
|
|
||||||
curl -SL -O https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/$f
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
models=(
|
|
||||||
kokoro-v0_19
|
|
||||||
# kokoro-quant
|
|
||||||
# kokoro-quant-convinteger
|
|
||||||
# kokoro-v0_19_hf
|
|
||||||
)
|
|
||||||
|
|
||||||
for m in ${models[@]}; do
|
|
||||||
./add_meta_data.py --model $m.onnx --voices ./voices.json
|
|
||||||
done
|
|
||||||
|
|
||||||
ls -l
|
|
||||||
echo "----------"
|
|
||||||
ls -lh
|
|
||||||
|
|
||||||
for m in ${models[@]}; do
|
|
||||||
./test.py --model $m.onnx --voices-bin ./voices.bin --tokens ./tokens.txt
|
|
||||||
done
|
|
||||||
ls -lh
|
|
||||||
1
scripts/kokoro/v0.19/.gitignore
vendored
Normal file
1
scripts/kokoro/v0.19/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
kLegacy
|
||||||
0
scripts/kokoro/v0.19/__init__.py
Normal file
0
scripts/kokoro/v0.19/__init__.py
Normal file
77
scripts/kokoro/v0.19/add_meta_data.py
Executable file
77
scripts/kokoro/v0.19/add_meta_data.py
Executable file
@@ -0,0 +1,77 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import onnx
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from generate_voices_bin import speaker2id
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
"--model", type=str, required=True, help="input and output onnx model"
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = get_args()
|
||||||
|
print(args.model)
|
||||||
|
|
||||||
|
model = onnx.load(args.model)
|
||||||
|
|
||||||
|
style = torch.load(
|
||||||
|
"./kLegacy/v0.19/voices/af.pt", weights_only=True, map_location="cpu"
|
||||||
|
)
|
||||||
|
|
||||||
|
speaker2id_str = ""
|
||||||
|
id2speaker_str = ""
|
||||||
|
sep = ""
|
||||||
|
for s, i in speaker2id.items():
|
||||||
|
speaker2id_str += f"{sep}{s}->{i}"
|
||||||
|
id2speaker_str += f"{sep}{i}->{s}"
|
||||||
|
sep = ","
|
||||||
|
|
||||||
|
meta_data = {
|
||||||
|
"model_type": "kokoro",
|
||||||
|
"language": "English",
|
||||||
|
"has_espeak": 1,
|
||||||
|
"sample_rate": 24000,
|
||||||
|
"version": 1,
|
||||||
|
"voice": "en-us",
|
||||||
|
"style_dim": ",".join(map(str, style.shape)),
|
||||||
|
"n_speakers": len(speaker2id),
|
||||||
|
"speaker2id": speaker2id_str,
|
||||||
|
"id2speaker": id2speaker_str,
|
||||||
|
"speaker_names": ",".join(map(str, speaker2id.keys())),
|
||||||
|
"model_url": "https://huggingface.co/hexgrad/kLegacy/",
|
||||||
|
"see_also": "https://huggingface.co/spaces/hexgrad/Kokoro-TTS",
|
||||||
|
"maintainer": "k2-fsa",
|
||||||
|
"comment": "This is kokoro v0.19 and supports only English",
|
||||||
|
}
|
||||||
|
|
||||||
|
print(model.metadata_props)
|
||||||
|
|
||||||
|
while len(model.metadata_props):
|
||||||
|
model.metadata_props.pop()
|
||||||
|
|
||||||
|
for key, value in meta_data.items():
|
||||||
|
meta = model.metadata_props.add()
|
||||||
|
meta.key = key
|
||||||
|
meta.value = str(value)
|
||||||
|
print("--------------------")
|
||||||
|
|
||||||
|
print(model.metadata_props)
|
||||||
|
|
||||||
|
onnx.save(model, args.model)
|
||||||
|
|
||||||
|
print(f"Please see {args.model}, ./voices.bin, and ./tokens.txt")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
47
scripts/kokoro/v0.19/dynamic_quantization.py
Executable file
47
scripts/kokoro/v0.19/dynamic_quantization.py
Executable file
@@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import onnxruntime
|
||||||
|
from onnxruntime.quantization import QuantType, quantize_dynamic
|
||||||
|
|
||||||
|
|
||||||
|
def show(filename):
|
||||||
|
session_opts = onnxruntime.SessionOptions()
|
||||||
|
session_opts.log_severity_level = 3
|
||||||
|
sess = onnxruntime.InferenceSession(filename, session_opts)
|
||||||
|
for i in sess.get_inputs():
|
||||||
|
print(i)
|
||||||
|
|
||||||
|
print("-----")
|
||||||
|
|
||||||
|
for i in sess.get_outputs():
|
||||||
|
print(i)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
|
||||||
|
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
|
||||||
|
NodeArg(name='speed', type='tensor(float)', shape=[1])
|
||||||
|
-----
|
||||||
|
NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
show("./model.onnx")
|
||||||
|
|
||||||
|
if not Path("./model.int8.onnx").is_file():
|
||||||
|
quantize_dynamic(
|
||||||
|
model_input="model.onnx",
|
||||||
|
model_output="model.int8.onnx",
|
||||||
|
# op_types_to_quantize=["MatMul"],
|
||||||
|
weight_type=QuantType.QUInt8,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print("./model.int8.onnx exists - skip")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
40
scripts/kokoro/v0.19/generate_samples.py
Executable file
40
scripts/kokoro/v0.19/generate_samples.py
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
"""
|
||||||
|
Generate samples for
|
||||||
|
https://k2-fsa.github.io/sherpa/onnx/tts/all/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sherpa_onnx
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from generate_voices_bin import speaker2id
|
||||||
|
|
||||||
|
config = sherpa_onnx.OfflineTtsConfig(
|
||||||
|
model=sherpa_onnx.OfflineTtsModelConfig(
|
||||||
|
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
|
||||||
|
model="./model.onnx",
|
||||||
|
voices="./voices.bin",
|
||||||
|
tokens="./tokens.txt",
|
||||||
|
data_dir="./espeak-ng-data",
|
||||||
|
),
|
||||||
|
num_threads=2,
|
||||||
|
),
|
||||||
|
max_num_sentences=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not config.validate():
|
||||||
|
raise ValueError("Please check your config")
|
||||||
|
|
||||||
|
tts = sherpa_onnx.OfflineTts(config)
|
||||||
|
text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."
|
||||||
|
|
||||||
|
for s, i in speaker2id.items():
|
||||||
|
print(s, i, len(speaker2id))
|
||||||
|
audio = tts.generate(text, sid=i, speed=1.0)
|
||||||
|
|
||||||
|
sf.write(
|
||||||
|
f"./hf/kokoro/v0.19/mp3/{i}-{s}.mp3",
|
||||||
|
audio.samples,
|
||||||
|
samplerate=audio.sample_rate,
|
||||||
|
)
|
||||||
26
scripts/kokoro/v0.19/generate_tokens.py
Executable file
26
scripts/kokoro/v0.19/generate_tokens.py
Executable file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
|
||||||
|
def get_vocab():
|
||||||
|
# https://huggingface.co/hexgrad/kLegacy/blob/main/v0.19/kokoro.py#L75
|
||||||
|
_pad = "$"
|
||||||
|
_punctuation = ';:,.!?¡¿—…"«»“” '
|
||||||
|
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||||
|
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
||||||
|
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
|
||||||
|
dicts = {}
|
||||||
|
for i in range(len((symbols))):
|
||||||
|
dicts[symbols[i]] = i
|
||||||
|
return dicts
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
token2id = get_vocab()
|
||||||
|
with open("tokens.txt", "w", encoding="utf-8") as f:
|
||||||
|
for s, i in token2id.items():
|
||||||
|
f.write(f"{s} {i}\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
42
scripts/kokoro/v0.19/generate_voices_bin.py
Executable file
42
scripts/kokoro/v0.19/generate_voices_bin.py
Executable file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
import torch
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
id2speaker = {
|
||||||
|
0: "af",
|
||||||
|
1: "af_bella",
|
||||||
|
2: "af_nicole",
|
||||||
|
3: "af_sarah",
|
||||||
|
4: "af_sky",
|
||||||
|
5: "am_adam",
|
||||||
|
6: "am_michael",
|
||||||
|
7: "bf_emma",
|
||||||
|
8: "bf_isabella",
|
||||||
|
9: "bm_george",
|
||||||
|
10: "bm_lewis",
|
||||||
|
}
|
||||||
|
|
||||||
|
speaker2id = {speaker: idx for idx, speaker in id2speaker.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if Path("./voices.bin").is_file():
|
||||||
|
print("./voices.bin exists - skip")
|
||||||
|
return
|
||||||
|
|
||||||
|
with open("voices.bin", "wb") as f:
|
||||||
|
for _, speaker in id2speaker.items():
|
||||||
|
m = torch.load(
|
||||||
|
f"kLegacy/v0.19/voices/{speaker}.pt",
|
||||||
|
weights_only=True,
|
||||||
|
map_location="cpu",
|
||||||
|
).numpy()
|
||||||
|
# m.shape (511, 1, 256)
|
||||||
|
|
||||||
|
f.write(m.tobytes())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
36
scripts/kokoro/v0.19/run.sh
Executable file
36
scripts/kokoro/v0.19/run.sh
Executable file
@@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
cat > README-new.md <<EOF
|
||||||
|
# Introduction
|
||||||
|
|
||||||
|
Files in this folder are from
|
||||||
|
git clone https://huggingface.co/hexgrad/kLegacy
|
||||||
|
EOF
|
||||||
|
|
||||||
|
if [ ! -d kLegacy ]; then
|
||||||
|
git clone https://huggingface.co/hexgrad/kLegacy
|
||||||
|
pushd kLegacy/v0.19
|
||||||
|
git lfs pull
|
||||||
|
popd
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ./voices.bin ]; then
|
||||||
|
./generate_voices_bin.py
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ./tokens.txt ]; then
|
||||||
|
./generate_tokens.py
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ./model.onnx ]; then
|
||||||
|
mv kLegacy/v0.19/kokoro-v0_19.onnx ./model.onnx
|
||||||
|
fi
|
||||||
|
|
||||||
|
./add_meta_data.py --model ./model.onnx
|
||||||
|
|
||||||
|
if [ ! -f model.int8.onnx ]; then
|
||||||
|
./dynamic_quantization.py
|
||||||
|
fi
|
||||||
@@ -67,11 +67,13 @@ def show(filename):
|
|||||||
print(i)
|
print(i)
|
||||||
|
|
||||||
|
|
||||||
# NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
|
"""
|
||||||
# NodeArg(name='style', type='tensor(float)', shape=[1, 256])
|
NodeArg(name='tokens', type='tensor(int64)', shape=[1, 'tokens1'])
|
||||||
# NodeArg(name='speed', type='tensor(float)', shape=[1])
|
NodeArg(name='style', type='tensor(float)', shape=[1, 256])
|
||||||
# -----
|
NodeArg(name='speed', type='tensor(float)', shape=[1])
|
||||||
# NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
|
-----
|
||||||
|
NodeArg(name='audio', type='tensor(float)', shape=['audio0'])
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def load_tokens(filename: str) -> Dict[str, int]:
|
def load_tokens(filename: str) -> Dict[str, int]:
|
||||||
@@ -171,10 +173,6 @@ class OnnxModel:
|
|||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
def test(model, voice, text) -> np.ndarray:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = get_args()
|
args = get_args()
|
||||||
print(vars(args))
|
print(vars(args))
|
||||||
46
scripts/kokoro/v1.0/generate_samples.py
Executable file
46
scripts/kokoro/v1.0/generate_samples.py
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
"""
|
||||||
|
Generate samples for
|
||||||
|
https://k2-fsa.github.io/sherpa/onnx/tts/all/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sherpa_onnx
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from generate_voices_bin import speaker2id
|
||||||
|
|
||||||
|
config = sherpa_onnx.OfflineTtsConfig(
|
||||||
|
model=sherpa_onnx.OfflineTtsModelConfig(
|
||||||
|
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
|
||||||
|
model="./kokoro.onnx",
|
||||||
|
voices="./voices.bin",
|
||||||
|
tokens="./tokens.txt",
|
||||||
|
data_dir="./espeak-ng-data",
|
||||||
|
dict_dir="./dict",
|
||||||
|
lexicon="./lexicon-zh.txt,./lexicon-us-en.txt",
|
||||||
|
),
|
||||||
|
num_threads=2,
|
||||||
|
debug=True,
|
||||||
|
),
|
||||||
|
rule_fsts="./phone-zh.fst,./date-zh.fst,./number-zh.fst",
|
||||||
|
max_num_sentences=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not config.validate():
|
||||||
|
raise ValueError("Please check your config")
|
||||||
|
|
||||||
|
tts = sherpa_onnx.OfflineTts(config)
|
||||||
|
text = "This model supports both Chinese and English. 小米的核心价值观是什么?答案是真诚热爱!有困难,请拨打110 或者18601200909。I am learning 机器学习. 我在研究 machine learning。What do you think 中英文说的如何呢? 今天是 2025年6月18号."
|
||||||
|
|
||||||
|
print("text", text)
|
||||||
|
|
||||||
|
for s, i in speaker2id.items():
|
||||||
|
print(s, i, len(speaker2id))
|
||||||
|
audio = tts.generate(text, sid=i, speed=1.0)
|
||||||
|
|
||||||
|
sf.write(
|
||||||
|
f"./hf/kokoro/v1.0/mp3/{i}-{s}.mp3",
|
||||||
|
audio.samples,
|
||||||
|
samplerate=audio.sample_rate,
|
||||||
|
)
|
||||||
46
scripts/kokoro/v1.1-zh/generate_samples.py
Executable file
46
scripts/kokoro/v1.1-zh/generate_samples.py
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
"""
|
||||||
|
Generate samples for
|
||||||
|
https://k2-fsa.github.io/sherpa/onnx/tts/all/
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sherpa_onnx
|
||||||
|
import soundfile as sf
|
||||||
|
|
||||||
|
from generate_voices_bin import speaker2id
|
||||||
|
|
||||||
|
config = sherpa_onnx.OfflineTtsConfig(
|
||||||
|
model=sherpa_onnx.OfflineTtsModelConfig(
|
||||||
|
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
|
||||||
|
model="./kokoro.onnx",
|
||||||
|
voices="./voices.bin",
|
||||||
|
tokens="./tokens.txt",
|
||||||
|
data_dir="./espeak-ng-data",
|
||||||
|
dict_dir="./dict",
|
||||||
|
lexicon="./lexicon-zh.txt,./lexicon-us-en.txt",
|
||||||
|
),
|
||||||
|
num_threads=2,
|
||||||
|
debug=True,
|
||||||
|
),
|
||||||
|
rule_fsts="./phone-zh.fst,./date-zh.fst,./number-zh.fst",
|
||||||
|
max_num_sentences=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not config.validate():
|
||||||
|
raise ValueError("Please check your config")
|
||||||
|
|
||||||
|
tts = sherpa_onnx.OfflineTts(config)
|
||||||
|
text = "This model supports both Chinese and English. 小米的核心价值观是什么?答案是真诚热爱!有困难,请拨打110 或者18601200909。I am learning 机器学习. 我在研究 machine learning。What do you think 中英文说的如何呢? 今天是 2025年6月18号."
|
||||||
|
|
||||||
|
print("text", text)
|
||||||
|
|
||||||
|
for s, i in speaker2id.items():
|
||||||
|
print(s, i, len(speaker2id))
|
||||||
|
audio = tts.generate(text, sid=i, speed=1.0)
|
||||||
|
|
||||||
|
sf.write(
|
||||||
|
f"./hf/kokoro/v1.1-zh/mp3/{i}-{s}.mp3",
|
||||||
|
audio.samples,
|
||||||
|
samplerate=audio.sample_rate,
|
||||||
|
)
|
||||||
@@ -11,6 +11,8 @@ fi
|
|||||||
if [ ! -f config.json ]; then
|
if [ ! -f config.json ]; then
|
||||||
# see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
|
# see https://huggingface.co/hexgrad/Kokoro-82M/blob/main/config.json
|
||||||
curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json
|
curl -SL -O https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh/resolve/main/config.json
|
||||||
|
mkdir -p Kokoro-82M
|
||||||
|
cp ./config.json ./Kokoro-82M
|
||||||
fi
|
fi
|
||||||
|
|
||||||
voices=(
|
voices=(
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ static void PybindOfflineTtsConfig(py::module *m) {
|
|||||||
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
.def(py::init<const OfflineTtsModelConfig &, const std::string &,
|
||||||
const std::string &, int32_t, float>(),
|
const std::string &, int32_t, float>(),
|
||||||
py::arg("model"), py::arg("rule_fsts") = "",
|
py::arg("model"), py::arg("rule_fsts") = "",
|
||||||
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 2,
|
py::arg("rule_fars") = "", py::arg("max_num_sentences") = 1,
|
||||||
py::arg("silence_scale") = 0.2)
|
py::arg("silence_scale") = 0.2)
|
||||||
.def_readwrite("model", &PyClass::model)
|
.def_readwrite("model", &PyClass::model)
|
||||||
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
.def_readwrite("rule_fsts", &PyClass::rule_fsts)
|
||||||
|
|||||||
Reference in New Issue
Block a user