This repository has been archived on 2025-08-26. You can view files and clone it, but cannot push or open issues or pull requests.
Files
enginex-mr_series-sherpa-onnx/scripts/kokoro/v1.0/generate_lexicon.py
2025-02-05 08:24:43 +08:00

90 lines
2.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
import json
from pypinyin import phrases_dict, pinyin_dict
from misaki import zh
from typing import List, Tuple
def generate_english_lexicon(
    kind: str, data_dir: str = "."
) -> List[Tuple[str, str]]:
    """Build the English word -> phoneme-string lexicon for one accent.

    Reads ``{kind}_gold.json`` and ``{kind}_silver.json`` from ``data_dir``
    and merges them with a small table of user-defined words.

    Each JSON file maps a word either to a phoneme string, or to a dict
    that contains at least a ``"DEFAULT"`` key; in the dict case only the
    ``"DEFAULT"`` pronunciation is kept.

    Args:
      kind:
        Accent to generate. Must be ``"us"`` or ``"gb"``.
      data_dir:
        Directory that contains the gold/silver JSON files. Defaults to the
        current working directory, which is where the files were always
        looked up before this parameter existed.

    Returns:
      A list of ``(lowercase_word, phonemes)`` pairs. User-defined words come
      first, followed by the dictionary words in merge order.

    Raises:
      AssertionError: If ``kind`` is not supported, or if a dictionary value
        is neither a string nor a dict with a ``"DEFAULT"`` key.
      OSError: If one of the JSON files cannot be opened.
    """
    assert kind in ("us", "gb"), kind

    # If you want to add new words, please add them to
    # the user_defined dict.
    user_defined = {
        "Kokoro": "kˈOkəɹO",
        "Misaki": "misˈɑki",
    }
    user_defined_lower = {k.lower(): v for k, v in user_defined.items()}

    with open(f"{data_dir}/{kind}_gold.json", encoding="utf-8") as f:
        gold = json.load(f)

    with open(f"{data_dir}/{kind}_silver.json", encoding="utf-8") as f:
        silver = json.load(f)

    # Words in the gold dictionary have a higher priority than those in the
    # silver dictionary, so gold is unpacked after silver below.
    english = {**silver, **gold}

    # NOTE(review): keys that differ only by case collapse to a single
    # lowercase entry here; the one iterated last overwrites the others.
    lexicon = dict()
    for k, v in english.items():
        k_lower = k.lower()

        # User-defined pronunciations always win over the dictionaries.
        if k_lower in user_defined_lower:
            print(f"{k} already exist in the user defined dict. Skip adding")
            continue

        if isinstance(v, str):
            lexicon[k_lower] = v
        else:
            assert isinstance(v, dict), (k, v)
            assert "DEFAULT" in v, (k, v)
            lexicon[k_lower] = v["DEFAULT"]

    return list(user_defined_lower.items()) + list(lexicon.items())
def generate_chinese_lexicon():
    """Build the Chinese lexicon from pypinyin's tables and misaki's G2P.

    Single characters come from the keys of ``pinyin_dict.pinyin_dict``
    (integer code points); only code points in U+4E00..U+9FFF are kept.
    Phrases come from the keys of ``phrases_dict.phrases_dict``. Every kept
    character and every phrase is converted with one ``zh.ZHG2P`` instance.

    Returns:
      A list of ``(text, g2p_output)`` pairs: all single characters first,
      then all phrases, each in the iteration order of its source dict.
    """
    single_chars = pinyin_dict.pinyin_dict
    multi_char_phrases = phrases_dict.phrases_dict

    # NOTE(review): the result of g2p(...) is presumably a phoneme string
    # (the original annotated it as ``str``) -- confirm against the
    # installed misaki version.
    g2p = zh.ZHG2P()

    # U+4E00..U+9FFF is the CJK Unified Ideographs block; everything else
    # in the single-character table is skipped.
    hanzi = (
        chr(code_point)
        for code_point in single_chars
        if 0x4E00 <= code_point <= 0x9FFF
    )

    entries = [(char, g2p(char)) for char in hanzi]
    entries += [(phrase, g2p(phrase)) for phrase in multi_char_phrases]
    return entries
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write a lexicon to ``filename`` as UTF-8 text, one entry per line.

    Each line has the form ``<word> <p1> <p2> ...``, where the phoneme
    string of the entry is split into its individual characters and those
    characters are joined with single spaces.

    Args:
      filename:
        Path of the output file. It is created or overwritten.
      lexicon:
        Iterable of ``(word, phonemes)`` pairs.
    """
    with open(filename, "w", encoding="utf-8") as out:
        # Joining a str directly iterates it character by character.
        out.writelines(
            f"{entry} {' '.join(pronunciation)}\n"
            for entry, pronunciation in lexicon
        )
def main():
    """Generate the US-English, GB-English and Chinese lexicon files.

    All three lexicons are built first; only then are they written, in
    order, to ``lexicon-us-en.txt``, ``lexicon-gb-en.txt`` and
    ``lexicon-zh.txt`` in the current working directory.
    """
    # Dict literals evaluate their values top to bottom, so the three
    # generators run in this order before any file is written.
    outputs = {
        "lexicon-us-en.txt": generate_english_lexicon("us"),
        "lexicon-gb-en.txt": generate_english_lexicon("gb"),
        "lexicon-zh.txt": generate_chinese_lexicon(),
    }

    for filename, entries in outputs.items():
        save(filename, entries)


if __name__ == "__main__":
    main()