90 lines
2.3 KiB
Python
Executable File
90 lines
2.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||
|
||
import json
|
||
from pypinyin import phrases_dict, pinyin_dict
|
||
from misaki import zh
|
||
from typing import List, Tuple
|
||
|
||
|
||
def generate_english_lexicon(kind: str):
|
||
assert kind in ("us", "gb"), kind
|
||
# If you want to add new words, please add them to
|
||
# the user_defined dict.
|
||
user_defined = {
|
||
"Kokoro": "kˈOkəɹO",
|
||
"Misaki": "misˈɑki",
|
||
}
|
||
|
||
user_defined_lower = dict()
|
||
for k, v in user_defined.items():
|
||
user_defined_lower[k.lower()] = v
|
||
|
||
with open(f"./{kind}_gold.json", encoding="utf-8") as f:
|
||
gold = json.load(f)
|
||
|
||
with open(f"./{kind}_silver.json", encoding="utf-8") as f:
|
||
silver = json.load(f)
|
||
|
||
# words in us_gold has a higher priority than those in s_silver, so
|
||
# we put us_gold after us_silver below
|
||
english = {**silver, **gold}
|
||
|
||
lexicon = dict()
|
||
for k, v in english.items():
|
||
k_lower = k.lower()
|
||
|
||
if k_lower in user_defined_lower:
|
||
print(f"{k} already exist in the user defined dict. Skip adding")
|
||
continue
|
||
|
||
if isinstance(v, str):
|
||
lexicon[k_lower] = v
|
||
else:
|
||
assert isinstance(v, dict), (k, v)
|
||
assert "DEFAULT" in v, (k, v)
|
||
lexicon[k_lower] = v["DEFAULT"]
|
||
|
||
return list(user_defined_lower.items()) + list(lexicon.items())
|
||
|
||
|
||
def generate_chinese_lexicon():
|
||
word_dict = pinyin_dict.pinyin_dict
|
||
phrases = phrases_dict.phrases_dict
|
||
|
||
g2p = zh.ZHG2P()
|
||
lexicon = []
|
||
|
||
for key in word_dict:
|
||
if not (0x4E00 <= key <= 0x9FFF):
|
||
continue
|
||
w = chr(key)
|
||
tokens: str = g2p(w)
|
||
lexicon.append((w, tokens))
|
||
|
||
for key in phrases:
|
||
tokens: str = g2p(key)
|
||
lexicon.append((key, tokens))
|
||
return lexicon
|
||
|
||
|
||
def save(filename: str, lexicon: List[Tuple[str, str]]):
|
||
with open(filename, "w", encoding="utf-8") as f:
|
||
for word, phones in lexicon:
|
||
tokens = " ".join(list(phones))
|
||
f.write(f"{word} {tokens}\n")
|
||
|
||
|
||
def main():
|
||
us = generate_english_lexicon("us")
|
||
gb = generate_english_lexicon("gb")
|
||
zh = generate_chinese_lexicon()
|
||
|
||
save("lexicon-us-en.txt", us)
|
||
save("lexicon-gb-en.txt", gb)
|
||
save("lexicon-zh.txt", zh)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|