Add Kokoro v1.1-zh (#1942)
This commit is contained in:
132
scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
Executable file
132
scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from misaki import zh
|
||||
from misaki.token import MToken
|
||||
from misaki.zh_frontend import ZH_MAP
|
||||
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
|
||||
|
||||
# Pronunciation overrides for heteronyms that pypinyin gets wrong by
# default.  Format: word -> one single-reading list per character,
# pinyin with a tone-number suffix (e.g. "hang2" = háng).
user_dict = {
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}

# Register the overrides with pypinyin so lookups use our readings.
load_phrases_dict(user_dict)

# Also merge them into pypinyin's raw phrase dict, which
# generate_chinese_lexicon() iterates below — otherwise the override
# words would get no lexicon entry of their own.
# NOTE(review): `update(**user_dict)` relies on keyword-unpacking of
# non-ASCII string keys; `update(user_dict)` would be equivalent here.
phrases_dict.phrases_dict.update(**user_dict)
|
||||
|
||||
|
||||
def process_text(self, text, with_erhua=True):
    """
    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut().

    Args:
      self:
        A misaki ``ZHFrontend`` instance (this is a free function called
        as ``process_text(g2p.frontend, text)``, not a bound method).
      text:
        The input text; treated as a single segment tagged ``"v"``
        since jieba segmentation has been removed.
      with_erhua:
        If True, merge erhua (儿化) finals via ``self._merge_erhua``.
    Returns:
      A tuple ``(result, tokens)`` where ``result`` is the concatenated
      phoneme string and ``tokens`` is the list of ``MToken`` objects.
    """
    # Whole input as one pseudo-segment; "v" is an arbitrary POS tag.
    seg_cut = [(text, "v")]
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    tokens = []
    # NOTE(review): pre_merge_for_modify() is applied a second time
    # here (also on the line above) — presumably idempotent, but this
    # looks like a copy/paste duplicate; confirm against upstream.
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    # Accumulated per-word initials/finals; only the commented-out
    # flattening below would have consumed them.
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        # Re-tag: all-CJK words get "X"; punctuation gets "x".
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
        if pos in ("x", "eng"):
            # Punctuation / English: pass through or attach as
            # trailing whitespace of the previous token.
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                    tokens.append(tk)
                elif tokens:
                    tokens[-1].whitespace += word
            continue
        elif (
            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
        ):
            # Separate adjacent Chinese tokens with "/" in the output.
            tokens[-1].whitespace = "/"

        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )

        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)

        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])

        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in self.punc:
            #     phones.append(c)
            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
                phones.append(v)
        # Normalize the retroflex "R" marker, then split so that every
        # tone digit starts its own element before mapping via ZH_MAP.
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        # Unknown phones map to self.unk.
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)

    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )

    return result, tokens
|
||||
|
||||
|
||||
def generate_chinese_lexicon():
    """Build a list of (word, phonemes) pairs.

    Covers every single character from pypinyin's pinyin dict that lies
    in the CJK Unified Ideographs block, followed by every phrase in
    pypinyin's phrase dict (including the user_dict overrides above).
    """
    g2p = zh.ZHG2P(version="1.1")
    frontend = g2p.frontend

    entries = []

    # Single characters: keys of pinyin_dict are Unicode codepoints.
    for codepoint in pinyin_dict.pinyin_dict:
        if 0x4E00 <= codepoint <= 0x9FFF:
            ch = chr(codepoint)
            entries.append((ch, process_text(frontend, ch)[0]))

    # Multi-character phrases.
    for phrase in phrases_dict.phrases_dict:
        entries.append((phrase, process_text(frontend, phrase)[0]))

    return entries
|
||||
|
||||
|
||||
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write the lexicon to *filename*, one UTF-8 line per entry.

    Each line is the word followed by its phonemes, every phoneme
    character separated by a single space: ``word p1 p2 ...``.
    """
    lines = [f"{word} {' '.join(phones)}\n" for word, phones in lexicon]
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(lines)
|
||||
|
||||
|
||||
def main():
    """Generate the Chinese lexicon and write it to lexicon-zh.txt."""
    # Renamed the local from `zh` to `lexicon`: the original name
    # shadowed the `misaki.zh` module imported at the top of the file,
    # which is confusing even though main() itself never used it.
    lexicon = generate_chinese_lexicon()

    save("lexicon-zh.txt", lexicon)
|
||||
|
||||
|
||||
# Script entry point: generate and save the lexicon when run directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user