Add Kokoro v1.1-zh (#1942)
This commit is contained in:
132
scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
Executable file
132
scripts/kokoro/v1.1-zh/generate_lexicon_zh.py
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from misaki import zh
|
||||
from misaki.token import MToken
|
||||
from misaki.zh_frontend import ZH_MAP
|
||||
from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
|
||||
|
||||
# Pronunciation overrides for heteronyms that pypinyin gets wrong by
# default.  Format: word -> one single-reading list per character,
# pinyin with a tone-number suffix (e.g. "hang2" = háng).
user_dict = {
    "还田": [["huan2"], ["tian2"]],
    "行长": [["hang2"], ["zhang3"]],
    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
}

# Register the overrides with pypinyin so lookups use our readings.
load_phrases_dict(user_dict)

# Also merge them into pypinyin's raw phrase dict, which
# generate_chinese_lexicon() iterates below — otherwise the override
# words would get no lexicon entry of their own.
# NOTE(review): `update(**user_dict)` relies on keyword-unpacking of
# non-ASCII string keys; `update(user_dict)` would be equivalent here.
phrases_dict.phrases_dict.update(**user_dict)
|
||||
|
||||
|
||||
def process_text(self, text, with_erhua=True):
    """
    This function is modified from
    https://github.com/hexgrad/misaki/blob/main/misaki/zh_frontend.py#L155

    Note that we have removed jieba.posseg.lcut().

    Args:
      self:
        A misaki ``ZHFrontend`` instance (this is a free function called
        as ``process_text(g2p.frontend, text)``, not a bound method).
      text:
        The input text; treated as a single segment tagged ``"v"``
        since jieba segmentation has been removed.
      with_erhua:
        If True, merge erhua (儿化) finals via ``self._merge_erhua``.
    Returns:
      A tuple ``(result, tokens)`` where ``result`` is the concatenated
      phoneme string and ``tokens`` is the list of ``MToken`` objects.
    """
    # Whole input as one pseudo-segment; "v" is an arbitrary POS tag.
    seg_cut = [(text, "v")]
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    tokens = []
    # NOTE(review): pre_merge_for_modify() is applied a second time
    # here (also on the line above) — presumably idempotent, but this
    # looks like a copy/paste duplicate; confirm against upstream.
    seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
    # Accumulated per-word initials/finals; only the commented-out
    # flattening below would have consumed them.
    initials = []
    finals = []
    # pypinyin, g2pM
    for word, pos in seg_cut:
        # Re-tag: all-CJK words get "X"; punctuation gets "x".
        if pos == "x" and "\u4E00" <= min(word) and max(word) <= "\u9FFF":
            pos = "X"
        elif pos != "x" and word in self.punc:
            pos = "x"
        tk = MToken(text=word, tag=pos, whitespace="")
        if pos in ("x", "eng"):
            # Punctuation / English: pass through or attach as
            # trailing whitespace of the previous token.
            if not word.isspace():
                if pos == "x" and word in self.punc:
                    tk.phonemes = word
                    tokens.append(tk)
                elif tokens:
                    tokens[-1].whitespace += word
            continue
        elif (
            tokens and tokens[-1].tag not in ("x", "eng") and not tokens[-1].whitespace
        ):
            # Separate adjacent Chinese tokens with "/" in the output.
            tokens[-1].whitespace = "/"

        # g2p
        sub_initials, sub_finals = self._get_initials_finals(word)
        # tone sandhi
        sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals)
        # er hua
        if with_erhua:
            sub_initials, sub_finals = self._merge_erhua(
                sub_initials, sub_finals, word, pos
            )

        initials.append(sub_initials)
        finals.append(sub_finals)
        # assert len(sub_initials) == len(sub_finals) == len(word)

        # sum(iterable[, start])
        # initials = sum(initials, [])
        # finals = sum(finals, [])

        phones = []
        for c, v in zip(sub_initials, sub_finals):
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c:
                phones.append(c)
            # replace punctuation by ` `
            # if c and c in self.punc:
            #     phones.append(c)
            if v and (v not in self.punc or v != c):  # and v not in self.rhy_phns:
                phones.append(v)
        # Normalize the retroflex "R" marker, then split so that every
        # tone digit starts its own element before mapping via ZH_MAP.
        phones = "_".join(phones).replace("_eR", "_er").replace("R", "_R")
        phones = re.sub(r"(?=\d)", "_", phones).split("_")
        # Unknown phones map to self.unk.
        tk.phonemes = "".join(ZH_MAP.get(p, self.unk) for p in phones)
        tokens.append(tk)

    result = "".join(
        (self.unk if tk.phonemes is None else tk.phonemes) + tk.whitespace
        for tk in tokens
    )

    return result, tokens
|
||||
|
||||
|
||||
def generate_chinese_lexicon():
    """Build a list of (word, phonemes) pairs.

    Covers every single character from pypinyin's pinyin dict that lies
    in the CJK Unified Ideographs block, followed by every phrase in
    pypinyin's phrase dict (including the user_dict overrides above).
    """
    g2p = zh.ZHG2P(version="1.1")
    frontend = g2p.frontend

    entries = []

    # Single characters: keys of pinyin_dict are Unicode codepoints.
    for codepoint in pinyin_dict.pinyin_dict:
        if 0x4E00 <= codepoint <= 0x9FFF:
            ch = chr(codepoint)
            entries.append((ch, process_text(frontend, ch)[0]))

    # Multi-character phrases.
    for phrase in phrases_dict.phrases_dict:
        entries.append((phrase, process_text(frontend, phrase)[0]))

    return entries
|
||||
|
||||
|
||||
def save(filename: str, lexicon: List[Tuple[str, str]]):
    """Write the lexicon to *filename*, one UTF-8 line per entry.

    Each line is the word followed by its phonemes, every phoneme
    character separated by a single space: ``word p1 p2 ...``.
    """
    lines = [f"{word} {' '.join(phones)}\n" for word, phones in lexicon]
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(lines)
|
||||
|
||||
|
||||
def main():
    """Generate the Chinese lexicon and write it to lexicon-zh.txt."""
    # Renamed the local from `zh` to `lexicon`: the original name
    # shadowed the `misaki.zh` module imported at the top of the file,
    # which is confusing even though main() itself never used it.
    lexicon = generate_chinese_lexicon()

    save("lexicon-zh.txt", lexicon)
|
||||
|
||||
|
||||
# Script entry point: generate and save the lexicon when run directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user