Add Kokoro v1.1-zh (#1942)

2025-02-28 15:47:59 +08:00
parent f5dfcf8d2f
commit dfcbc8d40b
20 changed files with 897 additions and 61 deletions
--- a/scripts/kokoro/v1.0/add_meta_data.py
+++ b/scripts/kokoro/v1.0/add_meta_data.py
@@ -2,11 +2,6 @@
 # Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)


-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
 import onnx
 import torch

--- a/scripts/kokoro/v1.0/generate_lexicon_en.py
+++ b/scripts/kokoro/v1.0/generate_lexicon_en.py
@@ -4,19 +4,6 @@
 import json
 from typing import List, Tuple

-from misaki import zh
-from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
-
-user_dict = {
-    "还田": [["huan2"], ["tian2"]],
-    "行长": [["hang2"], ["zhang3"]],
-    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
-}
-
-load_phrases_dict(user_dict)
-
-phrases_dict.phrases_dict.update(**user_dict)
-

 def generate_english_lexicon(kind: str):
    assert kind in ("us", "gb"), kind
@@ -59,28 +46,6 @@ def generate_english_lexicon(kind: str):
    return list(user_defined_lower.items()) + list(lexicon.items())


-def generate_chinese_lexicon():
-    word_dict = pinyin_dict.pinyin_dict
-    phrases = phrases_dict.phrases_dict
-
-    g2p = zh.ZHG2P()
-    lexicon = []
-
-    for key in word_dict:
-        if not (0x4E00 <= key <= 0x9FFF):
-            continue
-        w = chr(key)
-        tokens: str = g2p.word2ipa(w)
-        tokens = tokens.replace(chr(815), "")
-        lexicon.append((w, tokens))
-
-    for key in phrases:
-        tokens: str = g2p.word2ipa(key)
-        tokens = tokens.replace(chr(815), "")
-        lexicon.append((key, tokens))
-    return lexicon
-
-
 def save(filename: str, lexicon: List[Tuple[str, str]]):
    with open(filename, "w", encoding="utf-8") as f:
        for word, phones in lexicon:
@@ -91,11 +56,9 @@ def save(filename: str, lexicon: List[Tuple[str, str]]):
 def main():
    us = generate_english_lexicon("us")
    gb = generate_english_lexicon("gb")
-    zh = generate_chinese_lexicon()

    save("lexicon-us-en.txt", us)
    save("lexicon-gb-en.txt", gb)
-    save("lexicon-zh.txt", zh)


 if __name__ == "__main__":
--- a/scripts/kokoro/v1.0/generate_lexicon_zh.py
+++ b/scripts/kokoro/v1.0/generate_lexicon_zh.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Fangjun Kuang)
+
+from typing import List, Tuple
+
+from misaki import zh
+from pypinyin import load_phrases_dict, phrases_dict, pinyin_dict
+
+user_dict = {
+    "还田": [["huan2"], ["tian2"]],
+    "行长": [["hang2"], ["zhang3"]],
+    "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
+}
+# Pass the phrase overrides above to pypinyin's phrase loader.
+load_phrases_dict(user_dict)
+# Also merge them into the phrase table that generate_chinese_lexicon() iterates.
+phrases_dict.phrases_dict.update(**user_dict)
+
+
+def generate_chinese_lexicon():
+    """Return (word, pronunciation) pairs produced by ``ZHG2P.word2ipa``.
+
+    Single characters in U+4E00..U+9FFF come first, then every phrase.
+    """
+    single_chars = pinyin_dict.pinyin_dict
+    multi_chars = phrases_dict.phrases_dict
+    g2p = zh.ZHG2P()
+    strip = chr(815)  # U+032F; removed from every result
+
+    def to_ipa(text: str) -> str:
+        return g2p.word2ipa(text).replace(strip, "")
+
+    entries = [
+        (chr(code), to_ipa(chr(code)))
+        for code in single_chars
+        if 0x4E00 <= code <= 0x9FFF
+    ]
+    entries.extend((phrase, to_ipa(phrase)) for phrase in multi_chars)
+    return entries
+
+
+def save(filename: str, lexicon: List[Tuple[str, str]]):
+    """Write one "word p1 p2 ..." line per entry to *filename* (UTF-8)."""
+    with open(filename, "w", encoding="utf-8") as out:
+        for entry, pron in lexicon:
+            out.write(f"{entry} {' '.join(pron)}\n")
+
+
+def main():
+    """Generate the Chinese lexicon and write it to lexicon-zh.txt."""
+    lexicon = generate_chinese_lexicon()
+    save("lexicon-zh.txt", lexicon)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/kokoro/v1.0/run.sh
+++ b/scripts/kokoro/v1.0/run.sh
@@ -111,7 +111,11 @@ if [ ! -f ./tokens.txt ]; then
 fi

 if [ ! -f ./lexicon-zh.txt ]; then
-  ./generate_lexicon.py
+  ./generate_lexicon_zh.py
+fi
+
+if [[ ! -f ./lexicon-us-en.txt || ! -f ./lexicon-gb-en.txt ]]; then
+  ./generate_lexicon_en.py
 fi

 if [ ! -f ./voices.bin ]; then
--- a/scripts/kokoro/v1.0/test.py
+++ b/scripts/kokoro/v1.0/test.py
@@ -10,8 +10,6 @@ import jieba
 import numpy as np
 import onnxruntime as ort
 import soundfile as sf
-import torch
-from misaki import zh

 try:
    from piper_phonemize import phonemize_espeak
@@ -114,7 +112,6 @@ class OnnxModel:
    def __call__(self, text: str, voice: str):
        punctuations = ';:,.!?-…()"“”'
        text = text.lower()
-        g2p = zh.ZHG2P()

        tokens = ""