update README

2025-09-10 10:47:02 +08:00
parent 5088f0b50a
commit ff78032400
603 changed files with 21 additions and 23 deletions

View File

@@ -0,0 +1,3 @@
G2PWModel
__pycache__
*.zip

View File

@@ -0,0 +1 @@
from .langsegmenter import LangSegmenter

View File

@@ -0,0 +1,225 @@
import logging
import re
# silence jieba's log output
import jieba
jieba.setLogLevel(logging.CRITICAL)
# relocate fast_langdetect's model cache directory
from pathlib import Path
import fast_langdetect
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
from split_lang import LangSplitter
def full_en(text):
pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
return bool(re.match(pattern, text))
def full_cjk(text):
    # ranges taken from Wikipedia
cjk_ranges = [
(0x4E00, 0x9FFF), # CJK Unified Ideographs
(0x3400, 0x4DB5), # CJK Extension A
(0x20000, 0x2A6DD), # CJK Extension B
(0x2A700, 0x2B73F), # CJK Extension C
(0x2B740, 0x2B81F), # CJK Extension D
(0x2B820, 0x2CEAF), # CJK Extension E
(0x2CEB0, 0x2EBEF), # CJK Extension F
(0x30000, 0x3134A), # CJK Extension G
(0x31350, 0x323AF), # CJK Extension H
        (0x2EBF0, 0x2EE5D),  # CJK Extension I
]
pattern = r'[0-9、-〜。!?.!?… /]+$'
cjk_text = ""
for char in text:
code_point = ord(char)
in_cjk = any(start <= code_point <= end for start, end in cjk_ranges)
if in_cjk or re.match(pattern, char):
cjk_text += char
return cjk_text
def split_jako(tag_lang,item):
if tag_lang == "ja":
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
else:
pattern = r"([\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]+(?:[0-9、-〜。!?.!?… ]+[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]*)*)"
lang_list: list[dict] = []
tag = 0
for match in re.finditer(pattern, item['text']):
if match.start() > tag:
lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
tag = match.end()
lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
if tag < len(item['text']):
lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
return lang_list
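# A minimal sketch of split_jako's output (illustrative input, not from the
# original file): split_jako("ja", {"lang": "zh", "text": "你也喜欢まいご吗"})
# -> [{"lang": "zh", "text": "你也喜欢"},
#     {"lang": "ja", "text": "まいご"},
#     {"lang": "zh", "text": "吗"}]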
def merge_lang(lang_list, item):
if lang_list and item['lang'] == lang_list[-1]['lang']:
lang_list[-1]['text'] += item['text']
else:
lang_list.append(item)
return lang_list
class LangSegmenter():
    # default language map, covering the four languages GSV currently supports
DEFAULT_LANG_MAP = {
"zh": "zh",
"yue": "zh", # 粤语
"wuu": "zh", # 吴语
"zh-cn": "zh",
"zh-tw": "x", # 繁体设置为x
"ko": "ko",
"ja": "ja",
"en": "en",
}
def getTexts(text,default_lang = ""):
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
lang_splitter.merge_across_digit = False
substr = lang_splitter.split_by_lang(text=text)
lang_list: list[dict] = []
have_num = False
for _, item in enumerate(substr):
dict_item = {'lang':item.lang,'text':item.text}
if dict_item['lang'] == 'digit':
if default_lang != "":
dict_item['lang'] = default_lang
else:
have_num = True
lang_list = merge_lang(lang_list,dict_item)
continue
            # handle short English runs misdetected as another language
if full_en(dict_item['text']):
dict_item['lang'] = 'en'
lang_list = merge_lang(lang_list,dict_item)
continue
if default_lang != "":
dict_item['lang'] = default_lang
lang_list = merge_lang(lang_list,dict_item)
continue
else:
                # handle Japanese kana embedded in non-Japanese text (CJK ideographs excluded)
ja_list: list[dict] = []
if dict_item['lang'] != 'ja':
ja_list = split_jako('ja',dict_item)
if not ja_list:
ja_list.append(dict_item)
                # handle Hangul embedded in non-Korean text (CJK ideographs excluded)
ko_list: list[dict] = []
temp_list: list[dict] = []
for _, ko_item in enumerate(ja_list):
if ko_item["lang"] != 'ko':
ko_list = split_jako('ko',ko_item)
if ko_list:
temp_list.extend(ko_list)
else:
temp_list.append(ko_item)
                # no Japanese/Korean embedded in the segment
if len(temp_list) == 1:
                    # unknown language: check whether the text is CJK
if dict_item['lang'] == 'x':
cjk_text = full_cjk(dict_item['text'])
if cjk_text:
dict_item = {'lang':'zh','text':cjk_text}
lang_list = merge_lang(lang_list,dict_item)
else:
lang_list = merge_lang(lang_list,dict_item)
continue
else:
lang_list = merge_lang(lang_list,dict_item)
continue
                # Japanese/Korean embedded in the segment
for _, temp_item in enumerate(temp_list):
                    # unknown language: check whether the text is CJK
if temp_item['lang'] == 'x':
cjk_text = full_cjk(temp_item['text'])
if cjk_text:
lang_list = merge_lang(lang_list,{'lang':'zh','text':cjk_text})
else:
lang_list = merge_lang(lang_list,temp_item)
else:
lang_list = merge_lang(lang_list,temp_item)
        # digit segments were deferred; resolve their language now
if have_num:
temp_list = lang_list
lang_list = []
for i, temp_item in enumerate(temp_list):
if temp_item['lang'] == 'digit':
if default_lang:
temp_item['lang'] = default_lang
elif lang_list and i == len(temp_list) - 1:
temp_item['lang'] = lang_list[-1]['lang']
elif not lang_list and i < len(temp_list) - 1:
temp_item['lang'] = temp_list[1]['lang']
elif lang_list and i < len(temp_list) - 1:
if lang_list[-1]['lang'] == temp_list[i + 1]['lang']:
temp_item['lang'] = lang_list[-1]['lang']
                        elif lang_list[-1]['text'][-1] in [",", ".", "!", "?", ",", "。", "!", "?"]:
                            temp_item['lang'] = temp_list[i + 1]['lang']
                        elif temp_list[i + 1]['text'][0] in [",", ".", "!", "?", ",", "。", "!", "?"]:
                            temp_item['lang'] = lang_list[-1]['lang']
                        elif temp_item['text'][-1] in ["。", "."]:
temp_item['lang'] = lang_list[-1]['lang']
elif len(lang_list[-1]['text']) >= len(temp_list[i + 1]['text']):
temp_item['lang'] = lang_list[-1]['lang']
else:
temp_item['lang'] = temp_list[i + 1]['lang']
else:
temp_item['lang'] = 'zh'
lang_list = merge_lang(lang_list,temp_item)
        # resolve remaining "x" (unknown) segments
temp_list = lang_list
lang_list = []
for _, temp_item in enumerate(temp_list):
if temp_item['lang'] == 'x':
if lang_list:
temp_item['lang'] = lang_list[-1]['lang']
elif len(temp_list) > 1:
temp_item['lang'] = temp_list[1]['lang']
else:
temp_item['lang'] = 'zh'
lang_list = merge_lang(lang_list,temp_item)
return lang_list
if __name__ == "__main__":
text = "MyGO?,你也喜欢まいご吗?"
print(LangSegmenter.getTexts(text))
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
print(LangSegmenter.getTexts(text))
text = "当时ThinkPad T60刚刚发布一同推出的还有一款名为Advanced Dock的扩展坞配件。这款扩展坞通过连接T60底部的插槽扩展出包括PCIe在内的一大堆接口并且自带电源让T60可以安装桌面显卡来提升性能。"
print(LangSegmenter.getTexts(text,"zh"))
print(LangSegmenter.getTexts(text))
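A hedged usage sketch for the segmenter: the exact split points depend on fast_langdetect's predictions, so only the shape of the result is asserted here.

segments = LangSegmenter.getTexts("MyGO?,你也喜欢まいご吗?")
for seg in segments:
    # each item is a {"lang": ..., "text": ...} dict limited to GSV's languages
    assert seg["lang"] in {"zh", "ja", "ko", "en"}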

View File

@@ -0,0 +1,28 @@
import os
# if os.environ.get("version","v1")=="v1":
# from text.symbols import symbols
# else:
# from text.symbols2 import symbols
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
def cleaned_text_to_sequence(cleaned_text, version=None):
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
Returns:
List of integers corresponding to the symbols in the text
"""
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
else:
phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
return phones
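A small usage sketch; the phoneme list is hypothetical and assumes these symbols exist in the v2 symbol set (real inputs come from clean_text).

ids = cleaned_text_to_sequence(["AH0", "S"], version="v2")  # hypothetical phones
assert all(isinstance(i, int) for i in ids)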

View File

@@ -0,0 +1,222 @@
# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py
import re
import cn2an
import ToJyutping
from text.symbols import punctuation
from text.zh_normalization.text_normlization import TextNormalizer
normalizer = lambda x: cn2an.transform(x, "an2cn")
# NOTE: despite its name, this list mixes initials and finals; it is used for
# longest-prefix matching when splitting a jyutping syllable below.
INITIALS = [
"aa",
"aai",
"aak",
"aap",
"aat",
"aau",
"ai",
"au",
"ap",
"at",
"ak",
"a",
"p",
"b",
"e",
"ts",
"t",
"dz",
"d",
"kw",
"k",
"gw",
"g",
"f",
"h",
"l",
"m",
"ng",
"n",
"s",
"y",
"w",
"c",
"z",
"j",
"ong",
"on",
"ou",
"oi",
"ok",
"o",
"uk",
"ung",
]
INITIALS += ["sp", "spl", "spn", "sil"]
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}
def replace_punctuation(text):
# text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def text_normalize(text):
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation(sentence)
return dest_text
punctuation_set = set(punctuation)
def jyuping_to_initials_finals_tones(jyuping_syllables):
initials_finals = []
tones = []
word2ph = []
for syllable in jyuping_syllables:
if syllable in punctuation:
initials_finals.append(syllable)
tones.append(0)
word2ph.append(1) # Add 1 for punctuation
elif syllable == "_":
initials_finals.append(syllable)
tones.append(0)
word2ph.append(1) # Add 1 for underscore
else:
try:
tone = int(syllable[-1])
syllable_without_tone = syllable[:-1]
except ValueError:
tone = 0
syllable_without_tone = syllable
for initial in INITIALS:
if syllable_without_tone.startswith(initial):
if syllable_without_tone.startswith("nga"):
initials_finals.extend(
[
syllable_without_tone[:2],
syllable_without_tone[2:] or syllable_without_tone[-1],
]
)
# tones.extend([tone, tone])
tones.extend([-1, tone])
word2ph.append(2)
else:
final = syllable_without_tone[len(initial) :] or initial[-1]
initials_finals.extend([initial, final])
# tones.extend([tone, tone])
tones.extend([-1, tone])
word2ph.append(2)
break
assert len(initials_finals) == len(tones)
    ### modified scheme: consonant + tone-carrying vowel
phones = []
for a, b in zip(initials_finals, tones):
        if b not in [-1, 0]:  ### prefix "Y" so Cantonese phones don't collide with Mandarin ones; punctuation is left as-is.
todo = "%s%s" % (a, b)
else:
todo = a
if todo not in punctuation_set:
todo = "Y%s" % todo
phones.append(todo)
# return initials_finals, tones, word2ph
return phones, word2ph
def get_jyutping(text):
jyutping_array = []
punct_pattern = re.compile(r"^[{}]+$".format(re.escape("".join(punctuation))))
syllables = ToJyutping.get_jyutping_list(text)
for word, syllable in syllables:
if punct_pattern.match(word):
puncts = re.split(r"([{}])".format(re.escape("".join(punctuation))), word)
for punct in puncts:
if len(punct) > 0:
jyutping_array.append(punct)
else:
            # match multiple jyutping, e.g. "liu4 ge3", or a single jyutping, e.g. "liu4"
if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", syllable):
raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
jyutping_array.append(syllable)
return jyutping_array
def get_bert_feature(text, word2ph):
from text import chinese_bert
return chinese_bert.get_bert_feature(text, word2ph)
def g2p(text):
# word2ph = []
jyuping = get_jyutping(text)
# print(jyuping)
# phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping)
phones, word2ph = jyuping_to_initials_finals_tones(jyuping)
# phones = ["_"] + phones + ["_"]
# tones = [0] + tones + [0]
# word2ph = [1] + word2ph + [1]
return phones, word2ph
if __name__ == "__main__":
# text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
text = "佢個鋤頭太短啦。"
text = text_normalize(text)
# phones, tones, word2ph = g2p(text)
phones, word2ph = g2p(text)
# print(phones, tones, word2ph)
print(phones, word2ph)
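To make the Y-prefix scheme concrete, a hand-worked sketch (assuming ToJyutping reads 佢 as "keoi5"): the syllable splits into initial "k" and final "eoi", the tone rides on the final, and both halves get the "Y" prefix.

phones, word2ph = g2p(text_normalize("佢"))
# expected shape under the assumption above: (["Yk", "Yeoi5"], [2])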

View File

@@ -0,0 +1,194 @@
import os
import re
import cn2an
from pypinyin import lazy_pinyin, Style
from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer
normalizer = lambda x: cn2an.transform(x, "an2cn")
current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {
line.split("\t")[0]: line.strip().split("\t")[1]
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
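# Assumed format of opencpop-strict.txt (tab-separated): a line such as
# "wu\tw u" would yield pinyin_to_symbol_map["wu"] == "w u", which is later
# split on the space into an (initial, final) pair. Illustrative only.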
import jieba_fast
import logging
jieba_fast.setLogLevel(logging.CRITICAL)
import jieba_fast.posseg as psg
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "~": "…",
}
tone_modifier = ToneSandhi()
def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def replace_punctuation_with_en(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def replace_consecutive_punctuation(text):
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
def g2p(text):
pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
phones, word2ph = _g2p(sentences)
return phones, word2ph
def _get_initials_finals(word):
initials = []
finals = []
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for c, v in zip(orig_initials, orig_finals):
initials.append(c)
finals.append(v)
return initials, finals
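# Sketch, assuming standard pypinyin behaviour:
#   _get_initials_finals("你好") -> (["n", "h"], ["i3", "ao3"])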
def _g2p(segments):
phones_list = []
word2ph = []
for seg in segments:
pinyins = []
        # Strip all English words from the sentence
seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg)
initials = []
finals = []
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
for word, pos in seg_cut:
if pos == "eng":
continue
sub_initials, sub_finals = _get_initials_finals(word)
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
initials = sum(initials, [])
finals = sum(finals, [])
#
for c, v in zip(initials, finals):
raw_pinyin = c + v
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c == v:
assert c in punctuation
phone = [c]
word2ph.append(1)
else:
v_without_tone = v[:-1]
tone = v[-1]
pinyin = c + v_without_tone
assert tone in "12345"
if c:
                    # syllable with an initial
v_rep_map = {
"uei": "ui",
"iou": "iu",
"uen": "un",
}
if v_without_tone in v_rep_map.keys():
pinyin = c + v_rep_map[v_without_tone]
else:
                    # zero-initial (stand-alone) syllable
pinyin_rep_map = {
"ing": "ying",
"i": "yi",
"in": "yin",
"u": "wu",
}
if pinyin in pinyin_rep_map.keys():
pinyin = pinyin_rep_map[pinyin]
else:
single_rep_map = {
"v": "yu",
"e": "e",
"i": "y",
"u": "w",
}
if pinyin[0] in single_rep_map.keys():
pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
new_v = new_v + tone
phone = [new_c, new_v]
word2ph.append(len(phone))
phones_list += phone
return phones_list, word2ph
def text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation(sentence)
    # collapse repeated punctuation to avoid reference-audio leakage
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?"
text = "你好"
text = text_normalize(text)
print(g2p(text))
    # # Example usage
    # text = "这是一个示例文本:,你好!这是一个测试..."
    # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试

View File

@@ -0,0 +1,339 @@
import os
import re
import cn2an
from pypinyin import lazy_pinyin, Style
from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials
from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer
normalizer = lambda x: cn2an.transform(x, "an2cn")
current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {
line.split("\t")[0]: line.strip().split("\t")[1]
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
import jieba_fast
import logging
jieba_fast.setLogLevel(logging.CRITICAL)
import jieba_fast.posseg as psg
# is_g2pw_str = os.environ.get("is_g2pw", "True")  ## enabled by default
# is_g2pw = False  # True if is_g2pw_str.lower() == 'true' else False
is_g2pw = True  # True if is_g2pw_str.lower() == 'true' else False
if is_g2pw:
    # print("using g2pw for pinyin inference")
from text.g2pw import G2PWPinyin, correct_pronunciation
parent_directory = os.path.dirname(current_file_path)
g2pw = G2PWPinyin(
model_dir="GPT_SoVITS/text/G2PWModel",
model_source=os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),
v_to_u=False,
neutral_tone_with_five=True,
)
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "~": "…",
}
tone_modifier = ToneSandhi()
def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def g2p(text):
pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
phones, word2ph = _g2p(sentences)
return phones, word2ph
def _get_initials_finals(word):
initials = []
finals = []
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for c, v in zip(orig_initials, orig_finals):
initials.append(c)
finals.append(v)
return initials, finals
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"}
not_erhua = {
"虐儿",
"为儿",
"护儿",
"瞒儿",
"救儿",
"替儿",
"有儿",
"一儿",
"我儿",
"俺儿",
"妻儿",
"拐儿",
"聋儿",
"乞儿",
"患儿",
"幼儿",
"孤儿",
"婴儿",
"婴幼儿",
"连体儿",
"脑瘫儿",
"流浪儿",
"体弱儿",
"混血儿",
"蜜雪儿",
"舫儿",
"祖儿",
"美儿",
"应采儿",
"可儿",
"侄儿",
"孙儿",
"侄孙儿",
"女儿",
"男儿",
"红孩儿",
"花儿",
"虫儿",
"马儿",
"鸟儿",
"猪儿",
"猫儿",
"狗儿",
"少儿",
}
def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> list[list[str]]:
"""
Do erhub.
"""
# fix er1
for i, phn in enumerate(finals):
        if i == len(finals) - 1 and word[i] == "儿" and phn == "er1":
finals[i] = "er2"
    # decide whether this word is read with erhua at all
if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
return initials, finals
# "……" 等情况直接返回
if len(finals) != len(word):
return initials, finals
assert len(finals) == len(word)
# 与前一个字发同音
new_initials = []
new_finals = []
for i, phn in enumerate(finals):
if (
i == len(finals) - 1
            and word[i] == "儿"
and phn in {"er2", "er5"}
and word[-2:] not in not_erhua
and new_finals
):
phn = "er" + new_finals[-1][-1]
new_initials.append(initials[i])
new_finals.append(phn)
return new_initials, new_finals
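# Hand-worked sketch: for the must_erhua word "范儿" with initials ["f", ""]
# and finals ["an4", "er2"], the trailing 儿 inherits the preceding tone:
#   _merge_erhua(["f", ""], ["an4", "er2"], "范儿", "n") -> (["f", ""], ["an4", "er4"])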
def _g2p(segments):
phones_list = []
word2ph = []
for seg in segments:
pinyins = []
        # Strip all English words from the sentence
seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg)
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
initials = []
finals = []
if not is_g2pw:
for word, pos in seg_cut:
if pos == "eng":
continue
sub_initials, sub_finals = _get_initials_finals(word)
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
                # erhua merging
sub_initials, sub_finals = _merge_erhua(sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
initials = sum(initials, [])
finals = sum(finals, [])
print("pypinyin结果", initials, finals)
else:
            # g2pw infers over the whole sentence at once
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
sub_finals = []
now_word_length = pre_word_length + len(word)
if pos == "eng":
pre_word_length = now_word_length
continue
word_pinyins = pinyins[pre_word_length:now_word_length]
                # heteronym disambiguation
word_pinyins = correct_pronunciation(word, word_pinyins)
for pinyin in word_pinyins:
if pinyin[0].isalpha():
sub_initials.append(to_initials(pinyin))
sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True))
else:
sub_initials.append(pinyin)
sub_finals.append(pinyin)
pre_word_length = now_word_length
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
                # erhua merging
sub_initials, sub_finals = _merge_erhua(sub_initials, sub_finals, word, pos)
initials.append(sub_initials)
finals.append(sub_finals)
initials = sum(initials, [])
finals = sum(finals, [])
# print("g2pw结果",initials,finals)
for c, v in zip(initials, finals):
raw_pinyin = c + v
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c == v:
assert c in punctuation
phone = [c]
word2ph.append(1)
else:
v_without_tone = v[:-1]
tone = v[-1]
pinyin = c + v_without_tone
assert tone in "12345"
if c:
                    # syllable with an initial
v_rep_map = {
"uei": "ui",
"iou": "iu",
"uen": "un",
}
if v_without_tone in v_rep_map.keys():
pinyin = c + v_rep_map[v_without_tone]
else:
                    # zero-initial (stand-alone) syllable
pinyin_rep_map = {
"ing": "ying",
"i": "yi",
"in": "yin",
"u": "wu",
}
if pinyin in pinyin_rep_map.keys():
pinyin = pinyin_rep_map[pinyin]
else:
single_rep_map = {
"v": "yu",
"e": "e",
"i": "y",
"u": "w",
}
if pinyin[0] in single_rep_map.keys():
pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
new_v = new_v + tone
phone = [new_c, new_v]
word2ph.append(len(phone))
phones_list += phone
return phones_list, word2ph
def replace_punctuation_with_en(text):
text = text.replace("", "").replace("", "")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def replace_consecutive_punctuation(text):
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
def text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation(sentence)
    # collapse repeated punctuation to avoid reference-audio leakage
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?"
text = "你好"
text = text_normalize(text)
print(g2p(text))
    # # Example usage
    # text = "这是一个示例文本:,你好!这是一个测试..."
    # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试

View File

@@ -0,0 +1,94 @@
from text import cleaned_text_to_sequence
import os
# if os.environ.get("version","v1")=="v1":
# from text import chinese
# from text.symbols import symbols
# else:
# from text import chinese2 as chinese
# from text.symbols2 import symbols
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
special = [
# ("%", "zh", "SP"),
("", "zh", "SP2"),
("^", "zh", "SP3"),
# ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
]
def clean_text(text, language, version=None):
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
symbols = symbols_v1.symbols
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
else:
symbols = symbols_v2.symbols
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
if language not in language_module_map:
language = "en"
text = " "
for special_s, special_l, target_symbol in special:
if special_s in text and language == special_l:
return clean_special(text, language, special_s, target_symbol, version)
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
if hasattr(language_module, "text_normalize"):
norm_text = language_module.text_normalize(text)
else:
norm_text = text
    if language == "zh" or language == "yue":
phones, word2ph = language_module.g2p(norm_text)
assert len(phones) == sum(word2ph)
assert len(norm_text) == len(word2ph)
elif language == "en":
phones = language_module.g2p(norm_text)
if len(phones) < 4:
phones = [","] + phones
word2ph = None
else:
phones = language_module.g2p(norm_text)
word2ph = None
phones = ["UNK" if ph not in symbols else ph for ph in phones]
return phones, word2ph, norm_text
def clean_special(text, language, special_s, target_symbol, version=None):
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
symbols = symbols_v1.symbols
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
else:
symbols = symbols_v2.symbols
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
"""
特殊静音段sp符号处理
"""
text = text.replace(special_s, ",")
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
norm_text = language_module.text_normalize(text)
phones = language_module.g2p(norm_text)
new_ph = []
for ph in phones[0]:
assert ph in symbols
if ph == ",":
new_ph.append(target_symbol)
else:
new_ph.append(ph)
return new_ph, phones[1], norm_text
def text_to_sequence(text, language, version=None):
    version = os.environ.get("version", version)
    if version is None:
        version = "v2"
    phones, word2ph, norm_text = clean_text(text, language, version)
    return cleaned_text_to_sequence(phones, version)
if __name__ == "__main__":
print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
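A sketch of clean_text's return contract, restating the invariants asserted inside it (the exact phones depend on the installed dictionaries and models):

phones, word2ph, norm_text = clean_text("你好", "zh")
assert len(phones) == sum(word2ph) and len(word2ph) == len(norm_text)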

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,283 @@
# by https://github.com/Cosmo-klara
from __future__ import print_function
import re
import inflect
import unicodedata
# replacement table for trailing measurement units
measurement_map = {
"m": ["meter", "meters"],
"km": ["kilometer", "kilometers"],
"km/h": ["kilometer per hour", "kilometers per hour"],
"ft": ["feet", "feet"],
"L": ["liter", "liters"],
"tbsp": ["tablespoon", "tablespoons"],
"tsp": ["teaspoon", "teaspoons"],
"h": ["hour", "hours"],
"min": ["minute", "minutes"],
"s": ["second", "seconds"],
"°C": ["degree celsius", "degrees celsius"],
"°F": ["degree fahrenheit", "degrees fahrenheit"],
}
_inflect = inflect.engine()
# convert list-style numeric ordinals ("1. ")
_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")
# recognize comma-grouped numbers like "12,000"
# (reportedly \d works a little better than [0-9] for digit matching)
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
# time recognition (HH:MM)
_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")
# trailing measurement-unit recognition
_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b")
# recognize £ before or after the number (a single either-side pattern was attempted but failed for unknown reasons)
_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")
# recognize $ before or after the number
_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$")
# decimal recognition
_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")
# fraction recognition (of the form "3/4")
_fraction_re = re.compile(r"([0-9]+/[0-9]+)")
# ordinal recognition
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
# plain-number handling
_number_re = re.compile(r"[0-9]+")
def _convert_ordinal(m):
"""
标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
Examples:
input: "1. "
output: "1st"
然后在后面的 _expand_ordinal, 将其转化为 first 这类的
"""
ordinal = _inflect.ordinal(m.group(1))
return ordinal + ", "
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_time(m):
"""
将 24 小时制的时间转换为 12 小时制的时间表示方式。
Examples:
input: "13:00 / 4:00 / 13:30"
output: "one o'clock p.m. / four o'clock am. / one thirty p.m."
"""
hours, minutes = map(int, m.group(1, 2))
period = "a.m." if hours < 12 else "p.m."
if hours > 12:
hours -= 12
hour_word = _inflect.number_to_words(hours)
minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ""
if minutes == 0:
return f"{hour_word} o'clock {period}"
else:
return f"{hour_word} {minute_word} {period}"
def _expand_measurement(m):
"""
处理一些常见的测量单位后缀, 目前支持: m, km, km/h, ft, L, tbsp, tsp, h, min, s, °C, °F
如果要拓展的话修改: _measurement_re 和 measurement_map
"""
sign = m.group(3)
ptr = 1
    # no convenient way to pull out the number without reworking the regex; values like 1.2 read as plurals anyway, so just drop the "."
num = int(m.group(1).replace(sign, "").replace(".", ""))
decimal_part = m.group(2)
    # patches the check above: cases like 0.1 are excluded here
if decimal_part == None and num == 1:
ptr = 0
return m.group(1).replace(sign, " " + measurement_map[sign][ptr])
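# Worked examples of the rules above:
#   "20h"   -> "20 hours"        (plural)
#   "1L"    -> "1 liter"         (singular: no decimal part and num == 1)
#   "0.1km" -> "0.1 kilometers"  (a decimal part forces the plural reading)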
def _expand_pounds(m):
"""
没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起
"""
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " pounds" # Unexpected format
pounds = int(parts[0]) if parts[0] else 0
pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
if pounds and pence:
pound_unit = "pound" if pounds == 1 else "pounds"
penny_unit = "penny" if pence == 1 else "pence"
return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit)
elif pounds:
pound_unit = "pound" if pounds == 1 else "pounds"
return "%s %s" % (pounds, pound_unit)
elif pence:
penny_unit = "penny" if pence == 1 else "pence"
return "%s %s" % (pence, penny_unit)
else:
return "zero pounds"
def _expand_dollars(m):
"""
change: 美分是 100 的限值, 应该要做补零的吧
Example:
input: "32.3$ / $6.24"
output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
"""
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
# decimal handling
def _expand_decimal_number(m):
"""
Example:
input: "13.234"
output: "thirteen point two three four"
"""
match = m.group(1)
parts = match.split(".")
words = []
    # walk the fractional part one character at a time
for char in parts[1]:
if char == ".":
words.append("point")
else:
words.append(char)
return parts[0] + " point " + " ".join(words)
# fraction handling
def _expend_fraction(m):
"""
规则1: 分子使用基数词读法, 分母用序数词读法.
规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法.
规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves.
Examples:
| Written | Said |
|:---:|:---:|
| 1/3 | one third |
| 3/4 | three fourths |
| 5/6 | five sixths |
| 1/2 | one half |
| 3/2 | three halves |
"""
match = m.group(0)
numerator, denominator = map(int, match.split("/"))
numerator_part = _inflect.number_to_words(numerator)
if denominator == 2:
if numerator == 1:
denominator_part = "half"
else:
denominator_part = "halves"
elif denominator == 1:
return f"{numerator_part}"
else:
denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
if numerator > 1:
denominator_part += "s"
return f"{numerator_part} {denominator_part}"
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
else:
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
else:
return _inflect.number_to_words(num, andword="")
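# Worked examples of the branches above:
#   2000 -> "two thousand"
#   2005 -> "two thousand five"
#   1900 -> "nineteen hundred"
#   2024 -> "twenty twenty-four"  (group-of-two reading)
#   512  -> "five hundred twelve"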
def normalize(text):
"""
!!! 所有的处理都需要正确的输入 !!!
可以添加新的处理,只需要添加正则表达式和对应的处理函数即可
"""
text = re.sub(_ordinal_number_re, _convert_ordinal, text)
text = re.sub(r"(?<!\d)-|-(?!\d)", " minus ", text)
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_time_re, _expand_time, text)
text = re.sub(_measurement_re, _expand_measurement, text)
text = re.sub(_pounds_re_start, _expand_pounds, text)
text = re.sub(_pounds_re_end, _expand_pounds, text)
text = re.sub(_dollars_re_start, _expand_dollars, text)
text = re.sub(_dollars_re_end, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_number, text)
text = re.sub(_fraction_re, _expend_fraction, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
text = "".join(
char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn"
) # Strip accents
text = re.sub("%", " percent", text)
text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
text = re.sub(r"(?i)i\.e\.", "that is", text)
text = re.sub(r"(?i)e\.g\.", "for example", text)
    # split runs of consecutive uppercase letters so they are read letter by letter
text = re.sub(r"(?<!^)(?<![\s])([A-Z])", r" \1", text)
return text
if __name__ == "__main__":
    # The split result could be shown to users (read-only or editable) without affecting the text actually sent to TTS,
    # letting them confirm it and catch non-standard input before synthesis.
print(normalize("1. test ordinal number 1st"))
print(normalize("32.3$, $6.24, 1.1£, £7.14."))
print(normalize("3/23, 1/2, 3/2, 1/3, 6/1"))
print(normalize("1st, 22nd"))
print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
print(normalize("a test of time 4:00, 13:00, 13:30"))
print(normalize("a test of temperature 4°F, 23°C, -19°C"))

View File

@@ -0,0 +1,3 @@
CHATGPT CH AE1 T JH IY1 P IY1 T IY1
JSON JH EY1 S AH0 N
CONDA K AA1 N D AH0
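Each line above is a word followed by its ARPAbet phones, space-separated; hot_reload_hot in english.py consumes it roughly like this (a sketch of the existing parsing, not new behaviour):

line = "CHATGPT CH AE1 T JH IY1 P IY1 T IY1"
word_split = line.split(" ")
g2p_dict[word_split[0].lower()] = [word_split[1:]]  # {'chatgpt': [['CH', 'AE1', ...]]}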

View File

@@ -0,0 +1,374 @@
import pickle
import os
import re
import wordsegment
from g2p_en import G2p
from text.symbols import punctuation
from text.symbols2 import symbols
from builtins import str as unicode
from text.en_normalization.expend import normalize
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize
from nltk import pos_tag
current_file_path = os.path.dirname(__file__)
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
CMU_DICT_FAST_PATH = os.path.join(current_file_path, "cmudict-fast.rep")
CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
# adapt Chinese punctuation and g2p_en punctuation
rep_map = {
    "[;::,;]": ",",
    '["’]': "'",
    "。": ".",
    "!": "!",
    "?": "?",
}
arpa = {
"AH0",
"S",
"AH1",
"EY2",
"AE2",
"EH0",
"OW2",
"UH0",
"NG",
"B",
"G",
"AY0",
"M",
"AA0",
"F",
"AO0",
"ER2",
"UH1",
"IY1",
"AH2",
"DH",
"IY0",
"EY1",
"IH0",
"K",
"N",
"W",
"IY2",
"T",
"AA1",
"ER1",
"EH2",
"OY0",
"UH2",
"UW1",
"Z",
"AW2",
"AW1",
"V",
"UW2",
"AA2",
"ER",
"AW0",
"UW0",
"R",
"OW1",
"EH1",
"ZH",
"AE0",
"IH2",
"IH",
"Y",
"JH",
"P",
"AY1",
"EY0",
"OY2",
"TH",
"HH",
"D",
"ER0",
"CH",
"AO1",
"AE1",
"AO2",
"OY1",
"AY2",
"IH1",
"OW0",
"L",
"SH",
}
def replace_phs(phs):
rep_map = {"'": "-"}
phs_new = []
for ph in phs:
if ph in symbols:
phs_new.append(ph)
elif ph in rep_map.keys():
phs_new.append(rep_map[ph])
else:
print("ph not in symbols: ", ph)
return phs_new
def replace_consecutive_punctuation(text):
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}\s])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
def read_dict():
g2p_dict = {}
start_line = 49
with open(CMU_DICT_PATH) as f:
line = f.readline()
line_index = 1
while line:
if line_index >= start_line:
line = line.strip()
word_split = line.split(" ")
word = word_split[0].lower()
syllable_split = word_split[1].split(" - ")
g2p_dict[word] = []
for syllable in syllable_split:
phone_split = syllable.split(" ")
g2p_dict[word].append(phone_split)
line_index = line_index + 1
line = f.readline()
return g2p_dict
def read_dict_new():
g2p_dict = {}
with open(CMU_DICT_PATH) as f:
line = f.readline()
line_index = 1
while line:
if line_index >= 57:
line = line.strip()
word_split = line.split(" ")
word = word_split[0].lower()
g2p_dict[word] = [word_split[1].split(" ")]
line_index = line_index + 1
line = f.readline()
with open(CMU_DICT_FAST_PATH) as f:
line = f.readline()
line_index = 1
while line:
if line_index >= 0:
line = line.strip()
word_split = line.split(" ")
word = word_split[0].lower()
if word not in g2p_dict:
g2p_dict[word] = [word_split[1:]]
line_index = line_index + 1
line = f.readline()
return g2p_dict
def hot_reload_hot(g2p_dict):
with open(CMU_DICT_HOT_PATH) as f:
line = f.readline()
line_index = 1
while line:
if line_index >= 0:
line = line.strip()
word_split = line.split(" ")
word = word_split[0].lower()
# 自定义发音词直接覆盖字典
g2p_dict[word] = [word_split[1:]]
line_index = line_index + 1
line = f.readline()
return g2p_dict
def cache_dict(g2p_dict, file_path):
with open(file_path, "wb") as pickle_file:
pickle.dump(g2p_dict, pickle_file)
def get_dict():
if os.path.exists(CACHE_PATH):
with open(CACHE_PATH, "rb") as pickle_file:
g2p_dict = pickle.load(pickle_file)
else:
g2p_dict = read_dict_new()
cache_dict(g2p_dict, CACHE_PATH)
g2p_dict = hot_reload_hot(g2p_dict)
return g2p_dict
def get_namedict():
if os.path.exists(NAMECACHE_PATH):
with open(NAMECACHE_PATH, "rb") as pickle_file:
name_dict = pickle.load(pickle_file)
else:
name_dict = {}
return name_dict
def text_normalize(text):
# todo: eng text normalize
    # same effect as chinese.py; kept consistent with it
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
text = pattern.sub(lambda x: rep_map[x.group()], text)
text = unicode(text)
text = normalize(text)
    # collapse repeated punctuation to avoid reference-audio leakage
text = replace_consecutive_punctuation(text)
return text
class en_G2p(G2p):
def __init__(self):
super().__init__()
        # initialize the word segmenter
wordsegment.load()
        # extend the outdated dictionary and add the name dictionary
self.cmu = get_dict()
self.namedict = get_namedict()
        # drop a few abbreviations whose dictionary pronunciations are wrong
for word in ["AE", "AI", "AR", "IOS", "HUD", "OS"]:
del self.cmu[word.lower()]
        # fix heteronyms
self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
self.homograph2features["complex"] = (
["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
"JJ",
)
def __call__(self, text):
# tokenization
words = word_tokenize(text)
tokens = pos_tag(words) # tuples of (word, tag)
# steps
prons = []
for o_word, pos in tokens:
            # replicate g2p_en's lowercasing logic
word = o_word.lower()
if re.search("[a-z]", word) is None:
pron = [word]
            # handle single letters first
elif len(word) == 1:
                # fix the pronunciation of a standalone "A"; the original-case o_word is needed to detect uppercase
if o_word == "A":
pron = ["EY1"]
else:
pron = self.cmu[word][0]
            # g2p_en's original homograph handling
elif word in self.homograph2features: # Check homograph
pron1, pron2, pos1 = self.homograph2features[word]
if pos.startswith(pos1):
pron = pron1
                # pos1 being longer than pos only happens for "read"
elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
pron = pron1
else:
pron = pron2
else:
                # recursive lookup with fallback prediction
pron = self.qryword(o_word)
prons.extend(pron)
prons.extend([" "])
return prons[:-1]
def qryword(self, o_word):
word = o_word.lower()
        # dictionary lookup, single letters excepted
if len(word) > 1 and word in self.cmu: # lookup CMU dict
return self.cmu[word][0]
        # only Title-case words are looked up in the name dictionary
if o_word.istitle() and word in self.namedict:
return self.namedict[word][0]
        # OOV words of length <= 3 are simply spelled out
if len(word) <= 3:
phones = []
for w in word:
                # fix the standalone "a" pronunciation; uppercase cannot occur here
if w == "a":
phones.extend(["EY1"])
elif not w.isalpha():
phones.extend([w])
else:
phones.extend(self.cmu[w][0])
return phones
        # try splitting off a possessive 's
if re.match(r"^([a-z]+)('s)$", word):
phones = self.qryword(word[:-2])[:]
            # after the voiceless consonants P T K F TH HH, 's is pronounced ['S']
if phones[-1] in ["P", "T", "K", "F", "TH", "HH"]:
phones.extend(["S"])
            # after the sibilants S Z SH ZH CH JH, 's is pronounced ['IH1', 'Z'] or ['AH0', 'Z']
elif phones[-1] in ["S", "Z", "SH", "ZH", "CH", "JH"]:
phones.extend(["AH0", "Z"])
            # after the voiced consonants B D G DH V M N NG L R W Y, 's is pronounced ['Z']
            # likewise after the vowels AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
            # ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2, 's is pronounced ['Z']
else:
phones.extend(["Z"])
return phones
        # try word segmentation to handle compounds
comps = wordsegment.segment(word.lower())
        # unsegmentable words fall back to model prediction
if len(comps) == 1:
return self.predict(word)
        # segmentable words are handled recursively, component by component
return [phone for comp in comps for phone in self.qryword(comp)]
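# Fallback ladder implemented by qryword above (sketch): CMU dict -> name dict
# for Title-case words -> spell out OOVs of length <= 3 -> possessive split
# (e.g. "cat's" -> qryword("cat") + ["S"], since "T" is voiceless) ->
# wordsegment compound split -> neural prediction as the last resort.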
_g2p = en_G2p()
def g2p(text):
    # run g2p_en over the whole passage and drop symbols that are not valid ARPA
phone_list = _g2p(text)
phones = [ph if ph != "<unk>" else "UNK" for ph in phone_list if ph not in [" ", "<pad>", "UW", "</s>", "<s>"]]
return replace_phs(phones)
if __name__ == "__main__":
print(g2p("hello"))
print(g2p(text_normalize("e.g. I used openai's AI tool to draw a picture.")))
print(g2p(text_normalize("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder.")))

View File

@@ -0,0 +1 @@
from text.g2pw.g2pw import *

View File

@@ -0,0 +1,160 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Tuple
import numpy as np
from .utils import tokenize_and_map
ANCHOR_CHAR = "▁"
def prepare_onnx_input(
tokenizer,
labels: List[str],
char2phonemes: Dict[str, List[int]],
chars: List[str],
texts: List[str],
query_ids: List[int],
use_mask: bool = False,
window_size: int = None,
max_len: int = 512,
) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(
window_size=window_size, texts=texts, query_ids=query_ids
)
input_ids = []
token_type_ids = []
attention_masks = []
phoneme_masks = []
char_ids = []
position_ids = []
for idx in range(len(texts)):
text = (truncated_texts if window_size else texts)[idx].lower()
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
)
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
query_char = text[query_id]
phoneme_mask = (
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
)
char_id = chars.index(query_char)
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
input_ids.append(input_id)
token_type_ids.append(token_type_id)
attention_masks.append(attention_mask)
phoneme_masks.append(phoneme_mask)
char_ids.append(char_id)
position_ids.append(position_id)
outputs = {
"input_ids": np.array(input_ids).astype(np.int64),
"token_type_ids": np.array(token_type_ids).astype(np.int64),
"attention_masks": np.array(attention_masks).astype(np.int64),
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
"char_ids": np.array(char_ids).astype(np.int64),
"position_ids": np.array(position_ids).astype(np.int64),
}
return outputs
def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]:
truncated_texts = []
truncated_query_ids = []
for text, query_id in zip(texts, query_ids):
start = max(0, query_id - window_size // 2)
end = min(len(text), query_id + window_size // 2)
truncated_text = text[start:end]
truncated_texts.append(truncated_text)
truncated_query_id = query_id - start
truncated_query_ids.append(truncated_query_id)
return truncated_texts, truncated_query_ids
def _truncate(
max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]]
):
truncate_len = max_len - 2
if len(tokens) <= truncate_len:
return (text, query_id, tokens, text2token, token2text)
token_position = text2token[query_id]
token_start = token_position - truncate_len // 2
token_end = token_start + truncate_len
font_exceed_dist = -token_start
back_exceed_dist = token_end - len(tokens)
if font_exceed_dist > 0:
token_start += font_exceed_dist
token_end += font_exceed_dist
elif back_exceed_dist > 0:
token_start -= back_exceed_dist
token_end -= back_exceed_dist
start = token2text[token_start][0]
end = token2text[token_end - 1][1]
return (
text[start:end],
query_id - start,
tokens[token_start:token_end],
[i - token_start if i is not None else None for i in text2token[start:end]],
[(s - start, e - start) for s, e in token2text[token_start:token_end]],
)
def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
if char not in char2phonemes:
char2phonemes[char] = []
char2phonemes[char].append(labels.index(phoneme))
return labels, char2phonemes
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
if char not in char2phonemes:
char2phonemes[char] = []
char2phonemes[char].append(labels.index(f"{char} {phoneme}"))
return labels, char2phonemes
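A toy sketch of the label helpers with a hypothetical two-reading input:

labels, char2phonemes = get_phoneme_labels([["行", "hang2"], ["行", "xing2"]])
assert labels == ["hang2", "xing2"] and char2phonemes == {"行": [0, 1]}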

View File

@@ -0,0 +1,159 @@
# This code is modified from https://github.com/mozillazg/pypinyin-g2pW
import pickle
import os
from pypinyin.constants import RE_HANS
from pypinyin.core import Pinyin, Style
from pypinyin.seg.simpleseg import simple_seg
from pypinyin.converter import UltimateConverter
from pypinyin.contrib.tone_convert import to_tone
from .onnx_api import G2PWOnnxConverter
current_file_path = os.path.dirname(__file__)
CACHE_PATH = os.path.join(current_file_path, "polyphonic.pickle")
PP_DICT_PATH = os.path.join(current_file_path, "polyphonic.rep")
PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
class G2PWPinyin(Pinyin):
def __init__(
self,
model_dir="G2PWModel/",
model_source=None,
enable_non_tradional_chinese=True,
v_to_u=False,
neutral_tone_with_five=False,
tone_sandhi=False,
**kwargs,
):
self._g2pw = G2PWOnnxConverter(
model_dir=model_dir,
style="pinyin",
model_source=model_source,
enable_non_tradional_chinese=enable_non_tradional_chinese,
)
self._converter = Converter(
self._g2pw,
v_to_u=v_to_u,
neutral_tone_with_five=neutral_tone_with_five,
tone_sandhi=tone_sandhi,
)
def get_seg(self, **kwargs):
return simple_seg
class Converter(UltimateConverter):
def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
super(Converter, self).__init__(
v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs
)
self._g2pw = g2pw_instance
def convert(self, words, style, heteronym, errors, strict, **kwargs):
pys = []
if RE_HANS.match(words):
pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict)
post_data = self.post_pinyin(words, heteronym, pys)
if post_data is not None:
pys = post_data
pys = self.convert_styles(pys, words, style, heteronym, errors, strict)
else:
py = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict)
if py:
pys.extend(py)
return _remove_dup_and_empty(pys)
def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
pinyins = []
g2pw_pinyin = self._g2pw(han)
        if not g2pw_pinyin:  # fall back to pypinyin's original logic for characters g2pw does not support
return super(Converter, self).convert(han, Style.TONE, heteronym, errors, strict, **kwargs)
for i, item in enumerate(g2pw_pinyin[0]):
            if item is None:  # fall back to pypinyin's original logic for characters g2pw does not support
py = super(Converter, self).convert(han[i], Style.TONE, heteronym, errors, strict, **kwargs)
pinyins.extend(py)
else:
pinyins.append([to_tone(item)])
return pinyins
def _remove_dup_items(lst, remove_empty=False):
new_lst = []
for item in lst:
if remove_empty and not item:
continue
if item not in new_lst:
new_lst.append(item)
return new_lst
def _remove_dup_and_empty(lst_list):
new_lst_list = []
for lst in lst_list:
lst = _remove_dup_items(lst, remove_empty=True)
if lst:
new_lst_list.append(lst)
else:
new_lst_list.append([""])
return new_lst_list
def cache_dict(polyphonic_dict, file_path):
with open(file_path, "wb") as pickle_file:
pickle.dump(polyphonic_dict, pickle_file)
def get_dict():
if os.path.exists(CACHE_PATH):
with open(CACHE_PATH, "rb") as pickle_file:
polyphonic_dict = pickle.load(pickle_file)
else:
polyphonic_dict = read_dict()
cache_dict(polyphonic_dict, CACHE_PATH)
return polyphonic_dict
def read_dict():
polyphonic_dict = {}
with open(PP_DICT_PATH, encoding="utf-8") as f:
line = f.readline()
while line:
key, value_str = line.split(":")
value = eval(value_str.strip())
polyphonic_dict[key.strip()] = value
line = f.readline()
with open(PP_FIX_DICT_PATH, encoding="utf-8") as f:
line = f.readline()
while line:
key, value_str = line.split(":")
value = eval(value_str.strip())
polyphonic_dict[key.strip()] = value
line = f.readline()
return polyphonic_dict
def correct_pronunciation(word, word_pinyins):
new_pinyins = pp_dict.get(word, "")
if new_pinyins == "":
for idx, w in enumerate(word):
w_pinyin = pp_dict.get(w, "")
if w_pinyin != "":
word_pinyins[idx] = w_pinyin[0]
return word_pinyins
else:
return new_pinyins
pp_dict = get_dict()
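A sketch of the whole-word override, using the 湖泊 entry that ships in polyphonic.rep:

assert correct_pronunciation("湖泊", ["hu2", "bo2"]) == ["hu2", "po1"]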

View File

@@ -0,0 +1,247 @@
# This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw
# This code is modified from https://github.com/GitYCC/g2pW
import json
import os
import warnings
import zipfile
from typing import Any, Dict, List, Tuple
import numpy as np
import onnxruntime
import requests
import torch
from opencc import OpenCC
from pypinyin import Style, pinyin
from transformers.models.auto.tokenization_auto import AutoTokenizer
from ..zh_normalization.char_convert import tranditional_to_simplified
from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input
from .utils import load_config
onnxruntime.set_default_logger_severity(3)
try:
    onnxruntime.preload_dlls()
except Exception:
    pass
# traceback.print_exc()
warnings.filterwarnings("ignore")
model_version = "1.1"
def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[List[str], List[float]]:
all_preds = []
all_confidences = []
probs = session.run(
[],
{
"input_ids": onnx_input["input_ids"],
"token_type_ids": onnx_input["token_type_ids"],
"attention_mask": onnx_input["attention_masks"],
"phoneme_mask": onnx_input["phoneme_masks"],
"char_ids": onnx_input["char_ids"],
"position_ids": onnx_input["position_ids"],
},
)[0]
preds = np.argmax(probs, axis=1).tolist()
max_probs = []
for index, arr in zip(preds, probs.tolist()):
max_probs.append(arr[index])
all_preds += [labels[pred] for pred in preds]
all_confidences += max_probs
return all_preds, all_confidences
def download_and_decompress(model_dir: str = "G2PWModel/"):
if not os.path.exists(model_dir):
parent_directory = os.path.dirname(model_dir)
zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip")
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
print("Extracting g2pw model...")
with zipfile.ZipFile(zip_dir, "r") as zip_ref:
zip_ref.extractall(parent_directory)
os.rename(extract_dir, extract_dir_new)
return model_dir
class G2PWOnnxConverter:
def __init__(
self,
model_dir: str = "G2PWModel/",
style: str = "bopomofo",
model_source: str = None,
enable_non_tradional_chinese: bool = False,
):
uncompress_path = download_and_decompress(model_dir)
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
else:
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
self.polyphonic_chars = [
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
        self.non_polyphonic = {
            "一",
            "不",
            "和",
            "咋",
            "嗲",
            "剖",
            "差",
            "攢",
            "倒",
            "难",
            "奔",
            "劲",
            "拗",
            "肖",
            "瘙",
            "誒",
            "泊",
            "听",
            "噢",
        }
        self.non_monophonic = {"似", "攢"}
self.monophonic_chars = [
line.split("\t") for line in open(monophonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
self.labels, self.char2phonemes = (
get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
if self.config.use_char_phoneme
else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
)
self.chars = sorted(list(self.char2phonemes.keys()))
self.polyphonic_chars_new = set(self.chars)
for char in self.non_polyphonic:
if char in self.polyphonic_chars_new:
self.polyphonic_chars_new.remove(char)
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
for char in self.non_monophonic:
if char in self.monophonic_chars_dict:
self.monophonic_chars_dict.pop(char)
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
self.bopomofo_convert_dict = json.load(fr)
self.style_convert_func = {
"bopomofo": lambda x: x,
"pinyin": self._convert_bopomofo_to_pinyin,
}[style]
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
self.char_bopomofo_dict = json.load(fr)
if self.enable_opencc:
self.cc = OpenCC("s2tw")
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
assert tone in "12345"
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
if component:
return component + tone
else:
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str):
sentences = [sentences]
if self.enable_opencc:
translated_sentences = []
for sent in sentences:
translated_sent = self.cc.convert(sent)
assert len(translated_sent) == len(sent)
translated_sentences.append(translated_sent)
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
if len(texts) == 0:
# sentences no polyphonic words
return partial_results
onnx_input = prepare_onnx_input(
tokenizer=self.tokenizer,
labels=self.labels,
char2phonemes=self.char2phonemes,
chars=self.chars,
texts=texts,
query_ids=query_ids,
use_mask=self.config.use_mask,
window_size=None,
)
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
if self.config.use_char_phoneme:
preds = [pred.split(" ")[1] for pred in preds]
results = partial_results
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
results[sent_id][query_id] = self.style_convert_func(pred)
return results
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences):
            # pypinyin works better for Simplified Chinese than for Traditional Chinese
sent_s = tranditional_to_simplified(sent)
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
partial_result = [None] * len(sent)
for i, char in enumerate(sent):
if char in self.polyphonic_chars_new:
texts.append(sent)
query_ids.append(i)
sent_ids.append(sent_id)
elif char in self.monophonic_chars_dict:
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
elif char in self.char_bopomofo_dict:
partial_result[i] = pypinyin_result[i][0]
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
else:
partial_result[i] = pypinyin_result[i][0]
partial_results.append(partial_result)
return texts, query_ids, sent_ids, partial_results
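A hedged usage sketch; the paths below mirror the defaults used elsewhere in this commit, and the model is downloaded on first run:

# conv = G2PWOnnxConverter(
#     model_dir="GPT_SoVITS/text/G2PWModel",
#     style="pinyin",
#     model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
# )
# conv(["一切为了达到目的"])  # -> one list of pinyin strings per input sentence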

File diff suppressed because it is too large

View File

@@ -0,0 +1,53 @@
湖泊: ['hu2','po1']
地壳: ['di4','qiao4']
柏树: ['bai3','shu4']
曝光: ['bao4','guang1']
弹力: ['tan2','li4']
字帖: ['zi4','tie4']
口吃: ['kou3','chi1']
包扎: ['bao1','za1']
哪吒: ['ne2','zha1']
说服: ['shuo1','fu2']
识字: ['shi2','zi4']
骨头: ['gu3','tou5']
对称: ['dui4','chen4']
口供: ['kou3','gong4']
抹布: ['ma1','bu4']
露背: ['lu4','bei4']
圈养: ['juan4', 'yang3']
眼眶: ['yan3', 'kuang4']
品行: ['pin3','xing2']
颤抖: ['chan4','dou3']
差不多: ['cha4','bu5','duo1']
鸭绿江: ['ya1','lu4','jiang1']
撒切尔: ['sa4','qie4','er3']
比比皆是: ['bi3','bi3','jie1','shi4']
身无长物: ['shen1','wu2','chang2','wu4']
手里: ['shou2','li3']
关卡: ['guan1','qia3']
怀揣: ['huai2','chuai1']
挑剔: ['tiao1','ti4']
供称: ['gong4','cheng1']
作坊: ['zuo1', 'fang5']
中医: ['zhong1','yi1']
嚷嚷: ['rang1','rang5']
商厦: ['shang1','sha4']
大厦: ['da4','sha4']
刹车: ['sha1','che1']
嘚瑟: ['de4','se5']
朝鲜: ['chao2','xian3']
阿房宫: ['e1','pang2','gong1']
阿胶: ['e1','jiao1']
咖喱: ['ga1','li5']
时分: ['shi2','fen1']
蚌埠: ['beng4','bu4']
驯服: ['xun4','fu2']
幸免于难: ['xing4','mian3','yu2','nan4']
恶行: ['e4','xing2']
唉: ['ai4']
扎实: ['zha1','shi2']
干将: ['gan4','jiang4']
陈威行: ['chen2', 'wei1', 'hang2']
郭晟: ['guo1', 'sheng4']
中标: ['zhong4', 'biao1']
抗住: ['kang2', 'zhu4']
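A hypothetical loader sketch for this dictionary format (the real consumer is not part of this diff); each line is "word: <python list of pinyin>":

import ast

def load_polyphonic_overrides(path: str) -> dict:
    # parse "湖泊: ['hu2','po1']"-style lines into {word: pinyin_list}
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if ":" in line:
                word, pinyin = line.split(":", 1)
                table[word.strip()] = ast.literal_eval(pinyin.strip())
    return table

# e.g. load_polyphonic_overrides("polyphonic.txt")["地壳"] == ["di4", "qiao4"]
# ("polyphonic.txt" is a placeholder file name)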

View File

@@ -0,0 +1,143 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re
def wordize_and_map(text: str):
words = []
index_map_from_text_to_word = []
index_map_from_word_to_text = []
while len(text) > 0:
match_space = re.match(r"^ +", text)
if match_space:
space_str = match_space.group(0)
index_map_from_text_to_word += [None] * len(space_str)
text = text[len(space_str) :]
continue
match_en = re.match(r"^[a-zA-Z0-9]+", text)
if match_en:
en_word = match_en.group(0)
word_start_pos = len(index_map_from_text_to_word)
word_end_pos = word_start_pos + len(en_word)
index_map_from_word_to_text.append((word_start_pos, word_end_pos))
index_map_from_text_to_word += [len(words)] * len(en_word)
words.append(en_word)
text = text[len(en_word) :]
else:
word_start_pos = len(index_map_from_text_to_word)
word_end_pos = word_start_pos + 1
index_map_from_word_to_text.append((word_start_pos, word_end_pos))
index_map_from_text_to_word += [len(words)]
words.append(text[0])
text = text[1:]
return words, index_map_from_text_to_word, index_map_from_word_to_text
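# Illustrative check (added sketch, not in the original file): for a mixed
# ASCII/CJK string, an English run becomes one word, spaces map to None, and
# each remaining character is its own word.
if __name__ == "__main__":
    w, t2w, w2t = wordize_and_map("hello 你好")
    assert w == ["hello", "你", "好"]
    assert t2w == [0, 0, 0, 0, 0, None, 1, 2]
    assert w2t == [(0, 5), (6, 7), (7, 8)]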
def tokenize_and_map(tokenizer, text: str):
words, text2word, word2text = wordize_and_map(text=text)
tokens = []
index_map_from_token_to_text = []
for word, (word_start, word_end) in zip(words, word2text):
word_tokens = tokenizer.tokenize(word)
if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
index_map_from_token_to_text.append((word_start, word_end))
tokens.append("[UNK]")
else:
current_word_start = word_start
for word_token in word_tokens:
word_token_len = len(re.sub(r"^##", "", word_token))
index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len))
current_word_start = current_word_start + word_token_len
tokens.append(word_token)
index_map_from_text_to_token = text2word
for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):
for token_pos in range(token_start, token_end):
index_map_from_text_to_token[token_pos] = i
return tokens, index_map_from_text_to_token, index_map_from_token_to_text
def _load_config(config_path: os.PathLike):
import importlib.util
spec = importlib.util.spec_from_file_location("__init__", config_path)
config = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config)
return config
default_config_dict = {
"manual_seed": 1313,
"model_source": "bert-base-chinese",
"window_size": 32,
"num_workers": 2,
"use_mask": True,
"use_char_phoneme": False,
"use_conditional": True,
"param_conditional": {
"affect_location": "softmax",
"bias": True,
"char-linear": True,
"pos-linear": False,
"char+pos-second": True,
"char+pos-second_lowrank": False,
"lowrank_size": 0,
"char+pos-second_fm": False,
"fm_size": 0,
"fix_mode": None,
"count_json": "train.count.json",
},
"lr": 5e-5,
"val_interval": 200,
"num_iter": 10000,
"use_focal": False,
"param_focal": {"alpha": 0.0, "gamma": 0.7},
"use_pos": True,
"param_pos ": {
"weight": 0.1,
"pos_joint_training": True,
"train_pos_path": "train.pos",
"valid_pos_path": "dev.pos",
"test_pos_path": "test.pos",
},
}
def load_config(config_path: os.PathLike, use_default: bool = False):
config = _load_config(config_path)
if use_default:
for attr, val in default_config_dict.items():
if not hasattr(config, attr):
setattr(config, attr, val)
elif isinstance(val, dict):
d = getattr(config, attr)
for dict_k, dict_v in val.items():
if dict_k not in d:
d[dict_k] = dict_v
return config
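A usage sketch (the config path is hypothetical): use_default back-fills any attribute missing from the loaded module, merging nested dicts key by key:

config = load_config("g2pW/config.py", use_default=True)  # hypothetical path
print(config.model_source, config.window_size)  # e.g. "bert-base-chinese" 32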

File diff suppressed because it is too large

View File

@@ -0,0 +1,276 @@
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re
import os
import hashlib
try:
import pyopenjtalk
current_file_path = os.path.dirname(__file__)
# prevent the model from failing to load on Windows
if os.name == "nt":
python_dir = os.getcwd()
OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8")
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)):
if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper():
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir))
else:
import shutil
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ja")):
os.mkdir(os.path.join("TEMP", "ja"))
if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")):
shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic"))
shutil.copytree(
pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"),
os.path.join("TEMP", "ja", "open_jtalk_dic"),
)
OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic")
pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8")
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)):
if current_file_path[: len(python_dir)].upper() == python_dir.upper():
current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir))
else:
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ja")):
os.mkdir(os.path.join("TEMP", "ja"))
if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")):
os.mkdir(os.path.join("TEMP", "ja", "ja_userdic"))
shutil.copyfile(
os.path.join(current_file_path, "ja_userdic", "userdict.csv"),
os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"),
)
current_file_path = os.path.join("TEMP", "ja")
def get_hash(fp: str) -> str:
hash_md5 = hashlib.md5()
with open(fp, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
# if no user dictionary exists, build one; if it exists, check its md5 and rebuild when it differs
if os.path.exists(USERDIC_CSV_PATH):
if (
not os.path.exists(USERDIC_BIN_PATH)
or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read()
):
pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
f.write(get_hash(USERDIC_CSV_PATH))
if os.path.exists(USERDIC_BIN_PATH):
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
except Exception:
# print(e)
import pyopenjtalk
# failed to load user dictionary, ignore.
pass
from text.symbols import punctuation
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
# List of (consonant, sokuon) pairs:
_real_sokuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"Q([↑↓]*[kg])", r"k#\1"),
(r"Q([↑↓]*[tdjʧ])", r"t#\1"),
(r"Q([↑↓]*[sʃ])", r"s\1"),
(r"Q([↑↓]*[pb])", r"p#\1"),
]
]
# List of (consonant, hatsuon) pairs:
_real_hatsuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"N([↑↓]*[pbm])", r"m\1"),
(r"N([↑↓]*[ʧʥj])", r"n^\1"),
(r"N([↑↓]*[tdn])", r"n\1"),
(r"N([↑↓]*[kg])", r"ŋ\1"),
]
]
def post_replace_ph(ph):
rep_map = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…",
}
if ph in rep_map.keys():
ph = rep_map[ph]
return ph
def replace_consecutive_punctuation(text):
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
def symbols_to_japanese(text):
for regex, replacement in _symbols_to_japanese:
text = re.sub(regex, replacement, text)
return text
def preprocess_jap(text, with_prosody=False):
"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
text = symbols_to_japanese(text)
# Lower-case English words; this should have no influence on Japanese words.
text = text.lower()
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
text = []
for i, sentence in enumerate(sentences):
if re.match(_japanese_characters, sentence):
if with_prosody:
text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
else:
p = pyopenjtalk.g2p(sentence)
text += p.split(" ")
if i < len(marks):
if marks[i] == " ":  # prevent unexpected UNK
continue
text += [marks[i].replace(" ", "")]
return text
def text_normalize(text):
# todo: Japanese text normalization
# avoid reference leakage caused by repeated punctuation
text = replace_consecutive_punctuation(text)
return text
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
"""Extract phoneme + prosoody symbol sequence from input full-context labels.
The algorithm is based on `Prosodic features control by symbols as input of
sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
Args:
text (str): Input text.
drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
Returns:
List[str]: List of phoneme + prosody symbols.
Examples:
>>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
>>> pyopenjtalk_g2p_prosody("こんにちは。")
['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
.. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
"""
labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
N = len(labels)
phones = []
for n in range(N):
lab_curr = labels[n]
# current phoneme
p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
# treat unvoiced vowels as normal vowels
if drop_unvoiced_vowels and p3 in "AEIOU":
p3 = p3.lower()
# deal with sil at the beginning and the end of text
if p3 == "sil":
assert n == 0 or n == N - 1
if n == 0:
phones.append("^")
elif n == N - 1:
# check question form or not
e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
if e3 == 0:
phones.append("$")
elif e3 == 1:
phones.append("?")
continue
elif p3 == "pau":
phones.append("_")
continue
else:
phones.append(p3)
# accent type and position info (forward or backward)
a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
# number of mora in accent phrase
f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
# accent phrase border
if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
phones.append("#")
# pitch falling
elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
phones.append("]")
# pitch rising
elif a2 == 1 and a2_next == 2:
phones.append("[")
return phones
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def _numeric_feature_by_regex(regex, s):
match = re.search(regex, s)
if match is None:
return -50
return int(match.group(1))
def g2p(norm_text, with_prosody=True):
phones = preprocess_jap(norm_text, with_prosody)
phones = [post_replace_ph(i) for i in phones]
# todo: implement tones and word2ph
return phones
if __name__ == "__main__":
phones = g2p("Hello.こんにちは今日もNiCe天気ですねtokyotowerに行きましょう")
print(phones)

View File

@@ -0,0 +1,337 @@
# reference: https://github.com/ORI-Muchim/MB-iSTFT-VITS-Korean/blob/main/text/korean.py
import re
from jamo import h2j, j2hcj
import ko_pron
from g2pk2 import G2p
import importlib
import os
# prevent the model from failing to load on Windows
if os.name == "nt":
class win_G2p(G2p):
def check_mecab(self):
super().check_mecab()
spam_spec = importlib.util.find_spec("eunjeon")
non_found = spam_spec is None
if non_found:
print("you have to install eunjeon. install it...")
else:
installpath = spam_spec.submodule_search_locations[0]
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import sys
from eunjeon import Mecab as _Mecab
class Mecab(_Mecab):
def get_dicpath(installpath):
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import shutil
python_dir = os.getcwd()
if installpath[: len(python_dir)].upper() == python_dir.upper():
dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc")
else:
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ko")):
os.mkdir(os.path.join("TEMP", "ko"))
if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")):
shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict"))
shutil.copytree(
os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict")
)
dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc")
else:
dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc"))
return dicpath
def __init__(self, dicpath=get_dicpath(installpath)):
super().__init__(dicpath=dicpath)
sys.modules["eunjeon"].Mecab = Mecab
G2p = win_G2p
from text.symbols2 import symbols
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = (
"군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
)
# List of (hangul, hangul divided) pairs:
_hangul_divided = [
(re.compile("%s" % x[0]), x[1])
for x in [
# ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
# ('ㄵ', 'ㄴㅈ'),
# ('ㄶ', 'ㄴㅎ'),
# ('ㄺ', 'ㄹㄱ'),
# ('ㄻ', 'ㄹㅁ'),
# ('ㄼ', 'ㄹㅂ'),
# ('ㄽ', 'ㄹㅅ'),
# ('ㄾ', 'ㄹㅌ'),
# ('ㄿ', 'ㄹㅍ'),
# ('ㅀ', 'ㄹㅎ'),
# ('ㅄ', 'ㅂㅅ'),
("", "ㅗㅏ"),
("", "ㅗㅐ"),
("", "ㅗㅣ"),
("", "ㅜㅓ"),
("", "ㅜㅔ"),
("", "ㅜㅣ"),
("", "ㅡㅣ"),
("", "ㅣㅏ"),
("", "ㅣㅐ"),
("", "ㅣㅓ"),
("", "ㅣㅔ"),
("", "ㅣㅗ"),
("", "ㅣㅜ"),
]
]
# List of (Latin alphabet, hangul) pairs:
_latin_to_hangul = [
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
for x in [
("a", "에이"),
("b", ""),
("c", ""),
("d", ""),
("e", ""),
("f", "에프"),
("g", ""),
("h", "에이치"),
("i", "아이"),
("j", "제이"),
("k", "케이"),
("l", ""),
("m", ""),
("n", ""),
("o", ""),
("p", ""),
("q", ""),
("r", "아르"),
("s", "에스"),
("t", ""),
("u", ""),
("v", "브이"),
("w", "더블유"),
("x", "엑스"),
("y", "와이"),
("z", "제트"),
]
]
# List of (ipa, lazy ipa) pairs:
_ipa_to_lazy_ipa = [
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
for x in [
("t͡ɕ", "ʧ"),
("d͡ʑ", "ʥ"),
("ɲ", "n^"),
("ɕ", "ʃ"),
("ʷ", "w"),
("ɭ", "l`"),
("ʎ", "ɾ"),
("ɣ", "ŋ"),
("ɰ", "ɯ"),
("ʝ", "j"),
("ʌ", "ə"),
("ɡ", "g"),
("\u031a", "#"),
("\u0348", "="),
("\u031e", ""),
("\u0320", ""),
("\u0339", ""),
]
]
def fix_g2pk2_error(text):
new_text = ""
i = 0
while i < len(text) - 4:
if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "ㄹ":
new_text += text[i : i + 3] + " " + "ㄴ"
i += 5
else:
new_text += text[i]
i += 1
new_text += text[i:]
return new_text
def latin_to_hangul(text):
for regex, replacement in _latin_to_hangul:
text = re.sub(regex, replacement, text)
return text
def divide_hangul(text):
text = j2hcj(h2j(text))
for regex, replacement in _hangul_divided:
text = re.sub(regex, replacement, text)
return text
def hangul_number(num, sino=True):
"""Reference https://github.com/Kyubyong/g2pK"""
num = re.sub(",", "", num)
if num == "0":
return ""
if not sino and num == "20":
return "스무"
digits = "123456789"
names = "일이삼사오육칠팔구"
digit2name = {d: n for d, n in zip(digits, names)}
modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
spelledout = []
for i, digit in enumerate(num):
i = len(num) - i - 1
if sino:
if i == 0:
name = digit2name.get(digit, "")
elif i == 1:
name = digit2name.get(digit, "") + ""
name = name.replace("일십", "")
else:
if i == 0:
name = digit2mod.get(digit, "")
elif i == 1:
name = digit2dec.get(digit, "")
if digit == "0":
if i % 4 == 0:
last_three = spelledout[-min(3, len(spelledout)) :]
if "".join(last_three) == "":
spelledout.append("")
continue
else:
spelledout.append("")
continue
if i == 2:
name = digit2name.get(digit, "") + "백"
name = name.replace("일백", "백")
elif i == 3:
name = digit2name.get(digit, "") + "천"
name = name.replace("일천", "천")
elif i == 4:
name = digit2name.get(digit, "") + "만"
name = name.replace("일만", "만")
elif i == 5:
name = digit2name.get(digit, "") + "십"
name = name.replace("일십", "십")
elif i == 6:
name = digit2name.get(digit, "") + "백"
name = name.replace("일백", "백")
elif i == 7:
name = digit2name.get(digit, "") + "천"
name = name.replace("일천", "천")
elif i == 8:
name = digit2name.get(digit, "") + "억"
elif i == 9:
name = digit2name.get(digit, "") + "십"
elif i == 10:
name = digit2name.get(digit, "") + "백"
elif i == 11:
name = digit2name.get(digit, "") + "천"
elif i == 12:
name = digit2name.get(digit, "") + "조"
elif i == 13:
name = digit2name.get(digit, "") + "십"
elif i == 14:
name = digit2name.get(digit, "") + "백"
elif i == 15:
name = digit2name.get(digit, "") + "천"
spelledout.append(name)
return "".join(elem for elem in spelledout)
def number_to_hangul(text):
"""Reference https://github.com/Kyubyong/g2pK"""
tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text))
for token in tokens:
num, classifier = token
if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
spelledout = hangul_number(num, sino=False)
else:
spelledout = hangul_number(num, sino=True)
text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}")
# digit by digit for remaining digits
digits = "0123456789"
names = "영일이삼사오육칠팔구"
for d, n in zip(digits, names):
text = text.replace(d, n)
return text
def korean_to_lazy_ipa(text):
text = latin_to_hangul(text)
text = number_to_hangul(text)
text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text)
for regex, replacement in _ipa_to_lazy_ipa:
text = re.sub(regex, replacement, text)
return text
_g2p = G2p()
def korean_to_ipa(text):
text = latin_to_hangul(text)
text = number_to_hangul(text)
text = _g2p(text)
text = fix_g2pk2_error(text)
text = korean_to_lazy_ipa(text)
return text.replace("ʧ", "").replace("ʥ", "")
def post_replace_ph(ph):
rep_map = {
"": ",",
"": ",",
"": ",",
"": ".",
"": "!",
"": "?",
"\n": ".",
"·": ",",
"": ",",
"...": "",
" ": "",
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
if ph not in symbols:
ph = ""
return ph
def g2p(text):
text = latin_to_hangul(text)
text = _g2p(text)
text = divide_hangul(text)
text = fix_g2pk2_error(text)
text = re.sub(r"([\u3131-\u3163])$", r"\1.", text)
# text = "".join([post_replace_ph(i) for i in text])
text = [post_replace_ph(i) for i in text]
return text
if __name__ == "__main__":
text = "안녕하세요"
print(g2p(text))

View File

@@ -0,0 +1,429 @@
a AA a
ai AA ai
an AA an
ang AA ang
ao AA ao
ba b a
bai b ai
ban b an
bang b ang
bao b ao
bei b ei
ben b en
beng b eng
bi b i
bian b ian
biao b iao
bie b ie
bin b in
bing b ing
bo b o
bu b u
ca c a
cai c ai
can c an
cang c ang
cao c ao
ce c e
cei c ei
cen c en
ceng c eng
cha ch a
chai ch ai
chan ch an
chang ch ang
chao ch ao
che ch e
chen ch en
cheng ch eng
chi ch ir
chong ch ong
chou ch ou
chu ch u
chua ch ua
chuai ch uai
chuan ch uan
chuang ch uang
chui ch ui
chun ch un
chuo ch uo
ci c i0
cong c ong
cou c ou
cu c u
cuan c uan
cui c ui
cun c un
cuo c uo
da d a
dai d ai
dan d an
dang d ang
dao d ao
de d e
dei d ei
den d en
deng d eng
di d i
dia d ia
dian d ian
diao d iao
die d ie
ding d ing
diu d iu
dong d ong
dou d ou
du d u
duan d uan
dui d ui
dun d un
duo d uo
e EE e
ei EE ei
en EE en
eng EE eng
er EE er
fa f a
fan f an
fang f ang
fei f ei
fen f en
feng f eng
fo f o
fou f ou
fu f u
ga g a
gai g ai
gan g an
gang g ang
gao g ao
ge g e
gei g ei
gen g en
geng g eng
gong g ong
gou g ou
gu g u
gua g ua
guai g uai
guan g uan
guang g uang
gui g ui
gun g un
guo g uo
ha h a
hai h ai
han h an
hang h ang
hao h ao
he h e
hei h ei
hen h en
heng h eng
hong h ong
hou h ou
hu h u
hua h ua
huai h uai
huan h uan
huang h uang
hui h ui
hun h un
huo h uo
ji j i
jia j ia
jian j ian
jiang j iang
jiao j iao
jie j ie
jin j in
jing j ing
jiong j iong
jiu j iu
ju j v
jv j v
juan j van
jvan j van
jue j ve
jve j ve
jun j vn
jvn j vn
ka k a
kai k ai
kan k an
kang k ang
kao k ao
ke k e
kei k ei
ken k en
keng k eng
kong k ong
kou k ou
ku k u
kua k ua
kuai k uai
kuan k uan
kuang k uang
kui k ui
kun k un
kuo k uo
la l a
lai l ai
lan l an
lang l ang
lao l ao
le l e
lei l ei
leng l eng
li l i
lia l ia
lian l ian
liang l iang
liao l iao
lie l ie
lin l in
ling l ing
liu l iu
lo l o
long l ong
lou l ou
lu l u
luan l uan
lun l un
luo l uo
lv l v
lve l ve
ma m a
mai m ai
man m an
mang m ang
mao m ao
me m e
mei m ei
men m en
meng m eng
mi m i
mian m ian
miao m iao
mie m ie
min m in
ming m ing
miu m iu
mo m o
mou m ou
mu m u
na n a
nai n ai
nan n an
nang n ang
nao n ao
ne n e
nei n ei
nen n en
neng n eng
ni n i
nian n ian
niang n iang
niao n iao
nie n ie
nin n in
ning n ing
niu n iu
nong n ong
nou n ou
nu n u
nuan n uan
nun n un
nuo n uo
nv n v
nve n ve
o OO o
ou OO ou
pa p a
pai p ai
pan p an
pang p ang
pao p ao
pei p ei
pen p en
peng p eng
pi p i
pian p ian
piao p iao
pie p ie
pin p in
ping p ing
po p o
pou p ou
pu p u
qi q i
qia q ia
qian q ian
qiang q iang
qiao q iao
qie q ie
qin q in
qing q ing
qiong q iong
qiu q iu
qu q v
qv q v
quan q van
qvan q van
que q ve
qve q ve
qun q vn
qvn q vn
ran r an
rang r ang
rao r ao
re r e
ren r en
reng r eng
ri r ir
rong r ong
rou r ou
ru r u
rua r ua
ruan r uan
rui r ui
run r un
ruo r uo
sa s a
sai s ai
san s an
sang s ang
sao s ao
se s e
sen s en
seng s eng
sha sh a
shai sh ai
shan sh an
shang sh ang
shao sh ao
she sh e
shei sh ei
shen sh en
sheng sh eng
shi sh ir
shou sh ou
shu sh u
shua sh ua
shuai sh uai
shuan sh uan
shuang sh uang
shui sh ui
shun sh un
shuo sh uo
si s i0
song s ong
sou s ou
su s u
suan s uan
sui s ui
sun s un
suo s uo
ta t a
tai t ai
tan t an
tang t ang
tao t ao
te t e
tei t ei
teng t eng
ti t i
tian t ian
tiao t iao
tie t ie
ting t ing
tong t ong
tou t ou
tu t u
tuan t uan
tui t ui
tun t un
tuo t uo
wa w a
wai w ai
wan w an
wang w ang
wei w ei
wen w en
weng w eng
wo w o
wu w u
xi x i
xia x ia
xian x ian
xiang x iang
xiao x iao
xie x ie
xin x in
xing x ing
xiong x iong
xiu x iu
xu x v
xv x v
xuan x van
xvan x van
xue x ve
xve x ve
xun x vn
xvn x vn
ya y a
yan y En
yang y ang
yao y ao
ye y E
yi y i
yin y in
ying y ing
yo y o
yong y ong
you y ou
yu y v
yv y v
yuan y van
yvan y van
yue y ve
yve y ve
yun y vn
yvn y vn
za z a
zai z ai
zan z an
zang z ang
zao z ao
ze z e
zei z ei
zen z en
zeng z eng
zha zh a
zhai zh ai
zhan zh an
zhang zh ang
zhao zh ao
zhe zh e
zhei zh ei
zhen zh en
zheng zh eng
zhi zh ir
zhong zh ong
zhou zh ou
zhu zh u
zhua zh ua
zhuai zh uai
zhuan zh uan
zhuang zh uang
zhui zh ui
zhun zh un
zhuo zh uo
zi z i0
zong z ong
zou z ou
zu z u
zuan z uan
zui z ui
zun z un
zuo z uo
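A hypothetical loader sketch for the table above (the real consumer is not in this diff); each row maps a pinyin syllable to its phoneme sequence:

def load_pinyin_to_phones(path: str) -> dict:
    # three-or-more space-separated fields: syllable, then its phonemes
    mapping = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if parts:
                mapping[parts[0]] = parts[1:]
    return mapping

# e.g. mapping["zhong"] == ["zh", "ong"]; zero-initial syllables use the
# placeholder initials AA/EE/OO/y/w, e.g. mapping["a"] == ["AA", "a"]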

View File

@@ -0,0 +1,399 @@
# punctuation = ['!', '?', '…', ",", ".","@"]  # '@' is an SP pause
punctuation = ["!", "?", "…", ",", "."]  # '@' is an SP pause
punctuation.append("-")
pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
# pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"]
pad = "_"
c = [
"AA",
"EE",
"OO",
"b",
"c",
"ch",
"d",
"f",
"g",
"h",
"j",
"k",
"l",
"m",
"n",
"p",
"q",
"r",
"s",
"sh",
"t",
"w",
"x",
"y",
"z",
"zh",
]
v = [
"E1",
"En1",
"a1",
"ai1",
"an1",
"ang1",
"ao1",
"e1",
"ei1",
"en1",
"eng1",
"er1",
"i1",
"i01",
"ia1",
"ian1",
"iang1",
"iao1",
"ie1",
"in1",
"ing1",
"iong1",
"ir1",
"iu1",
"o1",
"ong1",
"ou1",
"u1",
"ua1",
"uai1",
"uan1",
"uang1",
"ui1",
"un1",
"uo1",
"v1",
"van1",
"ve1",
"vn1",
"E2",
"En2",
"a2",
"ai2",
"an2",
"ang2",
"ao2",
"e2",
"ei2",
"en2",
"eng2",
"er2",
"i2",
"i02",
"ia2",
"ian2",
"iang2",
"iao2",
"ie2",
"in2",
"ing2",
"iong2",
"ir2",
"iu2",
"o2",
"ong2",
"ou2",
"u2",
"ua2",
"uai2",
"uan2",
"uang2",
"ui2",
"un2",
"uo2",
"v2",
"van2",
"ve2",
"vn2",
"E3",
"En3",
"a3",
"ai3",
"an3",
"ang3",
"ao3",
"e3",
"ei3",
"en3",
"eng3",
"er3",
"i3",
"i03",
"ia3",
"ian3",
"iang3",
"iao3",
"ie3",
"in3",
"ing3",
"iong3",
"ir3",
"iu3",
"o3",
"ong3",
"ou3",
"u3",
"ua3",
"uai3",
"uan3",
"uang3",
"ui3",
"un3",
"uo3",
"v3",
"van3",
"ve3",
"vn3",
"E4",
"En4",
"a4",
"ai4",
"an4",
"ang4",
"ao4",
"e4",
"ei4",
"en4",
"eng4",
"er4",
"i4",
"i04",
"ia4",
"ian4",
"iang4",
"iao4",
"ie4",
"in4",
"ing4",
"iong4",
"ir4",
"iu4",
"o4",
"ong4",
"ou4",
"u4",
"ua4",
"uai4",
"uan4",
"uang4",
"ui4",
"un4",
"uo4",
"v4",
"van4",
"ve4",
"vn4",
"E5",
"En5",
"a5",
"ai5",
"an5",
"ang5",
"ao5",
"e5",
"ei5",
"en5",
"eng5",
"er5",
"i5",
"i05",
"ia5",
"ian5",
"iang5",
"iao5",
"ie5",
"in5",
"ing5",
"iong5",
"ir5",
"iu5",
"o5",
"ong5",
"ou5",
"u5",
"ua5",
"uai5",
"uan5",
"uang5",
"ui5",
"un5",
"uo5",
"v5",
"van5",
"ve5",
"vn5",
]
v_without_tone = [
"E",
"En",
"a",
"ai",
"an",
"ang",
"ao",
"e",
"ei",
"en",
"eng",
"er",
"i",
"i0",
"ia",
"ian",
"iang",
"iao",
"ie",
"in",
"ing",
"iong",
"ir",
"iu",
"o",
"ong",
"ou",
"u",
"ua",
"uai",
"uan",
"uang",
"ui",
"un",
"uo",
"v",
"van",
"ve",
"vn",
]
# japanese
ja_symbols = [
"I",
"N",
"U",
"a",
"b",
"by",
"ch",
"cl",
"d",
"dy",
"e",
"f",
"g",
"gy",
"h",
"hy",
"i",
"j",
"k",
"ky",
"m",
"my",
"n",
"ny",
"o",
"p",
"py",
"r",
"ry",
"s",
"sh",
"t",
"ts",
"u",
"v",
"w",
"y",
"z",
# "[", #上升调型
# "]", #下降调型
# "$", #结束符
# "^", #开始符
]
arpa = {
"AH0",
"S",
"AH1",
"EY2",
"AE2",
"EH0",
"OW2",
"UH0",
"NG",
"B",
"G",
"AY0",
"M",
"AA0",
"F",
"AO0",
"ER2",
"UH1",
"IY1",
"AH2",
"DH",
"IY0",
"EY1",
"IH0",
"K",
"N",
"W",
"IY2",
"T",
"AA1",
"ER1",
"EH2",
"OY0",
"UH2",
"UW1",
"Z",
"AW2",
"AW1",
"V",
"UW2",
"AA2",
"ER",
"AW0",
"UW0",
"R",
"OW1",
"EH1",
"ZH",
"AE0",
"IH2",
"IH",
"Y",
"JH",
"P",
"AY1",
"EY0",
"OY2",
"TH",
"HH",
"D",
"ER0",
"CH",
"AO1",
"AE1",
"AO2",
"OY1",
"AY2",
"IH1",
"OW0",
"L",
"SH",
}
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
symbols = sorted(set(symbols))
if __name__ == "__main__":
print(len(symbols))

View File

@@ -0,0 +1,797 @@
# punctuation = ['!', '?', '…', ",", ".","@"]  # '@' is an SP pause
punctuation = ["!", "?", "…", ",", "."]  # '@' is an SP pause
punctuation.append("-")
pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
# pu_symbols = punctuation + ["SP", 'SP2', 'SP3','SP4', "UNK"]
pad = "_"
c = [
"AA",
"EE",
"OO",
"b",
"c",
"ch",
"d",
"f",
"g",
"h",
"j",
"k",
"l",
"m",
"n",
"p",
"q",
"r",
"s",
"sh",
"t",
"w",
"x",
"y",
"z",
"zh",
]
v = [
"E1",
"En1",
"a1",
"ai1",
"an1",
"ang1",
"ao1",
"e1",
"ei1",
"en1",
"eng1",
"er1",
"i1",
"i01",
"ia1",
"ian1",
"iang1",
"iao1",
"ie1",
"in1",
"ing1",
"iong1",
"ir1",
"iu1",
"o1",
"ong1",
"ou1",
"u1",
"ua1",
"uai1",
"uan1",
"uang1",
"ui1",
"un1",
"uo1",
"v1",
"van1",
"ve1",
"vn1",
"E2",
"En2",
"a2",
"ai2",
"an2",
"ang2",
"ao2",
"e2",
"ei2",
"en2",
"eng2",
"er2",
"i2",
"i02",
"ia2",
"ian2",
"iang2",
"iao2",
"ie2",
"in2",
"ing2",
"iong2",
"ir2",
"iu2",
"o2",
"ong2",
"ou2",
"u2",
"ua2",
"uai2",
"uan2",
"uang2",
"ui2",
"un2",
"uo2",
"v2",
"van2",
"ve2",
"vn2",
"E3",
"En3",
"a3",
"ai3",
"an3",
"ang3",
"ao3",
"e3",
"ei3",
"en3",
"eng3",
"er3",
"i3",
"i03",
"ia3",
"ian3",
"iang3",
"iao3",
"ie3",
"in3",
"ing3",
"iong3",
"ir3",
"iu3",
"o3",
"ong3",
"ou3",
"u3",
"ua3",
"uai3",
"uan3",
"uang3",
"ui3",
"un3",
"uo3",
"v3",
"van3",
"ve3",
"vn3",
"E4",
"En4",
"a4",
"ai4",
"an4",
"ang4",
"ao4",
"e4",
"ei4",
"en4",
"eng4",
"er4",
"i4",
"i04",
"ia4",
"ian4",
"iang4",
"iao4",
"ie4",
"in4",
"ing4",
"iong4",
"ir4",
"iu4",
"o4",
"ong4",
"ou4",
"u4",
"ua4",
"uai4",
"uan4",
"uang4",
"ui4",
"un4",
"uo4",
"v4",
"van4",
"ve4",
"vn4",
"E5",
"En5",
"a5",
"ai5",
"an5",
"ang5",
"ao5",
"e5",
"ei5",
"en5",
"eng5",
"er5",
"i5",
"i05",
"ia5",
"ian5",
"iang5",
"iao5",
"ie5",
"in5",
"ing5",
"iong5",
"ir5",
"iu5",
"o5",
"ong5",
"ou5",
"u5",
"ua5",
"uai5",
"uan5",
"uang5",
"ui5",
"un5",
"uo5",
"v5",
"van5",
"ve5",
"vn5",
]
v_without_tone = [
"E",
"En",
"a",
"ai",
"an",
"ang",
"ao",
"e",
"ei",
"en",
"eng",
"er",
"i",
"i0",
"ia",
"ian",
"iang",
"iao",
"ie",
"in",
"ing",
"iong",
"ir",
"iu",
"o",
"ong",
"ou",
"u",
"ua",
"uai",
"uan",
"uang",
"ui",
"un",
"uo",
"v",
"van",
"ve",
"vn",
]
# japanese
ja_symbols = [
"I",
"N",
"U",
"a",
"b",
"by",
"ch",
"cl",
"d",
"dy",
"e",
"f",
"g",
"gy",
"h",
"hy",
"i",
"j",
"k",
"ky",
"m",
"my",
"n",
"ny",
"o",
"p",
"py",
"r",
"ry",
"s",
"sh",
"t",
"ts",
"u",
"v",
"w",
"y",
"z",
### the two marks below are reserved to be added later
# "[",  # rising intonation
# "]",  # falling intonation
# "$",  # end-of-sentence marker
# "^",  # start-of-sentence marker
]
arpa = {
"AH0",
"S",
"AH1",
"EY2",
"AE2",
"EH0",
"OW2",
"UH0",
"NG",
"B",
"G",
"AY0",
"M",
"AA0",
"F",
"AO0",
"ER2",
"UH1",
"IY1",
"AH2",
"DH",
"IY0",
"EY1",
"IH0",
"K",
"N",
"W",
"IY2",
"T",
"AA1",
"ER1",
"EH2",
"OY0",
"UH2",
"UW1",
"Z",
"AW2",
"AW1",
"V",
"UW2",
"AA2",
"ER",
"AW0",
"UW0",
"R",
"OW1",
"EH1",
"ZH",
"AE0",
"IH2",
"IH",
"Y",
"JH",
"P",
"AY1",
"EY0",
"OY2",
"TH",
"HH",
"D",
"ER0",
"CH",
"AO1",
"AE1",
"AO2",
"OY1",
"AY2",
"IH1",
"OW0",
"L",
"SH",
}
ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
# ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
yue_symbols = {
"Yeot3",
"Yip1",
"Yyu3",
"Yeng4",
"Yut5",
"Yaan5",
"Ym5",
"Yaan6",
"Yang1",
"Yun4",
"Yon2",
"Yui5",
"Yun2",
"Yat3",
"Ye",
"Yeot1",
"Yoeng5",
"Yoek2",
"Yam2",
"Yeon6",
"Yu6",
"Yiu3",
"Yaang6",
"Yp5",
"Yai4",
"Yoek4",
"Yit6",
"Yam5",
"Yoeng6",
"Yg1",
"Yk3",
"Yoe4",
"Yam3",
"Yc",
"Yyu4",
"Yyut1",
"Yiu4",
"Ying3",
"Yip3",
"Yaap3",
"Yau3",
"Yan4",
"Yau1",
"Yap4",
"Yk6",
"Yok3",
"Yai1",
"Yeot6",
"Yan2",
"Yoek6",
"Yt1",
"Yoi1",
"Yit5",
"Yn4",
"Yaau3",
"Yau4",
"Yuk6",
"Ys",
"Yuk",
"Yin6",
"Yung6",
"Ya",
"You",
"Yaai5",
"Yau5",
"Yoi3",
"Yaak3",
"Yaat3",
"Ying2",
"Yok5",
"Yeng2",
"Yyut3",
"Yam1",
"Yip5",
"You1",
"Yam6",
"Yaa5",
"Yi6",
"Yek4",
"Yyu2",
"Yuk5",
"Yaam1",
"Yang2",
"Yai",
"Yiu6",
"Yin4",
"Yok4",
"Yot3",
"Yui2",
"Yeoi5",
"Yyun6",
"Yyu5",
"Yoi5",
"Yeot2",
"Yim4",
"Yeoi2",
"Yaan1",
"Yang6",
"Yong1",
"Yaang4",
"Yung5",
"Yeon1",
"Yin2",
"Ya3",
"Yaang3",
"Yg",
"Yk2",
"Yaau5",
"Yut1",
"Yt5",
"Yip4",
"Yung4",
"Yj",
"Yong3",
"Ya1",
"Yg6",
"Yaau6",
"Yit3",
"Yun3",
"Ying1",
"Yn2",
"Yg4",
"Yl",
"Yp3",
"Yn3",
"Yak1",
"Yang5",
"Yoe6",
"You2",
"Yap2",
"Yak2",
"Yt3",
"Yot5",
"Yim2",
"Yi1",
"Yn6",
"Yaat5",
"Yaam3",
"Yoek5",
"Ye3",
"Yeon4",
"Yaa2",
"Yu3",
"Yim6",
"Ym",
"Yoe3",
"Yaai2",
"Ym2",
"Ya6",
"Yeng6",
"Yik4",
"Yot4",
"Yaai4",
"Yyun3",
"Yu1",
"Yoeng1",
"Yaap2",
"Yuk3",
"Yoek3",
"Yeng5",
"Yeoi1",
"Yiu2",
"Yok1",
"Yo1",
"Yoek1",
"Yoeng2",
"Yeon5",
"Yiu1",
"Yoeng4",
"Yuk2",
"Yat4",
"Yg5",
"Yut4",
"Yan6",
"Yin3",
"Yaa6",
"Yap1",
"Yg2",
"Yoe5",
"Yt4",
"Ya5",
"Yo4",
"Yyu1",
"Yak3",
"Yeon2",
"Yong4",
"Ym1",
"Ye2",
"Yaang5",
"Yoi2",
"Yeng3",
"Yn",
"Yyut4",
"Yau",
"Yaak2",
"Yaan4",
"Yek2",
"Yin1",
"Yi5",
"Yoe2",
"Yei5",
"Yaat6",
"Yak5",
"Yp6",
"Yok6",
"Yei2",
"Yaap1",
"Yyut5",
"Yi4",
"Yim1",
"Yk5",
"Ye4",
"Yok2",
"Yaam6",
"Yat2",
"Yon6",
"Yei3",
"Yyu6",
"Yeot5",
"Yk4",
"Yai6",
"Yd",
"Yg3",
"Yei6",
"Yau2",
"Yok",
"Yau6",
"Yung3",
"Yim5",
"Yut6",
"Yit1",
"Yon3",
"Yat1",
"Yaam2",
"Yyut2",
"Yui6",
"Yt2",
"Yek6",
"Yt",
"Ye6",
"Yang3",
"Ying6",
"Yaau1",
"Yeon3",
"Yng",
"Yh",
"Yang4",
"Ying5",
"Yaap6",
"Yoeng3",
"Yyun4",
"You3",
"Yan5",
"Yat5",
"Yot1",
"Yun1",
"Yi3",
"Yaa1",
"Yaap4",
"You6",
"Yaang2",
"Yaap5",
"Yaa3",
"Yaak6",
"Yeng1",
"Yaak1",
"Yo5",
"Yoi4",
"Yam4",
"Yik1",
"Ye1",
"Yai5",
"Yung1",
"Yp2",
"Yui4",
"Yaak4",
"Yung2",
"Yak4",
"Yaat4",
"Yeoi4",
"Yut2",
"Yin5",
"Yaau4",
"Yap6",
"Yb",
"Yaam4",
"Yw",
"Yut3",
"Yong2",
"Yt6",
"Yaai6",
"Yap5",
"Yik5",
"Yun6",
"Yaam5",
"Yun5",
"Yik3",
"Ya2",
"Yyut6",
"Yon4",
"Yk1",
"Yit4",
"Yak6",
"Yaan2",
"Yuk1",
"Yai2",
"Yik2",
"Yaat2",
"Yo3",
"Ykw",
"Yn5",
"Yaa",
"Ye5",
"Yu4",
"Yei1",
"Yai3",
"Yyun5",
"Yip2",
"Yaau2",
"Yiu5",
"Ym4",
"Yeoi6",
"Yk",
"Ym6",
"Yoe1",
"Yeoi3",
"Yon",
"Yuk4",
"Yaai3",
"Yaa4",
"Yot6",
"Yaang1",
"Yei4",
"Yek1",
"Yo",
"Yp",
"Yo6",
"Yp4",
"Yan3",
"Yoi",
"Yap3",
"Yek3",
"Yim3",
"Yz",
"Yot2",
"Yoi6",
"Yit2",
"Yu5",
"Yaan3",
"Yan1",
"Yon5",
"Yp1",
"Yong5",
"Ygw",
"Yak",
"Yat6",
"Ying4",
"Yu2",
"Yf",
"Ya4",
"Yon1",
"You4",
"Yik6",
"Yui1",
"Yaat1",
"Yeot4",
"Yi2",
"Yaai1",
"Yek5",
"Ym3",
"Yong6",
"You5",
"Yyun1",
"Yn1",
"Yo2",
"Yip6",
"Yui3",
"Yaak5",
"Yyun2",
}
# symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)  ### appending yue directly here scrambles the order
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
symbols = sorted(set(symbols))
# print(len(symbols))
symbols += ["[", "]"] ##日文新增上升下降调型
symbols += sorted(list(ko_symbols))
symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复韩文显然不会重复
# print(len(symbols))
if __name__ == "__main__":
print(len(symbols))
"""
粤语:
732-353=379
韩文+粤语:
732-322=410
"""

View File

@@ -0,0 +1,774 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple
import jieba_fast as jieba
from pypinyin import lazy_pinyin
from pypinyin import Style
class ToneSandhi:
def __init__(self):
self.must_neural_tone_words = {
"麻烦",
"麻利",
"鸳鸯",
"高粱",
"骨头",
"骆驼",
"马虎",
"首饰",
"馒头",
"馄饨",
"风筝",
"难为",
"队伍",
"阔气",
"闺女",
"门道",
"锄头",
"铺盖",
"铃铛",
"铁匠",
"钥匙",
"里脊",
"里头",
"部分",
"那么",
"道士",
"造化",
"迷糊",
"连累",
"这么",
"这个",
"运气",
"过去",
"软和",
"转悠",
"踏实",
"跳蚤",
"跟头",
"趔趄",
"财主",
"豆腐",
"讲究",
"记性",
"记号",
"认识",
"规矩",
"见识",
"裁缝",
"补丁",
"衣裳",
"衣服",
"衙门",
"街坊",
"行李",
"行当",
"蛤蟆",
"蘑菇",
"薄荷",
"葫芦",
"葡萄",
"萝卜",
"荸荠",
"苗条",
"苗头",
"苍蝇",
"芝麻",
"舒服",
"舒坦",
"舌头",
"自在",
"膏药",
"脾气",
"脑袋",
"脊梁",
"能耐",
"胳膊",
"胭脂",
"胡萝",
"胡琴",
"胡同",
"聪明",
"耽误",
"耽搁",
"耷拉",
"耳朵",
"老爷",
"老实",
"老婆",
"老头",
"老太",
"翻腾",
"罗嗦",
"罐头",
"编辑",
"结实",
"红火",
"累赘",
"糨糊",
"糊涂",
"精神",
"粮食",
"簸箕",
"篱笆",
"算计",
"算盘",
"答应",
"笤帚",
"笑语",
"笑话",
"窟窿",
"窝囊",
"窗户",
"稳当",
"稀罕",
"称呼",
"秧歌",
"秀气",
"秀才",
"福气",
"祖宗",
"砚台",
"码头",
"石榴",
"石头",
"石匠",
"知识",
"眼睛",
"眯缝",
"眨巴",
"眉毛",
"相声",
"盘算",
"白净",
"痢疾",
"痛快",
"疟疾",
"疙瘩",
"疏忽",
"畜生",
"生意",
"甘蔗",
"琵琶",
"琢磨",
"琉璃",
"玻璃",
"玫瑰",
"玄乎",
"狐狸",
"状元",
"特务",
"牲口",
"牙碜",
"牌楼",
"爽快",
"爱人",
"热闹",
"烧饼",
"烟筒",
"烂糊",
"点心",
"炊帚",
"灯笼",
"火候",
"漂亮",
"滑溜",
"溜达",
"温和",
"清楚",
"消息",
"浪头",
"活泼",
"比方",
"正经",
"欺负",
"模糊",
"槟榔",
"棺材",
"棒槌",
"棉花",
"核桃",
"栅栏",
"柴火",
"架势",
"枕头",
"枇杷",
"机灵",
"本事",
"木头",
"木匠",
"朋友",
"月饼",
"月亮",
"暖和",
"明白",
"时候",
"新鲜",
"故事",
"收拾",
"收成",
"提防",
"挖苦",
"挑剔",
"指甲",
"指头",
"拾掇",
"拳头",
"拨弄",
"招牌",
"招呼",
"抬举",
"护士",
"折腾",
"扫帚",
"打量",
"打算",
"打点",
"打扮",
"打听",
"打发",
"扎实",
"扁担",
"戒指",
"懒得",
"意识",
"意思",
"情形",
"悟性",
"怪物",
"思量",
"怎么",
"念头",
"念叨",
"快活",
"忙活",
"志气",
"心思",
"得罪",
"张罗",
"弟兄",
"开通",
"应酬",
"庄稼",
"干事",
"帮手",
"帐篷",
"希罕",
"师父",
"师傅",
"巴结",
"巴掌",
"差事",
"工夫",
"岁数",
"屁股",
"尾巴",
"少爷",
"小气",
"小伙",
"将就",
"对头",
"对付",
"寡妇",
"家伙",
"客气",
"实在",
"官司",
"学问",
"学生",
"字号",
"嫁妆",
"媳妇",
"媒人",
"婆家",
"娘家",
"委屈",
"姑娘",
"姐夫",
"妯娌",
"妥当",
"妖精",
"奴才",
"女婿",
"头发",
"太阳",
"大爷",
"大方",
"大意",
"大夫",
"多少",
"多么",
"外甥",
"壮实",
"地道",
"地方",
"在乎",
"困难",
"嘴巴",
"嘱咐",
"嘟囔",
"嘀咕",
"喜欢",
"喇嘛",
"喇叭",
"商量",
"唾沫",
"哑巴",
"哈欠",
"哆嗦",
"咳嗽",
"和尚",
"告诉",
"告示",
"含糊",
"吓唬",
"后头",
"名字",
"名堂",
"合同",
"吆喝",
"叫唤",
"口袋",
"厚道",
"厉害",
"千斤",
"包袱",
"包涵",
"匀称",
"勤快",
"动静",
"动弹",
"功夫",
"力气",
"前头",
"刺猬",
"刺激",
"别扭",
"利落",
"利索",
"利害",
"分析",
"出息",
"凑合",
"凉快",
"冷战",
"冤枉",
"冒失",
"养活",
"关系",
"先生",
"兄弟",
"便宜",
"使唤",
"佩服",
"作坊",
"体面",
"位置",
"似的",
"伙计",
"休息",
"什么",
"人家",
"亲戚",
"亲家",
"交情",
"云彩",
"事情",
"买卖",
"主意",
"丫头",
"丧气",
"两口",
"东西",
"东家",
"世故",
"不由",
"不在",
"下水",
"下巴",
"上头",
"上司",
"丈夫",
"丈人",
"一辈",
"那个",
"菩萨",
"父亲",
"母亲",
"咕噜",
"邋遢",
"费用",
"冤家",
"甜头",
"介绍",
"荒唐",
"大人",
"泥鳅",
"幸福",
"熟悉",
"计划",
"扑腾",
"蜡烛",
"姥爷",
"照顾",
"喉咙",
"吉他",
"弄堂",
"蚂蚱",
"凤凰",
"拖沓",
"寒碜",
"糟蹋",
"倒腾",
"报复",
"逻辑",
"盘缠",
"喽啰",
"牢骚",
"咖喱",
"扫把",
"惦记",
}
self.must_not_neural_tone_words = {
"男子",
"女子",
"分子",
"原子",
"量子",
"莲子",
"石子",
"瓜子",
"电子",
"人人",
"虎虎",
"幺幺",
"干嘛",
"学子",
"哈哈",
"数数",
"袅袅",
"局地",
"以下",
"娃哈哈",
"花花草草",
"留得",
"耕地",
"想想",
"熙熙",
"攘攘",
"卵子",
"死死",
"冉冉",
"恳恳",
"佼佼",
"吵吵",
"打打",
"考考",
"整整",
"莘莘",
"落地",
"算子",
"家家户户",
"青青",
}
self.punc = ":,;。?!“”‘’':,;.?!"
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
# word: "家里"
# pos: "s"
# finals: ['ia1', 'i3']
def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
# reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
for j, item in enumerate(word):
if (
j - 1 >= 0
and item == word[j - 1]
and pos[0] in {"n", "v", "a"}
and word not in self.must_not_neural_tone_words
):
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("")
if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
finals[-1] = finals[-1][:-1] + "5"
elif len(word) >= 1 and word[-1] in "的地得":
finals[-1] = finals[-1][:-1] + "5"
# e.g. 走了, 看着, 去过
elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
finals[-1] = finals[-1][:-1] + "5"
elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
# e.g. 桌上, 地下, 家里
elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
finals[-1] = finals[-1][:-1] + "5"
# e.g. 上来, 下去
elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
finals[-1] = finals[-1][:-1] + "5"
# 个 used as a classifier
elif (
ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
) or word == "个":
finals[ge_idx] = finals[ge_idx][:-1] + "5"
else:
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
word_list = self._split_word(word)
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
for i, word in enumerate(word_list):
# conventional neural in Chinese
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
finals = sum(finals_list, [])
return finals
def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
# e.g. 看不懂
if len(word) == 3 and word[1] == "":
finals[1] = finals[1][:-1] + "5"
else:
for i, char in enumerate(word):
# "不" before tone4 should be bu2, e.g. 不怕
if char == "" and i + 1 < len(word) and finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
return finals
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
# "一" in number sequences, e.g. 一零零, 二一零
if word.find("") != -1 and all([item.isnumeric() for item in word if item != ""]):
return finals
# "一" between reduplication words shold be yi5, e.g. 看一看
elif len(word) == 3 and word[1] == "" and word[0] == word[-1]:
finals[1] = finals[1][:-1] + "5"
# when "一" is ordinal word, it should be yi1
elif word.startswith("第一"):
finals[1] = finals[1][:-1] + "1"
else:
for i, char in enumerate(word):
if char == "" and i + 1 < len(word):
# "一" before tone4 should be yi2, e.g. 一段
if finals[i + 1][-1] == "4":
finals[i] = finals[i][:-1] + "2"
# "一" before non-tone4 should be yi4, e.g. 一天
else:
# "一" 后面如果是标点,还读一声
if word[i + 1] not in self.punc:
finals[i] = finals[i][:-1] + "4"
return finals
def _split_word(self, word: str) -> List[str]:
word_list = jieba.cut_for_search(word)
word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
first_subword = word_list[0]
first_begin_idx = word.find(first_subword)
if first_begin_idx == 0:
second_subword = word[len(first_subword) :]
new_word_list = [first_subword, second_subword]
else:
second_subword = word[: -len(first_subword)]
new_word_list = [second_subword, first_subword]
return new_word_list
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
if len(word) == 2 and self._all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
word_list = self._split_word(word)
if self._all_tone_three(finals):
# disyllabic + monosyllabic, e.g. 蒙古/包
if len(word_list[0]) == 2:
finals[0] = finals[0][:-1] + "2"
finals[1] = finals[1][:-1] + "2"
# monosyllabic + disyllabic, e.g. 纸/老虎
elif len(word_list[0]) == 1:
finals[1] = finals[1][:-1] + "2"
else:
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
if len(finals_list) == 2:
for i, sub in enumerate(finals_list):
# e.g. 所有/人
if self._all_tone_three(sub) and len(sub) == 2:
finals_list[i][0] = finals_list[i][0][:-1] + "2"
# e.g. 好/喜欢
elif (
i == 1
and not self._all_tone_three(sub)
and finals_list[i][0][-1] == "3"
and finals_list[0][-1][-1] == "3"
):
finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
finals = sum(finals_list, [])
# split an idiom into two two-character words
elif len(word) == 4:
finals_list = [finals[:2], finals[2:]]
finals = []
for sub in finals_list:
if self._all_tone_three(sub):
sub[0] = sub[0][:-1] + "2"
finals += sub
return finals
def _all_tone_three(self, finals: List[str]) -> bool:
return all(x[-1] == "3" for x in finals)
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
last_word = ""
for word, pos in seg:
if last_word == "":
word = last_word + word
if word != "":
new_seg.append((word, pos))
last_word = word[:]
if last_word == "":
new_seg.append((last_word, "d"))
last_word = ""
return new_seg
# function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
# function 2: merge single "一" and the word behind it
# if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
# e.g.
# input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
# output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
i = 0
# function 1
while i < len(seg):
word, pos = seg[i]
merged = False
if i - 1 >= 0 and word == "" and i + 1 < len(seg):
last = new_seg[-1] if new_seg else seg[i - 1]
if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
combined = last[0] + "" + seg[i + 1][0]
new_seg[-1] = [combined, last[1]]
i += 2
merged = True
if not merged:
new_seg.append([word, pos])
i += 1
seg = new_seg
new_seg = []
# function 2
for word, pos in seg:
if new_seg and new_seg[-1][0] == "":
new_seg[-1][0] = new_seg[-1][0] + word
else:
new_seg.append([word, pos])
return new_seg
# merge consecutive words when both consist entirely of tone-three syllables
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if (
i - 1 >= 0
and self._all_tone_three(sub_finals_list[i - 1])
and self._all_tone_three(sub_finals_list[i])
and not merge_last[i - 1]
):
# if the previous word is a reduplication, do not merge, because reduplications must go through _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
def _is_reduplication(self, word: str) -> bool:
return len(word) == 2 and word[0] == word[1]
# merge when the last char of the first word and the first char of the second word are both tone three
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
for i, (word, pos) in enumerate(seg):
if (
i - 1 >= 0
and sub_finals_list[i - 1][-1][-1] == "3"
and sub_finals_list[i][0][-1] == "3"
and not merge_last[i - 1]
):
# if the previous word is a reduplication, do not merge, because reduplications must go through _neural_sandhi
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
new_seg.append([word, pos])
else:
new_seg.append([word, pos])
return new_seg
def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if i - 1 >= 0 and word == "" and seg[i - 1][0] != "#":
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
for i, (word, pos) in enumerate(seg):
if new_seg and word == new_seg[-1][0]:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
else:
new_seg.append([word, pos])
return new_seg
def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
seg = self._merge_bu(seg)
try:
seg = self._merge_yi(seg)
except Exception:
print("_merge_yi failed")
seg = self._merge_reduplication(seg)
try:
seg = self._merge_continuous_three_tones(seg)
except Exception:
print("_merge_continuous_three_tones failed")
try:
seg = self._merge_continuous_three_tones_2(seg)
except Exception:
print("_merge_continuous_three_tones_2 failed")
seg = self._merge_er(seg)
return seg
def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
finals = self._three_sandhi(word, finals)
return finals
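A minimal usage sketch (import path assumed): "老虎" shows the third-tone sandhi rule, where two consecutive third tones become tone 2 + tone 3:

from pypinyin import lazy_pinyin, Style

ts = ToneSandhi()
finals = lazy_pinyin("老虎", neutral_tone_with_five=True, style=Style.FINALS_TONE3)
print(ts.modified_tone("老虎", "n", finals))  # ['ao2', 'u3']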

View File

@@ -0,0 +1,16 @@
## Supported NSW (Non-Standard-Word) Normalization
|NSW type|raw|normalized|
|:--|:-|:-|
|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
|date|她出生于86年8月18日她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
|time|等会请在12:05请通知我|等会请在十二点零五分请通知我|
|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度|
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|percentage|明天有62%的概率降雨|明天有百分之六十二的概率降雨|
|money|随便来几个价格12块534.5元20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
## References
[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)
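A usage sketch (assuming the `TextNormalizer` class exported by this package; `normalize` splits the input into sentences and applies the rules above):

    from text.zh_normalization import TextNormalizer
    tx = TextNormalizer()
    print(tx.normalize("今天的最低气温达到-10°C"))  # ['今天的最低气温达到零下十度']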

View File

@@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from text.zh_normalization.text_normlization import *

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,139 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit
def _time_num2str(num_string: str) -> str:
"""A special case for verbalizing number in time."""
result = num2str(num_string.lstrip("0"))
if num_string.startswith("0"):
result = DIGITS["0"] + result
return result
# clock-time expressions
RE_TIME = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)
# time ranges, e.g. 8:30-12:30
RE_TIME_RANGE = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
r"(~|-)"
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)
def replace_time(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
is_range = len(match.groups()) > 5
hour = match.group(1)
minute = match.group(2)
second = match.group(4)
if is_range:
hour_2 = match.group(6)
minute_2 = match.group(7)
second_2 = match.group(9)
result = f"{num2str(hour)}"
if minute.lstrip("0"):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute)}"
if second and second.lstrip("0"):
result += f"{_time_num2str(second)}"
if is_range:
result += ""
result += f"{num2str(hour_2)}"
if minute_2.lstrip("0"):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute_2)}"
if second_2 and second_2.lstrip("0"):
result += f"{_time_num2str(second_2)}"
return result
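# Illustrative check (added sketch, not part of the original file), traced
# against the branches above: a leading zero in the minutes reads as 零, and
# a 30-minute mark reads as 半.
if __name__ == "__main__":
    assert RE_TIME.sub(replace_time, "12:05") == "十二点零五分"
    assert RE_TIME.sub(replace_time, "8:30") == "八点半"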
RE_DATE = re.compile(
r"(\d{4}|\d{2})年"
r"((0?[1-9]|1[0-2])月)?"
r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?"
)
def replace_date(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
year = match.group(1)
month = match.group(3)
day = match.group(5)
result = ""
if year:
result += f"{verbalize_digit(year)}"
if month:
result += f"{verbalize_cardinal(month)}"
if day:
result += f"{verbalize_cardinal(day)}{match.group(9)}"
return result
# YYYY/MM/DD or YYYY-MM-DD style dates separated by "/", "-", "." or space
RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])")
def replace_date2(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
year = match.group(1)
month = match.group(3)
day = match.group(4)
result = ""
if year:
result += f"{verbalize_digit(year)}"
if month:
result += f"{verbalize_cardinal(month)}"
if day:
result += f"{verbalize_cardinal(day)}"
return result
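# Illustrative check (added sketch): ISO-style dates read the year digit by
# digit and the month/day as cardinals.
if __name__ == "__main__":
    assert RE_DATE2.sub(replace_date2, "2024-01-05") == "二零二四年一月五日"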

View File

@@ -0,0 +1,62 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import string
from pypinyin.constants import SUPPORT_UCS4
# full-width / half-width conversion
# full-width -> half-width map for ASCII letters (num: 52)
F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters}
# half-width -> full-width map for ASCII letters
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
# full-width -> half-width map for digits (num: 10)
F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
# half-width -> full-width map for digits
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
# full-width -> half-width map for punctuation (num: 32)
F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
# half-width -> full-width map for punctuation
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
# space (num: 1)
F2H_SPACE = {"\u3000": " "}
H2F_SPACE = {" ": "\u3000"}
# strings not made of pinyin-bearing Chinese characters; usable for NSW extraction
if SUPPORT_UCS4:
RE_NSW = re.compile(
r"(?:[^"
r"\u3007" #
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF]
r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F]
r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D]
r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F]
r"])+"
)
else:
RE_NSW = re.compile( # pragma: no cover
r"(?:[^"
r"\u3007" #
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
r"])+"
)
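# Illustrative check (added sketch): RE_NSW pulls out maximal runs of
# non-Chinese-character text for downstream normalization.
if __name__ == "__main__":
    assert RE_NSW.findall("今天的最低气温达到-10°C") == ["-10°C"]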

View File

@@ -0,0 +1,339 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from collections import OrderedDict
from typing import List
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
UNITS = OrderedDict(
{
1: "",
2: "",
3: "",
4: "",
8: "亿",
}
)
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
# fraction expressions
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
def replace_frac(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
numerator = match.group(2)
denominator = match.group(3)
sign: str = "负" if sign else ""
numerator: str = num2str(numerator)
denominator: str = num2str(denominator)
result = f"{sign}{denominator}分之{numerator}"
return result
# percentage expressions
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
def replace_percentage(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
percent = match.group(2)
sign: str = "" if sign else ""
percent: str = num2str(percent)
result = f"{sign}百分之{percent}"
return result
# integer expressions
# negative integers, e.g. -10
RE_INTEGER = re.compile(r"(-)" r"(\d+)")
def replace_negative_num(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
number = match.group(2)
sign: str = "" if sign else ""
number: str = num2str(number)
result = f"{sign}{number}"
return result
# serial numbers - unsigned integers
# e.g. 00078
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
def replace_default_num(match):
"""
Args:
match (re.Match)
Returns:
str
"""
number = match.group(0)
return verbalize_digit(number, alt_one=True)
# basic arithmetic: addition, subtraction, multiplication, division
# RE_ASMD = re.compile(
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
RE_ASMD = re.compile(
r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)
asmd_map = {"+": "", "-": "", "×": "", "÷": "", "=": "等于"}
def replace_asmd(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
result = match.group(1) + asmd_map[match.group(8)] + match.group(9)
return result
# dedicated handling of exponents
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
power_map = {
"⁰": "0",
"¹": "1",
"²": "2",
"³": "3",
"⁴": "4",
"⁵": "5",
"⁶": "6",
"⁷": "7",
"⁸": "8",
"⁹": "9",
"ˣ": "x",
"ʸ": "y",
"ⁿ": "n",
}
def replace_power(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
power_num = ""
for m in match.group(0):
power_num += power_map[m]
result = "" + power_num + "次方"
return result
# number expressions
# pure decimals
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# positive integer + quantifier
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
def replace_positive_quantifier(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
number = match.group(1)
match_2 = match.group(2)
if match_2 == "+":
match_2 = "多"
match_2: str = match_2 if match_2 else ""
quantifiers: str = match.group(3)
number: str = num2str(number)
number = "" if number == "" else number
result = f"{number}{match_2}{quantifiers}"
return result
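# Illustrative: 二 is read 两 in front of a quantifier:
# >>> RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, "2个")
# '两个'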
def replace_number(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
number = match.group(2)
pure_decimal = match.group(5)
if pure_decimal:
result = num2str(pure_decimal)
else:
sign: str = "" if sign else ""
number: str = num2str(number)
result = f"{sign}{number}"
return result
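# Illustrative:
# >>> RE_NUMBER.sub(replace_number, "-1.5")
# '负一点五'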
# Range expressions
# match.group(1) and match.group(6) are copied from RE_NUMBER
RE_RANGE = re.compile(
r"""
(?<![\d\+\-\×÷=]) # negative lookbehind: no digit or operator right before the range
((-?)((\d+)(\.\d+)?)) # start of the range: optionally signed integer or decimal
[-~] # range separator
((-?)((\d+)(\.\d+)?)) # end of the range: optionally signed integer or decimal
(?![\d\+\-\×÷=]) # negative lookahead: no digit or operator right after the range
""",
re.VERBOSE,
)
def replace_range(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
first, second = match.group(1), match.group(6)
first = RE_NUMBER.sub(replace_number, first)
second = RE_NUMBER.sub(replace_number, second)
result = f"{first}{second}"
return result
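# Illustrative:
# >>> RE_RANGE.sub(replace_range, "62~95")
# '六十二到九十五'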
# Measurement ranges where "~" is read as 至
RE_TO_RANGE = re.compile(
r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)
def replace_to_range(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
result = match.group(0).replace("~", "至")
return result
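# Illustrative; units and digits are converted afterwards by the measure and
# number rules:
# >>> RE_TO_RANGE.sub(replace_to_range, "10°C~15°C")
# '10°C至15°C'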
RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")
def replace_version_num(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
result = ""
for c in match.group(1):
if c == ".":
result += ""
else:
result += num2str(c)
return result
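# Illustrative:
# >>> RE_VERSION_NUM.sub(replace_version_num, "1.2.3")
# '一点二点三'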
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
stripped = value_string.lstrip("0")
if len(stripped) == 0:
return []
elif len(stripped) == 1:
if use_zero and len(stripped) < len(value_string):
return [DIGITS["0"], DIGITS[stripped]]
else:
return [DIGITS[stripped]]
else:
largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped))
first_part = value_string[:-largest_unit]
second_part = value_string[-largest_unit:]
return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)
def verbalize_cardinal(value_string: str) -> str:
if not value_string:
return ""
# 000 -> '零' , 0 -> '零'
value_string = value_string.lstrip("0")
if len(value_string) == 0:
return DIGITS["0"]
result_symbols = _get_value(value_string)
# verbalized number starting with '一十*' is abbreviated as `十*`
if len(result_symbols) >= 2 and result_symbols[0] == DIGITS["1"] and result_symbols[1] == UNITS[1]:
result_symbols = result_symbols[1:]
return "".join(result_symbols)
def verbalize_digit(value_string: str, alt_one=False) -> str:
result_symbols = [DIGITS[digit] for digit in value_string]
result = "".join(result_symbols)
if alt_one:
result = result.replace("", "")
return result
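# Illustrative:
# >>> verbalize_digit("110")
# '一一零'
# >>> verbalize_digit("110", alt_one=True)
# '幺幺零'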
def num2str(value_string: str) -> str:
integer_decimal = value_string.split(".")
if len(integer_decimal) == 1:
integer = integer_decimal[0]
decimal = ""
elif len(integer_decimal) == 2:
integer, decimal = integer_decimal
else:
raise ValueError(f"The value string: '${value_string}' has more than one point in it.")
result = verbalize_cardinal(integer)
if decimal.endswith("0"):
decimal = decimal.rstrip("0") + "0"
else:
decimal = decimal.rstrip("0")
if decimal:
# '.22' is verbalized as '零点二二'
# '3.20' is verbalized as '三点二零' (one trailing zero is kept)
result = result if result else "零"
result += "点" + verbalize_digit(decimal)
return result
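# Illustrative:
# >>> num2str("3.20")
# '三点二零'
# >>> num2str("0.5")
# '零点五'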

View File

@@ -0,0 +1,59 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import verbalize_digit
# Normalize landline / mobile phone numbers
# Mobile prefixes
# http://www.jihaoba.com/news/show/13680
# China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
# China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
# China Telecom: 133, 153, 189, 180, 181, 177
RE_MOBILE_PHONE = re.compile(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
# Nationwide uniform service numbers starting with 400
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip("+").split()
result = "".join([verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split("-")
result = "".join([verbalize_digit(part, alt_one=True) for part in sil_parts])
return result
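# Illustrative, assuming the "，" separators restored above:
# >>> phone2str("0595-23865596", mobile=False)
# '零五九五，二三八六五五九六'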
def replace_phone(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
return phone2str(match.group(0), mobile=False)
def replace_mobile(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
return phone2str(match.group(0))

View File

@@ -0,0 +1,63 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import num2str
# Temperature expressions; a leading minus sign is read as 零下 (below zero)
# e.g. -3°C -> 零下三度
RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)")
measure_dict = {
"cm2": "平方厘米",
"cm²": "平方厘米",
"cm3": "立方厘米",
"cm³": "立方厘米",
"cm": "厘米",
"db": "分贝",
"ds": "毫秒",
"kg": "千克",
"km": "千米",
"m2": "平方米",
"": "平方米",
"": "立方米",
"m3": "立方米",
"ml": "毫升",
"m": "",
"mm": "毫米",
"s": "",
}
def replace_temperature(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
temperature = match.group(2)
unit = match.group(4)  # group(3) is the optional decimal part of the number
sign: str = "零下" if sign else ""
temperature: str = num2str(temperature)
unit: str = "摄氏度" if unit == "摄氏度" else "度"
result = f"{sign}{temperature}{unit}"
return result
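# Illustrative, with the unit taken from group(4) as fixed above:
# >>> RE_TEMPERATURE.sub(replace_temperature, "-3°C")
# '零下三度'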
def replace_measure(sentence) -> str:
for q_notation in measure_dict:
if q_notation in sentence:
sentence = sentence.replace(q_notation, measure_dict[q_notation])
return sentence
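# Illustrative, relying on "mm" being checked before "m" above; digits are
# verbalized later by the number rules:
# >>> replace_measure("10mm")
# '10毫米'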

View File

@@ -0,0 +1,175 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
from .char_convert import tranditional_to_simplified
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import RE_TIME_RANGE
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_VERSION_NUM
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import RE_POWER
from .num import replace_version_num
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .num import replace_to_range
from .num import replace_asmd
from .num import replace_power
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_measure
from .quantifier import replace_temperature
class TextNormalizer:
def __init__(self):
self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)")
def _split(self, text: str, lang="zh") -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations.
Args:
text (str): The input text.
Returns:
List[str]: Sentences.
"""
# Only for pure Chinese here
if lang == "zh":
text = text.replace(" ", "")
# Filter out special characters
text = re.sub(r"[——《》【】<>{}()#&@“”^_|\\]", "", text)
text = self.SENTENCE_SPLITOR.sub(r"\1\n", text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r"\n+", text)]
return sentences
def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace("/", "")
# sentence = sentence.replace('~', '至')
# sentence = sentence.replace('~', '至')
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("α", "阿尔法")
sentence = sentence.replace("β", "贝塔")
sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛")
sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔")
sentence = sentence.replace("ε", "艾普西龙")
sentence = sentence.replace("ζ", "捷塔")
sentence = sentence.replace("η", "依塔")
sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔")
sentence = sentence.replace("ι", "艾欧塔")
sentence = sentence.replace("κ", "喀帕")
sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达")
sentence = sentence.replace("μ", "")
sentence = sentence.replace("ν", "")
sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西")
sentence = sentence.replace("ο", "欧米克伦")
sentence = sentence.replace("π", "").replace("Π", "")
sentence = sentence.replace("ρ", "")
sentence = sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛")
sentence = sentence.replace("τ", "")
sentence = sentence.replace("υ", "宇普西龙")
sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾")
sentence = sentence.replace("χ", "")
sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛")
sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽")
# Fallback for bare math operators; also covers casual shorthand
sentence = sentence.replace("+", "加")
sentence = sentence.replace("-", "减")
sentence = sentence.replace("×", "乘")
sentence = sentence.replace("÷", "除")
sentence = sentence.replace("=", "等于")
# Regex filter for special characters; compared with the filter in _split, this one also strips "-" and "="
sentence = re.sub(r"[-——《》【】<=>{}()#&@“”^_|\\]", "", sentence)
return sentence
def normalize_sentence(self, sentence: str) -> str:
# basic character conversions
sentence = tranditional_to_simplified(sentence)
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE)
# number related NSW verbalization
sentence = RE_DATE.sub(replace_date, sentence)
sentence = RE_DATE2.sub(replace_date2, sentence)
# range first
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
sentence = RE_TIME.sub(replace_time, sentence)
# Handle "~" between measurements, read as 至
sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
sentence = replace_measure(sentence)
# Handle arithmetic expressions
while RE_ASMD.search(sentence):
sentence = RE_ASMD.sub(replace_asmd, sentence)
sentence = RE_POWER.sub(replace_power, sentence)
sentence = RE_FRAC.sub(replace_frac, sentence)
sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
sentence = RE_TELEPHONE.sub(replace_phone, sentence)
sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
sentence = RE_RANGE.sub(replace_range, sentence)
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_VERSION_NUM.sub(replace_version_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
sentence = RE_NUMBER.sub(replace_number, sentence)
sentence = self._post_replace(sentence)
return sentence
def normalize(self, text: str) -> List[str]:
sentences = self._split(text)
sentences = [self.normalize_sentence(sent) for sent in sentences]
return sentences
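# A minimal end-to-end sketch (hypothetical input; assumes the package-relative
# imports above resolve when run inside the package):
if __name__ == "__main__":
    tn = TextNormalizer()
    for sent in tn.normalize("固话0595-23865596,温度-3°C~5°C,版本1.2.3,共2个。"):
        print(sent)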