Add C++ runtime for MeloTTS (#1138)

2024-07-16 15:55:02 +08:00
parent 95485411fa
commit 960eb7529e
51 changed files with 693 additions and 156 deletions
--- a/scripts/melo-tts/README.md
+++ b/scripts/melo-tts/README.md
@@ -0,0 +1,6 @@
+# Introduction
+
+Models in this directory are converted from
+https://github.com/myshell-ai/MeloTTS
+
+Note that the model contains only a single female speaker.
--- a/scripts/melo-tts/export-onnx.py
+++ b/scripts/melo-tts/export-onnx.py
@@ -8,7 +8,6 @@ from melo.text import language_id_map, language_tone_start_map
 from melo.text.chinese import pinyin_to_symbol_map
 from melo.text.english import eng_dict, refine_syllables
 from pypinyin import Style, lazy_pinyin, phrases_dict, pinyin_dict
-from melo.text.symbols import language_tone_start_map

 for k, v in pinyin_to_symbol_map.items():
    if isinstance(v, list):
@@ -82,6 +81,7 @@ def generate_tokens(symbol_list):
 def generate_lexicon():
    word_dict = pinyin_dict.pinyin_dict
    phrases = phrases_dict.phrases_dict
+    eng_dict["kaldi"] = [["K", "AH0"], ["L", "D", "IH0"]]
    with open("lexicon.txt", "w", encoding="utf-8") as f:
        for word in eng_dict:
            phones, tones = refine_syllables(eng_dict[word])
@@ -237,9 +237,11 @@ def main():
    meta_data = {
        "model_type": "melo-vits",
        "comment": "melo",
+        "version": 2,
        "language": "Chinese + English",
        "add_blank": int(model.hps.data.add_blank),
        "n_speakers": 1,
+        "jieba": 1,
        "sample_rate": model.hps.data.sampling_rate,
        "bert_dim": 1024,
        "ja_bert_dim": 768,
--- a/scripts/melo-tts/run.sh
+++ b/scripts/melo-tts/run.sh
@@ -12,7 +12,7 @@ function install() {
  cd MeloTTS
  pip install -r ./requirements.txt

-  pip install soundfile onnx onnxruntime
+  pip install soundfile onnx==1.15.0 onnxruntime==1.16.3

  python3 -m unidic download
  popd
--- a/scripts/melo-tts/test.py
+++ b/scripts/melo-tts/test.py
@@ -135,28 +135,11 @@ class OnnxModel:
 def main():
    lexicon = Lexicon(lexion_filename="./lexicon.txt", tokens_filename="./tokens.txt")

-    text = "永远相信，美好的事情即将发生。"
+    text = "这是一个使用 next generation kaldi 的 text to speech 中英文例子. Thank you! 你觉得如何呢? are you ok? Fantastic! How about you?"
    s = jieba.cut(text, HMM=True)

    phones, tones = lexicon.convert(s)

-    en_text = "how are you ?".split()
-
-    phones_en, tones_en = lexicon.convert(en_text)
-    phones += [0]
-    tones += [0]
-
-    phones += phones_en
-    tones += tones_en
-
-    text = "多音字测试， 银行，行不行？长沙长大"
-    s = jieba.cut(text, HMM=True)
-
-    phones2, tones2 = lexicon.convert(s)
-
-    phones += phones2
-    tones += tones2
-
    model = OnnxModel("./model.onnx")

    if model.add_blank: