Add C++ runtime for MeloTTS (#1138)
This commit is contained in:
@@ -78,6 +78,10 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
|
||||
git diff
|
||||
popd
|
||||
|
||||
if [[ $model_dir == vits-melo-tts-zh_en ]]; then
|
||||
lang=zh_en
|
||||
fi
|
||||
|
||||
for arch in arm64-v8a armeabi-v7a x86_64 x86; do
|
||||
log "------------------------------------------------------------"
|
||||
log "build tts apk for $arch"
|
||||
|
||||
@@ -76,6 +76,10 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
|
||||
git diff
|
||||
popd
|
||||
|
||||
if [[ $model_dir == vits-melo-tts-zh_en ]]; then
|
||||
lang=zh_en
|
||||
fi
|
||||
|
||||
for arch in arm64-v8a armeabi-v7a x86_64 x86; do
|
||||
log "------------------------------------------------------------"
|
||||
log "build tts apk for $arch"
|
||||
|
||||
@@ -312,6 +312,11 @@ def get_vits_models() -> List[TtsModel]:
|
||||
model_name="vits-zh-hf-fanchen-wnj.onnx",
|
||||
lang="zh",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-melo-tts-zh_en",
|
||||
model_name="model.onnx",
|
||||
lang="zh",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-C",
|
||||
model_name="vits-zh-hf-fanchen-C.onnx",
|
||||
@@ -339,18 +344,21 @@ def get_vits_models() -> List[TtsModel]:
|
||||
),
|
||||
]
|
||||
|
||||
rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
|
||||
rule_fsts = ["phone.fst", "date.fst", "number.fst"]
|
||||
for m in chinese_models:
|
||||
s = [f"{m.model_dir}/{r}" for r in rule_fsts]
|
||||
if "vits-zh-hf" in m.model_dir or "sherpa-onnx-vits-zh-ll" == m.model_dir:
|
||||
if (
|
||||
"vits-zh-hf" in m.model_dir
|
||||
or "sherpa-onnx-vits-zh-ll" == m.model_dir
|
||||
or "melo-tts" in m.model_dir
|
||||
):
|
||||
s = s[:-1]
|
||||
m.dict_dir = m.model_dir + "/dict"
|
||||
else:
|
||||
m.rule_fars = f"{m.model_dir}/rule.far"
|
||||
|
||||
m.rule_fsts = ",".join(s)
|
||||
|
||||
if "vits-zh-hf" not in m.model_dir and "zh-ll" not in m.model_dir:
|
||||
m.rule_fars = f"{m.model_dir}/rule.far"
|
||||
|
||||
all_models = chinese_models + [
|
||||
TtsModel(
|
||||
model_dir="vits-cantonese-hf-xiaomaiiwn",
|
||||
|
||||
@@ -17,7 +17,7 @@ topics:
|
||||
- voice-activity-detection
|
||||
|
||||
# remember to change the version in ../sherpa_onnx_macos/macos/sherpa_onnx.podspec
|
||||
version: 1.10.15
|
||||
version: 1.10.16
|
||||
|
||||
homepage: https://github.com/k2-fsa/sherpa-onnx
|
||||
|
||||
|
||||
@@ -6,9 +6,6 @@ from typing import List, Optional
|
||||
|
||||
import jinja2
|
||||
|
||||
# pip install iso639-lang
|
||||
from iso639 import Lang
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
@@ -37,13 +34,6 @@ class TtsModel:
|
||||
data_dir: Optional[str] = None
|
||||
dict_dir: Optional[str] = None
|
||||
is_char: bool = False
|
||||
lang_iso_639_3: str = ""
|
||||
|
||||
|
||||
def convert_lang_to_iso_639_3(models: List[TtsModel]):
|
||||
for m in models:
|
||||
if m.lang_iso_639_3 == "":
|
||||
m.lang_iso_639_3 = Lang(m.lang).pt3
|
||||
|
||||
|
||||
def get_coqui_models() -> List[TtsModel]:
|
||||
@@ -312,6 +302,11 @@ def get_vits_models() -> List[TtsModel]:
|
||||
model_name="vits-zh-hf-fanchen-wnj.onnx",
|
||||
lang="zh",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-melo-tts-zh_en",
|
||||
model_name="model.onnx",
|
||||
lang="zh_en",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="vits-zh-hf-fanchen-C",
|
||||
model_name="vits-zh-hf-fanchen-C.onnx",
|
||||
@@ -332,26 +327,33 @@ def get_vits_models() -> List[TtsModel]:
|
||||
model_name="vits-zh-hf-fanchen-unity.onnx",
|
||||
lang="zh",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="sherpa-onnx-vits-zh-ll",
|
||||
model_name="model.onnx",
|
||||
lang="zh",
|
||||
),
|
||||
]
|
||||
|
||||
rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
|
||||
rule_fsts = ["phone.fst", "date.fst", "number.fst"]
|
||||
for m in chinese_models:
|
||||
s = [f"{m.model_dir}/{r}" for r in rule_fsts]
|
||||
if "vits-zh-hf" in m.model_dir:
|
||||
if (
|
||||
"vits-zh-hf" in m.model_dir
|
||||
or "sherpa-onnx-vits-zh-ll" == m.model_dir
|
||||
or "melo-tts" in m.model_dir
|
||||
):
|
||||
s = s[:-1]
|
||||
m.dict_dir = m.model_dir + "/dict"
|
||||
else:
|
||||
m.rule_fars = f"{m.model_dir}/rule.far"
|
||||
|
||||
m.rule_fsts = ",".join(s)
|
||||
|
||||
if "vits-zh-hf" not in m.model_dir:
|
||||
m.rule_fars = f"{m.model_dir}/rule.far"
|
||||
|
||||
all_models = chinese_models + [
|
||||
TtsModel(
|
||||
model_dir="vits-cantonese-hf-xiaomaiiwn",
|
||||
model_name="vits-cantonese-hf-xiaomaiiwn.onnx",
|
||||
lang="cantonese",
|
||||
lang_iso_639_3="yue",
|
||||
rule_fsts="vits-cantonese-hf-xiaomaiiwn/rule.fst",
|
||||
),
|
||||
# English (US)
|
||||
@@ -374,7 +376,6 @@ def main():
|
||||
all_model_list += get_piper_models()
|
||||
all_model_list += get_mimic3_models()
|
||||
all_model_list += get_coqui_models()
|
||||
convert_lang_to_iso_639_3(all_model_list)
|
||||
|
||||
num_models = len(all_model_list)
|
||||
|
||||
|
||||
6
scripts/melo-tts/README.md
Normal file
6
scripts/melo-tts/README.md
Normal file
@@ -0,0 +1,6 @@
|
||||
# Introduction
|
||||
|
||||
Models in this directory are converted from
|
||||
https://github.com/myshell-ai/MeloTTS
|
||||
|
||||
Note there is only a single female speaker in the model.
|
||||
@@ -8,7 +8,6 @@ from melo.text import language_id_map, language_tone_start_map
|
||||
from melo.text.chinese import pinyin_to_symbol_map
|
||||
from melo.text.english import eng_dict, refine_syllables
|
||||
from pypinyin import Style, lazy_pinyin, phrases_dict, pinyin_dict
|
||||
from melo.text.symbols import language_tone_start_map
|
||||
|
||||
for k, v in pinyin_to_symbol_map.items():
|
||||
if isinstance(v, list):
|
||||
@@ -82,6 +81,7 @@ def generate_tokens(symbol_list):
|
||||
def generate_lexicon():
|
||||
word_dict = pinyin_dict.pinyin_dict
|
||||
phrases = phrases_dict.phrases_dict
|
||||
eng_dict["kaldi"] = [["K", "AH0"], ["L", "D", "IH0"]]
|
||||
with open("lexicon.txt", "w", encoding="utf-8") as f:
|
||||
for word in eng_dict:
|
||||
phones, tones = refine_syllables(eng_dict[word])
|
||||
@@ -237,9 +237,11 @@ def main():
|
||||
meta_data = {
|
||||
"model_type": "melo-vits",
|
||||
"comment": "melo",
|
||||
"version": 2,
|
||||
"language": "Chinese + English",
|
||||
"add_blank": int(model.hps.data.add_blank),
|
||||
"n_speakers": 1,
|
||||
"jieba": 1,
|
||||
"sample_rate": model.hps.data.sampling_rate,
|
||||
"bert_dim": 1024,
|
||||
"ja_bert_dim": 768,
|
||||
|
||||
@@ -12,7 +12,7 @@ function install() {
|
||||
cd MeloTTS
|
||||
pip install -r ./requirements.txt
|
||||
|
||||
pip install soundfile onnx onnxruntime
|
||||
pip install soundfile onnx==1.15.0 onnxruntime==1.16.3
|
||||
|
||||
python3 -m unidic download
|
||||
popd
|
||||
|
||||
@@ -135,28 +135,11 @@ class OnnxModel:
|
||||
def main():
|
||||
lexicon = Lexicon(lexion_filename="./lexicon.txt", tokens_filename="./tokens.txt")
|
||||
|
||||
text = "永远相信,美好的事情即将发生。"
|
||||
text = "这是一个使用 next generation kaldi 的 text to speech 中英文例子. Thank you! 你觉得如何呢? are you ok? Fantastic! How about you?"
|
||||
s = jieba.cut(text, HMM=True)
|
||||
|
||||
phones, tones = lexicon.convert(s)
|
||||
|
||||
en_text = "how are you ?".split()
|
||||
|
||||
phones_en, tones_en = lexicon.convert(en_text)
|
||||
phones += [0]
|
||||
tones += [0]
|
||||
|
||||
phones += phones_en
|
||||
tones += tones_en
|
||||
|
||||
text = "多音字测试, 银行,行不行?长沙长大"
|
||||
s = jieba.cut(text, HMM=True)
|
||||
|
||||
phones2, tones2 = lexicon.convert(s)
|
||||
|
||||
phones += phones2
|
||||
tones += tones2
|
||||
|
||||
model = OnnxModel("./model.onnx")
|
||||
|
||||
if model.add_blank:
|
||||
|
||||
Reference in New Issue
Block a user