Support heteronyms in Chinese TTS (#738)

This commit is contained in:
Fangjun Kuang
2024-04-08 11:01:30 +08:00
committed by GitHub
parent c1c0f5bafd
commit a5f8fbc83f
49 changed files with 308 additions and 143 deletions

View File

@@ -56,6 +56,11 @@ sed -i.bak s/"lang = null"/"lang = \"$lang_iso_639_3\""/ ./TtsEngine.kt
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./TtsEngine.kt
{% endif %}
{% if tts_model.rule_fars %}
# Inject the FAR archive list into the Kotlin source.
# The pattern must target the ruleFars placeholder: matching "ruleFsts = null"
# would either find nothing (the ruleFsts substitution above already rewrote
# that line) or overwrite the ruleFsts assignment with a ruleFars one.
# NOTE(review): assumes TtsEngine.kt contains the literal "ruleFars = null",
# in the same way as the ruleFsts/dataDir placeholders -- confirm.
rule_fars={{ tts_model.rule_fars }}
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./TtsEngine.kt
{% endif %}
{% if tts_model.data_dir %}
data_dir={{ tts_model.data_dir }}
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./TtsEngine.kt

View File

@@ -54,6 +54,11 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt
sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt
{% endif %}
{% if tts_model.rule_fars %}
# Inject the FAR archive list into the Kotlin source.
# The pattern must target the ruleFars placeholder: matching "ruleFsts = null"
# would either find nothing (the ruleFsts substitution above already rewrote
# that line) or overwrite the ruleFsts assignment with a ruleFars one.
# NOTE(review): assumes MainActivity.kt contains the literal "ruleFars = null",
# in the same way as the ruleFsts/dataDir placeholders -- confirm.
rule_fars={{ tts_model.rule_fars }}
sed -i.bak s%"ruleFars = null"%"ruleFars = \"$rule_fars\""% ./MainActivity.kt
{% endif %}
{% if tts_model.data_dir %}
data_dir={{ tts_model.data_dir }}
sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt

View File

@@ -33,6 +33,7 @@ class TtsModel:
model_name: str = ""
lang: str = "" # en, zh, fr, de, etc.
rule_fsts: Optional[List[str]] = None
rule_fars: Optional[List[str]] = None
data_dir: Optional[str] = None
is_char: bool = False
lang_iso_639_3: str = ""
@@ -241,98 +242,94 @@ def get_mimic3_models() -> List[TtsModel]:
def get_vits_models() -> List[TtsModel]:
return [
chinese_models = [
# Chinese
TtsModel(
model_dir="vits-icefall-zh-aishell3",
model_name="model.onnx",
lang="zh",
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/rule.fst",
rule_fsts="vits-icefall-zh-aishell3/phone.fst,vits-icefall-zh-aishell3/date.fst,vits-icefall-zh-aishell3/number.fst,vits-icefall-zh-aishell3/new_heteronym.fst",
rule_fars="vits-icefall-zh-aishell3/rule.far",
),
TtsModel(
model_dir="vits-zh-aishell3",
model_name="vits-aishell3.onnx",
lang="zh",
rule_fsts="vits-zh-aishell3/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-doom",
model_name="doom.onnx",
lang="zh",
rule_fsts="vits-zh-hf-doom/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-echo",
model_name="echo.onnx",
lang="zh",
rule_fsts="vits-zh-hf-echo/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-zenyatta",
model_name="zenyatta.onnx",
lang="zh",
rule_fsts="vits-zh-hf-zenyatta/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-abyssinvoker",
model_name="abyssinvoker.onnx",
lang="zh",
rule_fsts="vits-zh-hf-abyssinvoker/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-keqing",
model_name="keqing.onnx",
lang="zh",
rule_fsts="vits-zh-hf-keqing/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-eula",
model_name="eula.onnx",
lang="zh",
rule_fsts="vits-zh-hf-eula/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-bronya",
model_name="bronya.onnx",
lang="zh",
rule_fsts="vits-zh-hf-bronya/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-theresa",
model_name="theresa.onnx",
lang="zh",
rule_fsts="vits-zh-hf-theresa/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-fanchen-wnj",
model_name="vits-zh-hf-fanchen-wnj.onnx",
lang="zh",
rule_fsts="vits-zh-hf-fanchen-wnj/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-fanchen-C",
model_name="vits-zh-hf-fanchen-C.onnx",
lang="zh",
rule_fsts="vits-zh-hf-fanchen-C/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe",
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe.onnx",
lang="zh",
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new",
model_name="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new.onnx",
lang="zh",
rule_fsts="vits-zh-hf-fanchen-ZhiHuiLaoZhe_new/rule.fst",
),
TtsModel(
model_dir="vits-zh-hf-fanchen-unity",
model_name="vits-zh-hf-fanchen-unity.onnx",
lang="zh",
rule_fsts="vits-zh-hf-fanchen-unity/rule.fst",
),
]
rule_fsts = ["phone.fst", "date.fst", "number.fst", "new_heteronym.fst"]
for m in chinese_models:
s = [f"{m.model_dir}/{r}" for r in rule_fsts]
m.rule_fsts = ",".join(s)
m.rule_fars = f"{m.model_dir}/rule.far"
all_models = chinese_models + [
TtsModel(
model_dir="vits-cantonese-hf-xiaomaiiwn",
model_name="vits-cantonese-hf-xiaomaiiwn.onnx",
@@ -346,6 +343,8 @@ def get_vits_models() -> List[TtsModel]:
# fmt: on
]
return all_models
def main():
args = get_args()

View File

@@ -40,6 +40,7 @@ def process_linux(s):
"libpiper_phonemize.so.1",
"libsherpa-onnx-c-api.so",
"libsherpa-onnx-core.so",
"libsherpa-onnx-fstfar.so.7",
"libsherpa-onnx-fst.so.6",
"libsherpa-onnx-kaldifst-core.so",
"libucd.so",
@@ -68,6 +69,7 @@ def process_macos(s):
"libpiper_phonemize.1.dylib",
"libsherpa-onnx-c-api.dylib",
"libsherpa-onnx-core.dylib",
"libsherpa-onnx-fstfar.7.dylib",
"libsherpa-onnx-fst.6.dylib",
"libsherpa-onnx-kaldifst-core.dylib",
"libucd.dylib",
@@ -96,6 +98,7 @@ def process_windows(s, rid):
"piper_phonemize.dll",
"sherpa-onnx-c-api.dll",
"sherpa-onnx-core.dll",
"sherpa-onnx-fstfar.lib",
"sherpa-onnx-fst.lib",
"sherpa-onnx-kaldifst-core.lib",
"ucd.dll",

View File

@@ -67,6 +67,7 @@ namespace SherpaOnnx
Model = new OfflineTtsModelConfig();
RuleFsts = "";
MaxNumSentences = 1;
RuleFars = "";
}
public OfflineTtsModelConfig Model;
@@ -74,6 +75,9 @@ namespace SherpaOnnx
public string RuleFsts;
public int MaxNumSentences;
[MarshalAs(UnmanagedType.LPStr)]
public string RuleFars;
}
public class OfflineTtsGeneratedAudio

View File

@@ -41,6 +41,7 @@ if [ ! -f /tmp/linux/libsherpa-onnx-core.so ]; then
cd ..
rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0
rm -v libsherpa-onnx-fst.so
rm -v libsherpa-onnx-fstfar.so
rm -v libonnxruntime.so
rm -v libcargs.so
rm -rf wheel
@@ -67,6 +68,7 @@ if [ ! -f /tmp/macos/libsherpa-onnx-core.dylib ]; then
rm -v libonnxruntime.dylib
rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib
rm -v libsherpa-onnx-fst.dylib
rm -v libsherpa-onnx-fstfar.dylib
rm -rf wheel
ls -lh
cd ..

View File

@@ -2,5 +2,5 @@
package sherpa_onnx
// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin
// #cgo LDFLAGS: -L ${SRCDIR}/lib/x86_64-apple-darwin -lsherpa-onnx-c-api -lsherpa-onnx-core -lkaldi-native-fbank-core -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fstfar -lsherpa-onnx-fst -lpiper_phonemize -lespeak-ng -lucd -lonnxruntime -Wl,-rpath,${SRCDIR}/lib/x86_64-apple-darwin
import "C"

View File

@@ -554,6 +554,7 @@ type OfflineTtsModelConfig struct {
type OfflineTtsConfig struct {
Model OfflineTtsModelConfig
RuleFsts string
RuleFars string
MaxNumSentences int
}
@@ -583,6 +584,9 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
c.rule_fsts = C.CString(config.RuleFsts)
defer C.free(unsafe.Pointer(c.rule_fsts))
c.rule_fars = C.CString(config.RuleFars)
defer C.free(unsafe.Pointer(c.rule_fars))
c.max_num_sentences = C.int(config.MaxNumSentences)
c.model.vits.model = C.CString(config.Model.Vits.Model)