Add C++ support for MatchaTTS models not from icefall. (#1834)
This commit is contained in:
23
.github/scripts/test-offline-tts.sh
vendored
23
.github/scripts/test-offline-tts.sh
vendored
@@ -43,6 +43,28 @@ for sid in $(seq 0 10); do
|
||||
done
|
||||
rm -rf kokoro-en-v0_19
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-tts-fa_en-male"
|
||||
log "------------------------------------------------------------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2
|
||||
tar xvf matcha-tts-fa_en-male.tar.bz2
|
||||
rm matcha-tts-fa_en-male.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||
|
||||
$EXE \
|
||||
--matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \
|
||||
--matcha-vocoder=./hifigan_v2.onnx \
|
||||
--matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \
|
||||
--matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \
|
||||
--output-filename=./tts/test-matcha-fa-en-male.wav \
|
||||
--num-threads=2 \
|
||||
"How are you doing today? این یک نمونه ی تست فارسی است. This is a test."
|
||||
|
||||
rm -rf matcha-tts-fa_en-male
|
||||
rm hifigan_v2.onnx
|
||||
ls -lh tts/*.wav
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-icefall-en_US-ljspeech"
|
||||
log "------------------------------------------------------------"
|
||||
@@ -64,6 +86,7 @@ $EXE \
|
||||
|
||||
rm hifigan_v2.onnx
|
||||
rm -rf matcha-icefall-en_US-ljspeech
|
||||
ls -lh tts/*.wav
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-icefall-zh-baker"
|
||||
|
||||
@@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]:
|
||||
m.dict_dir = m.model_dir + "/dict"
|
||||
m.vocoder = "hifigan_v2.onnx"
|
||||
|
||||
english_models = [
|
||||
english_persian_models = [
|
||||
TtsModel(
|
||||
model_dir="matcha-icefall-en_US-ljspeech",
|
||||
acoustic_model_name="model-steps-3.onnx",
|
||||
lang="en",
|
||||
)
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="matcha-tts-fa_en-male",
|
||||
acoustic_model_name="model.onnx",
|
||||
lang="fa",
|
||||
),
|
||||
TtsModel(
|
||||
model_dir="matcha-tts-fa_en-female",
|
||||
acoustic_model_name="model.onnx",
|
||||
lang="fa",
|
||||
),
|
||||
]
|
||||
for m in english_models:
|
||||
for m in english_persian_models:
|
||||
m.data_dir = f"{m.model_dir}/espeak-ng-data"
|
||||
m.vocoder = "hifigan_v2.onnx"
|
||||
|
||||
return chinese_models + english_models
|
||||
return chinese_models + english_persian_models
|
||||
|
||||
|
||||
def get_kokoro_models() -> List[TtsModel]:
|
||||
|
||||
@@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> token_ids =
|
||||
frontend_->ConvertTextToTokenIds(text, "en-US");
|
||||
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
|
||||
|
||||
if (token_ids.empty() ||
|
||||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
||||
|
||||
@@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData {
|
||||
int32_t has_espeak = 0;
|
||||
int32_t use_eos_bos = 0;
|
||||
int32_t pad_id = 0;
|
||||
|
||||
std::string voice;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl {
|
||||
Ort::Value sid_tensor =
|
||||
Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1);
|
||||
|
||||
std::array<float, 2> scales = {noise_scale, length_scale};
|
||||
int64_t scales_shape = 2;
|
||||
|
||||
Ort::Value scales_tensor = Ort::Value::CreateTensor(
|
||||
memory_info, scales.data(), scales.size(), &scales_shape, 1);
|
||||
|
||||
std::vector<Ort::Value> inputs;
|
||||
inputs.reserve(5);
|
||||
inputs.push_back(std::move(x));
|
||||
inputs.push_back(std::move(x_length));
|
||||
inputs.push_back(std::move(noise_scale_tensor));
|
||||
inputs.push_back(std::move(length_scale_tensor));
|
||||
if (input_names_[2] == "scales") {
|
||||
// for models from
|
||||
// https://github.com/shivammehta25/Matcha-TTS
|
||||
inputs.push_back(std::move(scales_tensor));
|
||||
} else {
|
||||
// for models from icefall
|
||||
inputs.push_back(std::move(noise_scale_tensor));
|
||||
inputs.push_back(std::move(length_scale_tensor));
|
||||
}
|
||||
|
||||
if (input_names_.size() == 5 && input_names_.back() == "sid") {
|
||||
// for models from icefall
|
||||
inputs.push_back(std::move(sid_tensor));
|
||||
|
||||
// Note that we do not yet support multi-speaker TTS models from
|
||||
// https://github.com/shivammehta25/Matcha-TTS
|
||||
}
|
||||
|
||||
auto out =
|
||||
@@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl {
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
|
||||
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
|
||||
"en-us");
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user