diff --git a/.github/scripts/test-offline-tts.sh b/.github/scripts/test-offline-tts.sh index baa2b37b..15581eaa 100755 --- a/.github/scripts/test-offline-tts.sh +++ b/.github/scripts/test-offline-tts.sh @@ -43,6 +43,28 @@ for sid in $(seq 0 10); do done rm -rf kokoro-en-v0_19 +log "------------------------------------------------------------" +log "matcha-tts-fa_en-male" +log "------------------------------------------------------------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-tts-fa_en-male.tar.bz2 +tar xvf matcha-tts-fa_en-male.tar.bz2 +rm matcha-tts-fa_en-male.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +$EXE \ + --matcha-acoustic-model=./matcha-tts-fa_en-male/model.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --matcha-tokens=./matcha-tts-fa_en-male/tokens.txt \ + --matcha-data-dir=./matcha-tts-fa_en-male/espeak-ng-data \ + --output-filename=./tts/test-matcha-fa-en-male.wav \ + --num-threads=2 \ + "How are you doing today? این یک نمونه ی تست فارسی است. This is a test." + +rm -rf matcha-tts-fa_en-male +rm hifigan_v2.onnx +ls -lh tts/*.wav + log "------------------------------------------------------------" log "matcha-icefall-en_US-ljspeech" log "------------------------------------------------------------" @@ -64,6 +86,7 @@ $EXE \ rm hifigan_v2.onnx rm -rf matcha-icefall-en_US-ljspeech +ls -lh tts/*.wav log "------------------------------------------------------------" log "matcha-icefall-zh-baker" diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 0650c658..f2fb96e3 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -397,18 +397,28 @@ def get_matcha_models() -> List[TtsModel]: m.dict_dir = m.model_dir + "/dict" m.vocoder = "hifigan_v2.onnx" - english_models = [ + english_persian_models = [ TtsModel( model_dir="matcha-icefall-en_US-ljspeech", acoustic_model_name="model-steps-3.onnx", lang="en", - ) + ), + TtsModel( + model_dir="matcha-tts-fa_en-male", + acoustic_model_name="model.onnx", + lang="fa", + ), + TtsModel( + model_dir="matcha-tts-fa_en-female", + acoustic_model_name="model.onnx", + lang="fa", + ), ] - for m in english_models: + for m in english_persian_models: m.data_dir = f"{m.model_dir}/espeak-ng-data" m.vocoder = "hifigan_v2.onnx" - return chinese_models + english_models + return chinese_models + english_persian_models def get_kokoro_models() -> List[TtsModel]: diff --git a/sherpa-onnx/csrc/offline-tts-matcha-impl.h b/sherpa-onnx/csrc/offline-tts-matcha-impl.h index e717e64f..2299df5a 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-impl.h +++ b/sherpa-onnx/csrc/offline-tts-matcha-impl.h @@ -214,7 +214,7 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { } std::vector token_ids = - frontend_->ConvertTextToTokenIds(text, "en-US"); + frontend_->ConvertTextToTokenIds(text, meta_data.voice); if (token_ids.empty() || (token_ids.size() == 1 && token_ids[0].tokens.empty())) { diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h b/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h index 06e91011..0bd34463 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h +++ b/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h @@ -21,6 +21,8 @@ struct OfflineTtsMatchaModelMetaData { int32_t has_espeak = 0; int32_t use_eos_bos = 0; int32_t pad_id = 0; + + std::string voice; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model.cc b/sherpa-onnx/csrc/offline-tts-matcha-model.cc index afea546d..fccb9012 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-model.cc +++ b/sherpa-onnx/csrc/offline-tts-matcha-model.cc @@ -83,15 +83,32 @@ class OfflineTtsMatchaModel::Impl { Ort::Value sid_tensor = Ort::Value::CreateTensor(memory_info, &sid, 1, &scale_shape, 1); + std::array scales = {noise_scale, length_scale}; + int64_t scales_shape = 2; + + Ort::Value scales_tensor = Ort::Value::CreateTensor( + memory_info, scales.data(), scales.size(), &scales_shape, 1); + std::vector inputs; inputs.reserve(5); inputs.push_back(std::move(x)); inputs.push_back(std::move(x_length)); - inputs.push_back(std::move(noise_scale_tensor)); - inputs.push_back(std::move(length_scale_tensor)); + if (input_names_[2] == "scales") { + // for models from + // https://github.com/shivammehta25/Matcha-TTS + inputs.push_back(std::move(scales_tensor)); + } else { + // for models from icefall + inputs.push_back(std::move(noise_scale_tensor)); + inputs.push_back(std::move(length_scale_tensor)); + } if (input_names_.size() == 5 && input_names_.back() == "sid") { + // for models from icefall inputs.push_back(std::move(sid_tensor)); + + // Note that we have not supported multi-speaker tts models from + // https://github.com/shivammehta25/Matcha-TTS } auto out = @@ -145,6 +162,8 @@ class OfflineTtsMatchaModel::Impl { SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice", + "en-us"); } private: