diff --git a/.github/scripts/test-offline-tts.sh b/.github/scripts/test-offline-tts.sh index 70fd2247..baa2b37b 100755 --- a/.github/scripts/test-offline-tts.sh +++ b/.github/scripts/test-offline-tts.sh @@ -18,6 +18,31 @@ which $EXE # test waves are saved in ./tts mkdir ./tts +log "------------------------------------------------------------" +log "kokoro-en-v0_19" +log "------------------------------------------------------------" +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +# mapping of sid to voice name +# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam +# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis + +for sid in $(seq 0 10); do + $EXE \ + --debug=1 \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=$sid \ + --output-filename="./tts/kokoro-$sid.wav" \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." 
+done +rm -rf kokoro-en-v0_19 + log "------------------------------------------------------------" log "matcha-icefall-en_US-ljspeech" log "------------------------------------------------------------" diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index 350d9c18..ad037438 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -267,6 +267,25 @@ log "Offline TTS test" # test waves are saved in ./tts mkdir ./tts +log "kokoro-en-v0_19 test" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +python3 ./python-api-examples/offline-tts.py \ + --debug=1 \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=10 \ + --output-filename="./tts/kokoro-10.wav" \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." + +rm -rf kokoro-en-v0_19 + log "matcha-ljspeech-en test" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 diff --git a/python-api-examples/offline-tts-play.py b/python-api-examples/offline-tts-play.py index 09d03dae..5ece997b 100755 --- a/python-api-examples/offline-tts-play.py +++ b/python-api-examples/offline-tts-play.py @@ -11,7 +11,7 @@ while the model is still generating. 
Usage: -Example (1/5) +Example (1/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./generated.wav \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (2/5) +Example (2/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 tar xvf vits-zh-aishell3.tar.bz2 @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./liubei-21.wav \ "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" -Example (3/5) +Example (3/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./test-2.wav \ "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" -Example (4/5) +Example (4/6) curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 tar xvf matcha-icefall-zh-baker.tar.bz2 @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./test-matcha.wav \ "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" -Example (5/5) +Example (5/6) curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 @@ -88,6 +88,22 @@ python3 ./python-api-examples/offline-tts-play.py \ --num-threads=2 \ "Today as always, men fall into two groups: slaves and free men. 
Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." +Example (6/6) + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +python3 ./python-api-examples/offline-tts-play.py \ + --debug=1 \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=10 \ + --output-filename="./kokoro-10.wav" \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." You can find more models at https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models @@ -202,6 +218,36 @@ def add_matcha_args(parser): ) +def add_kokoro_args(parser): + parser.add_argument( + "--kokoro-model", + type=str, + default="", + help="Path to model.onnx for kokoro", + ) + + parser.add_argument( + "--kokoro-voices", + type=str, + default="", + help="Path to voices.bin for kokoro", + ) + + parser.add_argument( + "--kokoro-tokens", + type=str, + default="", + help="Path to tokens.txt for kokoro", + ) + + parser.add_argument( + "--kokoro-data-dir", + type=str, + default="", + help="Path to the dict directory of espeak-ng.", + ) + + def get_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -209,6 +255,7 @@ def get_args(): add_vits_args(parser) add_matcha_args(parser) + add_kokoro_args(parser) parser.add_argument( "--tts-rule-fsts", @@ -407,6 +454,12 @@ def main(): data_dir=args.matcha_data_dir, dict_dir=args.matcha_dict_dir, ), + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig( + model=args.kokoro_model, + voices=args.kokoro_voices, + 
tokens=args.kokoro_tokens, + data_dir=args.kokoro_data_dir, + ), provider=args.provider, debug=args.debug, num_threads=args.num_threads, diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py index 72bf7795..aace840f 100755 --- a/python-api-examples/offline-tts.py +++ b/python-api-examples/offline-tts.py @@ -12,7 +12,7 @@ generated audio. Usage: -Example (1/5) +Example (1/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./generated.wav \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (2/5) +Example (2/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2 @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./liubei-21.wav \ "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" -Example (3/5) +Example (3/6) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./test-2.wav \ "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" -Example (4/5) +Example (4/6) curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 tar xvf matcha-icefall-zh-baker.tar.bz2 @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./test-matcha.wav \ "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" -Example (5/5) +Example (5/6) curl -SL -O 
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 @@ -89,6 +89,23 @@ python3 ./python-api-examples/offline-tts.py \ --num-threads=2 \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." +Example (6/6) + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 +tar xf kokoro-en-v0_19.tar.bz2 +rm kokoro-en-v0_19.tar.bz2 + +python3 ./python-api-examples/offline-tts.py \ + --debug=1 \ + --kokoro-model=./kokoro-en-v0_19/model.onnx \ + --kokoro-voices=./kokoro-en-v0_19/voices.bin \ + --kokoro-tokens=./kokoro-en-v0_19/tokens.txt \ + --kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \ + --num-threads=2 \ + --sid=10 \ + --output-filename="./kokoro-10.wav" \ + "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." 
+ You can find more models at https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models @@ -188,6 +205,36 @@ def add_matcha_args(parser): ) +def add_kokoro_args(parser): + parser.add_argument( + "--kokoro-model", + type=str, + default="", + help="Path to model.onnx for kokoro", + ) + + parser.add_argument( + "--kokoro-voices", + type=str, + default="", + help="Path to voices.bin for kokoro", + ) + + parser.add_argument( + "--kokoro-tokens", + type=str, + default="", + help="Path to tokens.txt for kokoro", + ) + + parser.add_argument( + "--kokoro-data-dir", + type=str, + default="", + help="Path to the dict directory of espeak-ng.", + ) + + def get_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -195,6 +242,7 @@ def get_args(): add_vits_args(parser) add_matcha_args(parser) + add_kokoro_args(parser) parser.add_argument( "--tts-rule-fsts", @@ -206,7 +254,7 @@ def get_args(): parser.add_argument( "--max-num-sentences", type=int, - default=2, + default=1, help="""Max number of sentences in a batch to avoid OOM if the input text is very long. Set it to -1 to process all the sentences in a single batch. 
A smaller value does not mean it is slower compared @@ -289,6 +337,12 @@ def main(): data_dir=args.matcha_data_dir, dict_dir=args.matcha_dict_dir, ), + kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig( + model=args.kokoro_model, + voices=args.kokoro_voices, + tokens=args.kokoro_tokens, + data_dir=args.kokoro_data_dir, + ), provider=args.provider, debug=args.debug, num_threads=args.num_threads, diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index f146b09e..d5303b75 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS) offline-tts-character-frontend.cc offline-tts-frontend.cc offline-tts-impl.cc + offline-tts-kokoro-model-config.cc + offline-tts-kokoro-model.cc offline-tts-matcha-model-config.cc offline-tts-matcha-model.cc offline-tts-model-config.cc diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.h b/sherpa-onnx/csrc/melo-tts-lexicon.h index e91cf33f..ef7dd029 100644 --- a/sherpa-onnx/csrc/melo-tts-lexicon.h +++ b/sherpa-onnx/csrc/melo-tts-lexicon.h @@ -11,7 +11,7 @@ #include #include "sherpa-onnx/csrc/offline-tts-frontend.h" -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/offline-tts-character-frontend.h b/sherpa-onnx/csrc/offline-tts-character-frontend.h index fcd2f6dd..55bf6a70 100644 --- a/sherpa-onnx/csrc/offline-tts-character-frontend.h +++ b/sherpa-onnx/csrc/offline-tts-character-frontend.h @@ -10,7 +10,7 @@ #include #include "sherpa-onnx/csrc/offline-tts-frontend.h" -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/offline-tts-impl.cc b/sherpa-onnx/csrc/offline-tts-impl.cc index 92ccb7fd..199b0f79 100644 --- a/sherpa-onnx/csrc/offline-tts-impl.cc +++ b/sherpa-onnx/csrc/offline-tts-impl.cc @@ 
-16,6 +16,7 @@ #include "rawfile/raw_file_manager.h" #endif +#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h" #include "sherpa-onnx/csrc/offline-tts-matcha-impl.h" #include "sherpa-onnx/csrc/offline-tts-vits-impl.h" @@ -37,8 +38,11 @@ std::unique_ptr OfflineTtsImpl::Create( const OfflineTtsConfig &config) { if (!config.model.vits.model.empty()) { return std::make_unique(config); + } else if (!config.model.matcha.acoustic_model.empty()) { + return std::make_unique(config); } - return std::make_unique(config); + + return std::make_unique(config); } template @@ -46,9 +50,11 @@ std::unique_ptr OfflineTtsImpl::Create( Manager *mgr, const OfflineTtsConfig &config) { if (!config.model.vits.model.empty()) { return std::make_unique(mgr, config); + } else if (!config.model.matcha.acoustic_model.empty()) { + return std::make_unique(mgr, config); } - return std::make_unique(mgr, config); + return std::make_unique(mgr, config); } #if __ANDROID_API__ >= 9 diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h new file mode 100644 index 00000000..4c3efbf6 --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h @@ -0,0 +1,376 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-impl.h +// +// Copyright (c) 2025 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ + +#include +#include +#include +#include +#include + +#include "fst/extensions/far/far.h" +#include "kaldifst/csrc/kaldi-fst-io.h" +#include "kaldifst/csrc/text-normalizer.h" +#include "sherpa-onnx/csrc/lexicon.h" +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/offline-tts-frontend.h" +#include "sherpa-onnx/csrc/offline-tts-impl.h" +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +class OfflineTtsKokoroImpl : 
public OfflineTtsImpl { + public: + explicit OfflineTtsKokoroImpl(const OfflineTtsConfig &config) + : config_(config), + model_(std::make_unique(config.model)) { + InitFrontend(); + + if (!config.rule_fsts.empty()) { + std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + tn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); +#else + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); +#endif + } + tn_list_.push_back(std::make_unique(f)); + } + } + + if (!config.rule_fars.empty()) { + if (config.model.debug) { + SHERPA_ONNX_LOGE("Loading FST archives"); + } + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + + tn_list_.reserve(files.size() + tn_list_.size()); + + for (const auto &f : files) { + if (config.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); +#else + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); +#endif + } + std::unique_ptr> reader( + fst::FarReader::Open(f)); + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + tn_list_.push_back( + std::make_unique(std::move(r))); + } + } + + if (config.model.debug) { + SHERPA_ONNX_LOGE("FST archives loaded!"); + } + } + } + + template + OfflineTtsKokoroImpl(Manager *mgr, const OfflineTtsConfig &config) + : config_(config), + model_(std::make_unique(mgr, config.model)) { + InitFrontend(mgr); + + if (!config.rule_fsts.empty()) { + std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + tn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str()); +#else + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); +#endif + } + auto buf = ReadFile(mgr, f); + std::istrstream is(buf.data(), buf.size()); + tn_list_.push_back(std::make_unique(is)); + 
} + } + + if (!config.rule_fars.empty()) { + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + tn_list_.reserve(files.size() + tn_list_.size()); + + for (const auto &f : files) { + if (config.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str()); +#else + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); +#endif + } + + auto buf = ReadFile(mgr, f); + + std::unique_ptr s( + new std::istrstream(buf.data(), buf.size())); + + std::unique_ptr> reader( + fst::FarReader::Open(std::move(s))); + + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + tn_list_.push_back( + std::make_unique(std::move(r))); + } // for (; !reader->Done(); reader->Next()) + } // for (const auto &f : files) + } // if (!config.rule_fars.empty()) + } + + int32_t SampleRate() const override { + return model_->GetMetaData().sample_rate; + } + + int32_t NumSpeakers() const override { + return model_->GetMetaData().num_speakers; + } + + GeneratedAudio Generate( + const std::string &_text, int64_t sid = 0, float speed = 1.0, + GeneratedAudioCallback callback = nullptr) const override { + const auto &meta_data = model_->GetMetaData(); + int32_t num_speakers = meta_data.num_speakers; + + if (num_speakers == 0 && sid != 0) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "This is a single-speaker model and supports only sid 0. Given sid: " + "%{public}d. sid is ignored", + static_cast(sid)); +#else + SHERPA_ONNX_LOGE( + "This is a single-speaker model and supports only sid 0. Given sid: " + "%d. sid is ignored", + static_cast(sid)); +#endif + } + + if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "This model contains only %{public}d speakers. sid should be in the " + "range [%{public}d, %{public}d]. Given: %{public}d. 
Use sid=0", + num_speakers, 0, num_speakers - 1, static_cast(sid)); +#else + SHERPA_ONNX_LOGE( + "This model contains only %d speakers. sid should be in the range " + "[%d, %d]. Given: %d. Use sid=0", + num_speakers, 0, num_speakers - 1, static_cast(sid)); +#endif + sid = 0; + } + + std::string text = _text; + if (config_.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str()); +#else + SHERPA_ONNX_LOGE("Raw text: %s", text.c_str()); +#endif + } + + if (!tn_list_.empty()) { + for (const auto &tn : tn_list_) { + text = tn->Normalize(text); + if (config_.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str()); +#else + SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str()); +#endif + } + } + } + + std::vector token_ids = + frontend_->ConvertTextToTokenIds(text, "en-us"); + + if (token_ids.empty() || + (token_ids.size() == 1 && token_ids[0].tokens.empty())) { +#if __OHOS__ + SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs", + text.c_str()); +#else + SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str()); +#endif + return {}; + } + + std::vector> x; + + x.reserve(token_ids.size()); + + for (auto &i : token_ids) { + x.push_back(std::move(i.tokens)); + } + + int32_t x_size = static_cast(x.size()); + + if (config_.max_num_sentences != 1) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "max_num_sentences (%{public}d) != 1 is ignored for Kokoro TTS " + "models", + config_.max_num_sentences); +#else + SHERPA_ONNX_LOGE( + "max_num_sentences (%d) != 1 is ignored for Kokoro TTS models", + config_.max_num_sentences); +#endif + } + + // the input text is too long, we process sentences within it in batches + // to avoid OOM. 
Batch size is config_.max_num_sentences + std::vector> batch_x; + + int32_t batch_size = 1; + batch_x.reserve(config_.max_num_sentences); + int32_t num_batches = x_size / batch_size; + + if (config_.model.debug) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "Split it into %{public}d batches. batch size: " + "%{public}d. Number of sentences: %{public}d", + num_batches, batch_size, x_size); +#else + SHERPA_ONNX_LOGE( + "Split it into %d batches. batch size: %d. Number " + "of sentences: %d", + num_batches, batch_size, x_size); +#endif + } + + GeneratedAudio ans; + + int32_t should_continue = 1; + + int32_t k = 0; + + for (int32_t b = 0; b != num_batches && should_continue; ++b) { + batch_x.clear(); + for (int32_t i = 0; i != batch_size; ++i, ++k) { + batch_x.push_back(std::move(x[k])); + } + + auto audio = Process(batch_x, sid, speed); + ans.sample_rate = audio.sample_rate; + ans.samples.insert(ans.samples.end(), audio.samples.begin(), + audio.samples.end()); + if (callback) { + should_continue = callback(audio.samples.data(), audio.samples.size(), + (b + 1) * 1.0 / num_batches); + // Caution(fangjun): audio is freed when the callback returns, so users + // should copy the data if they want to access the data after + // the callback returns to avoid segmentation fault. + } + } + + batch_x.clear(); + while (k < static_cast(x.size()) && should_continue) { + batch_x.push_back(std::move(x[k])); + + ++k; + } + + if (!batch_x.empty()) { + auto audio = Process(batch_x, sid, speed); + ans.sample_rate = audio.sample_rate; + ans.samples.insert(ans.samples.end(), audio.samples.begin(), + audio.samples.end()); + if (callback) { + callback(audio.samples.data(), audio.samples.size(), 1.0); + // Caution(fangjun): audio is freed when the callback returns, so users + // should copy the data if they want to access the data after + // the callback returns to avoid segmentation fault. 
+ } + } + + return ans; + } + + private: + template + void InitFrontend(Manager *mgr) { + const auto &meta_data = model_->GetMetaData(); + frontend_ = std::make_unique( + mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir, + meta_data); + } + + void InitFrontend() { + const auto &meta_data = model_->GetMetaData(); + + frontend_ = std::make_unique( + config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); + } + + GeneratedAudio Process(const std::vector> &tokens, + int32_t sid, float speed) const { + int32_t num_tokens = 0; + for (const auto &k : tokens) { + num_tokens += k.size(); + } + + std::vector x; + x.reserve(num_tokens); + for (const auto &k : tokens) { + x.insert(x.end(), k.begin(), k.end()); + } + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); + + std::array x_shape = {1, static_cast(x.size())}; + Ort::Value x_tensor = Ort::Value::CreateTensor( + memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); + + Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed); + + std::vector audio_shape = + audio.GetTensorTypeAndShapeInfo().GetShape(); + + int64_t total = 1; + // The output shape may be (1, 1, total) or (1, total) or (total,) + for (auto i : audio_shape) { + total *= i; + } + + const float *p = audio.GetTensorData(); + + GeneratedAudio ans; + ans.sample_rate = model_->GetMetaData().sample_rate; + ans.samples = std::vector(p, p + total); + return ans; + } + + private: + OfflineTtsConfig config_; + std::unique_ptr model_; + std::vector> tn_list_; + std::unique_ptr frontend_; +}; + +} // namespace sherpa_onnx +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_ diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc new file mode 100644 index 00000000..3eb5ad7e --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc @@ -0,0 +1,96 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc 
+// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" + +#include + +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" + +namespace sherpa_onnx { + +void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { + po->Register("kokoro-model", &model, "Path to Kokoro model"); + po->Register("kokoro-voices", &voices, + "Path to voices.bin for Kokoro models"); + po->Register("kokoro-tokens", &tokens, + "Path to tokens.txt for Kokoro models"); + po->Register("kokoro-data-dir", &data_dir, + "Path to the directory containing dict for espeak-ng."); + po->Register("kokoro-length-scale", &length_scale, + "Speech speed. Larger->Slower; Smaller->faster."); +} + +bool OfflineTtsKokoroModelConfig::Validate() const { + if (model.empty()) { + SHERPA_ONNX_LOGE("Please provide --kokoro-model"); + return false; + } + + if (!FileExists(model)) { + SHERPA_ONNX_LOGE("--kokoro-model: '%s' does not exist", model.c_str()); + return false; + } + + if (tokens.empty()) { + SHERPA_ONNX_LOGE("Please provide --kokoro-tokens"); + return false; + } + + if (!FileExists(tokens)) { + SHERPA_ONNX_LOGE("--kokoro-tokens: '%s' does not exist", tokens.c_str()); + return false; + } + + if (data_dir.empty()) { + SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir"); + return false; + } + + if (!FileExists(data_dir + "/phontab")) { + SHERPA_ONNX_LOGE( + "'%s/phontab' does not exist. Please check --kokoro-data-dir", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/phonindex")) { + SHERPA_ONNX_LOGE( + "'%s/phonindex' does not exist. Please check --kokoro-data-dir", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/phondata")) { + SHERPA_ONNX_LOGE( + "'%s/phondata' does not exist. Please check --kokoro-data-dir", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/intonations")) { + SHERPA_ONNX_LOGE( + "'%s/intonations' does not exist. 
Please check --kokoro-data-dir", + data_dir.c_str()); + return false; + } + + return true; +} + +std::string OfflineTtsKokoroModelConfig::ToString() const { + std::ostringstream os; + + os << "OfflineTtsKokoroModelConfig("; + os << "model=\"" << model << "\", "; + os << "voices=\"" << voices << "\", "; + os << "tokens=\"" << tokens << "\", "; + os << "data_dir=\"" << data_dir << "\", "; + os << "length_scale=" << length_scale << ")"; + + return os.str(); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h new file mode 100644 index 00000000..a4a68aca --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h @@ -0,0 +1,44 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ + +#include + +#include "sherpa-onnx/csrc/parse-options.h" + +namespace sherpa_onnx { + +struct OfflineTtsKokoroModelConfig { + std::string model; + std::string voices; + std::string tokens; + + std::string data_dir; + + // speed = 1 / length_scale + float length_scale = 1.0; + + OfflineTtsKokoroModelConfig() = default; + + OfflineTtsKokoroModelConfig(const std::string &model, + const std::string &voices, + const std::string &tokens, + const std::string &data_dir, float length_scale) + : model(model), + voices(voices), + tokens(tokens), + data_dir(data_dir), + length_scale(length_scale) {} + + void Register(ParseOptions *po); + bool Validate() const; + + std::string ToString() const; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h new file mode 100644 index 00000000..64b70851 --- /dev/null +++ 
b/sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h @@ -0,0 +1,25 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-model-metadata.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ + +#include +#include + +namespace sherpa_onnx { + +// please refer to +// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py +struct OfflineTtsKokoroModelMetaData { + int32_t sample_rate = 0; + int32_t num_speakers = 0; + int32_t version = 1; + int32_t has_espeak = 1; + int32_t max_token_len = 0; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model.cc new file mode 100644 index 00000000..7f7c9013 --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model.cc @@ -0,0 +1,251 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-model.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h" + +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/session.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +class OfflineTtsKokoroModel::Impl { + public: + explicit Impl(const OfflineTtsModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto model_buf = ReadFile(config.kokoro.model); + auto voices_buf = ReadFile(config.kokoro.voices); + Init(model_buf.data(), model_buf.size(), voices_buf.data(), + voices_buf.size()); + } + + template + Impl(Manager *mgr, const OfflineTtsModelConfig 
&config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto model_buf = ReadFile(mgr, config.kokoro.model); + auto voices_buf = ReadFile(mgr, config.kokoro.voices); + Init(model_buf.data(), model_buf.size(), voices_buf.data(), + voices_buf.size()); + } + + const OfflineTtsKokoroModelMetaData &GetMetaData() const { + return meta_data_; + } + + Ort::Value Run(Ort::Value x, int32_t sid, float speed) { + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); + + std::vector x_shape = x.GetTensorTypeAndShapeInfo().GetShape(); + if (x_shape[0] != 1) { + SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d", + static_cast(x_shape[0])); + SHERPA_ONNX_EXIT(-1); + } + + // there is a 0 at the front and end of x + int32_t len = static_cast(x_shape[1]) - 2; + int32_t num_speakers = meta_data_.num_speakers; + int32_t dim0 = style_dim_[0]; + int32_t dim1 = style_dim_[2]; + if (len >= dim0) { + SHERPA_ONNX_LOGE("Bad things happened! 
%d vs %d", len, dim0); + SHERPA_ONNX_EXIT(-1); + } + + /*const*/ float *p = styles_.data() + sid * dim0 * dim1 + len * dim1; + + std::array style_embedding_shape = {1, dim1}; + Ort::Value style_embedding = Ort::Value::CreateTensor( + memory_info, p, dim1, style_embedding_shape.data(), + style_embedding_shape.size()); + + int64_t speed_shape = 1; + + Ort::Value speed_tensor = + Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1); + + std::array inputs = { + std::move(x), std::move(style_embedding), std::move(speed_tensor)}; + + auto out = + sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), + output_names_ptr_.data(), output_names_ptr_.size()); + + return std::move(out[0]); + } + + private: + void Init(void *model_data, size_t model_data_length, const char *voices_data, + size_t voices_data_length) { + sess_ = std::make_unique(env_, model_data, model_data_length, + sess_opts_); + + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); + + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); + // get meta data + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); + if (config_.debug) { + std::ostringstream os; + os << "---kokoro model---\n"; + PrintModelMetadata(os, meta_data); + + os << "----------input names----------\n"; + int32_t i = 0; + for (const auto &s : input_names_) { + os << i << " " << s << "\n"; + ++i; + } + os << "----------output names----------\n"; + i = 0; + for (const auto &s : output_names_) { + os << i << " " << s << "\n"; + ++i; + } + +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); +#else + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); +#endif + } + + Ort::AllocatorWithDefaultOptions allocator; // used in the macro below + SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate"); + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1); + SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers"); + 
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); + + if (config_.debug) { + std::vector speaker_names; + SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names"); + std::ostringstream os; + os << "\n"; + for (int32_t i = 0; i != speaker_names.size(); ++i) { + os << i << "->" << speaker_names[i] << ", "; + } + os << "\n"; + +#if __OHOS__ + SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str()); +#else + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); +#endif + } + + SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim"); + if (style_dim_.size() != 3) { + SHERPA_ONNX_LOGE("style_dim should be 3-d, given: %d", + static_cast(style_dim_.size())); + SHERPA_ONNX_EXIT(-1); + } + + if (style_dim_[1] != 1) { + SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[1]); + SHERPA_ONNX_EXIT(-1); + } + + int32_t actual_num_floats = voices_data_length / sizeof(float); + int32_t expected_num_floats = + style_dim_[0] * style_dim_[2] * meta_data_.num_speakers; + + if (actual_num_floats != expected_num_floats) { +#if __OHOS__ + SHERPA_ONNX_LOGE( + "Corrupted --kokoro-voices '%{public}s'. Expected #floats: " + "%{public}d, actual: %{public}d", + config_.kokoro.voices.c_str(), expected_num_floats, + actual_num_floats); +#else + SHERPA_ONNX_LOGE( + "Corrupted --kokoro-voices '%s'. 
Expected #floats: %d, actual: %d", + config_.kokoro.voices.c_str(), expected_num_floats, + actual_num_floats); +#endif + + SHERPA_ONNX_EXIT(-1); + } + + styles_ = std::vector( + reinterpret_cast(voices_data), + reinterpret_cast(voices_data) + expected_num_floats); + + meta_data_.max_token_len = style_dim_[0]; + } + + private: + OfflineTtsModelConfig config_; + Ort::Env env_; + Ort::SessionOptions sess_opts_; + Ort::AllocatorWithDefaultOptions allocator_; + + std::unique_ptr sess_; + + std::vector input_names_; + std::vector input_names_ptr_; + + std::vector output_names_; + std::vector output_names_ptr_; + + OfflineTtsKokoroModelMetaData meta_data_; + std::vector style_dim_; + + // (num_speakers, style_dim_[0], style_dim_[2]) + std::vector styles_; +}; + +OfflineTtsKokoroModel::OfflineTtsKokoroModel( + const OfflineTtsModelConfig &config) + : impl_(std::make_unique(config)) {} + +template +OfflineTtsKokoroModel::OfflineTtsKokoroModel( + Manager *mgr, const OfflineTtsModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} + +OfflineTtsKokoroModel::~OfflineTtsKokoroModel() = default; + +const OfflineTtsKokoroModelMetaData &OfflineTtsKokoroModel::GetMetaData() + const { + return impl_->GetMetaData(); +} + +Ort::Value OfflineTtsKokoroModel::Run(Ort::Value x, int64_t sid /*= 0*/, + float speed /*= 1.0*/) const { + return impl_->Run(std::move(x), sid, speed); +} + +#if __ANDROID_API__ >= 9 +template OfflineTtsKokoroModel::OfflineTtsKokoroModel( + AAssetManager *mgr, const OfflineTtsModelConfig &config); +#endif + +#if __OHOS__ +template OfflineTtsKokoroModel::OfflineTtsKokoroModel( + NativeResourceManager *mgr, const OfflineTtsModelConfig &config); +#endif + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model.h b/sherpa-onnx/csrc/offline-tts-kokoro-model.h new file mode 100644 index 00000000..694f27f7 --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model.h @@ -0,0 +1,39 @@ +// sherpa-onnx/csrc/offline-tts-kokoro-model.h +// 
+// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ + +#include +#include + +#include "onnxruntime_cxx_api.h" // NOLINT +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" +#include "sherpa-onnx/csrc/offline-tts-model-config.h" + +namespace sherpa_onnx { + +class OfflineTtsKokoroModel { + public: + ~OfflineTtsKokoroModel(); + + explicit OfflineTtsKokoroModel(const OfflineTtsModelConfig &config); + + template + OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config); + + // Return a float32 tensor containing the mel + // of shape (batch_size, mel_dim, num_frames) + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const; + + const OfflineTtsKokoroModelMetaData &GetMetaData() const; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_ diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h b/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h similarity index 66% rename from sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h rename to sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h index c5cee946..06e91011 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h +++ b/sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h @@ -1,9 +1,9 @@ -// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h +// sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h // // Copyright (c) 2023 Xiaomi Corporation -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ #include #include @@ -25,4 +25,4 @@ struct OfflineTtsMatchaModelMetaData { } // namespace sherpa_onnx -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_ 
+#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model.h b/sherpa-onnx/csrc/offline-tts-matcha-model.h index 5b02ec9b..27ddaffd 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-model.h +++ b/sherpa-onnx/csrc/offline-tts-matcha-model.h @@ -9,7 +9,7 @@ #include #include "onnxruntime_cxx_api.h" // NOLINT -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h" #include "sherpa-onnx/csrc/offline-tts-model-config.h" namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/offline-tts-model-config.cc b/sherpa-onnx/csrc/offline-tts-model-config.cc index 4af179a4..d2153b7e 100644 --- a/sherpa-onnx/csrc/offline-tts-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-model-config.cc @@ -11,6 +11,7 @@ namespace sherpa_onnx { void OfflineTtsModelConfig::Register(ParseOptions *po) { vits.Register(po); matcha.Register(po); + kokoro.Register(po); po->Register("num-threads", &num_threads, "Number of threads to run the neural network"); @@ -32,7 +33,11 @@ bool OfflineTtsModelConfig::Validate() const { return vits.Validate(); } - return matcha.Validate(); + if (!matcha.acoustic_model.empty()) { + return matcha.Validate(); + } + + return kokoro.Validate(); } std::string OfflineTtsModelConfig::ToString() const { @@ -41,6 +46,7 @@ std::string OfflineTtsModelConfig::ToString() const { os << "OfflineTtsModelConfig("; os << "vits=" << vits.ToString() << ", "; os << "matcha=" << matcha.ToString() << ", "; + os << "kokoro=" << kokoro.ToString() << ", "; os << "num_threads=" << num_threads << ", "; os << "debug=" << (debug ? 
"True" : "False") << ", "; os << "provider=\"" << provider << "\")"; diff --git a/sherpa-onnx/csrc/offline-tts-model-config.h b/sherpa-onnx/csrc/offline-tts-model-config.h index 23268696..ce07cbd9 100644 --- a/sherpa-onnx/csrc/offline-tts-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-model-config.h @@ -7,6 +7,7 @@ #include +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" #include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h" #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" #include "sherpa-onnx/csrc/parse-options.h" @@ -16,6 +17,7 @@ namespace sherpa_onnx { struct OfflineTtsModelConfig { OfflineTtsVitsModelConfig vits; OfflineTtsMatchaModelConfig matcha; + OfflineTtsKokoroModelConfig kokoro; int32_t num_threads = 1; bool debug = false; @@ -25,10 +27,12 @@ struct OfflineTtsModelConfig { OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits, const OfflineTtsMatchaModelConfig &matcha, + const OfflineTtsKokoroModelConfig &kokoro, int32_t num_threads, bool debug, const std::string &provider) : vits(vits), matcha(matcha), + kokoro(kokoro), num_threads(num_threads), debug(debug), provider(provider) {} diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h b/sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h similarity index 80% rename from sherpa-onnx/csrc/offline-tts-vits-model-metadata.h rename to sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h index 5ce00d74..3019d17d 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h @@ -1,9 +1,9 @@ -// sherpa-onnx/csrc/offline-tts-vits-model-metadata.h +// sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h // // Copyright (c) 2023 Xiaomi Corporation -#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ -#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ #include #include 
@@ -46,4 +46,4 @@ struct OfflineTtsVitsModelMetaData { } // namespace sherpa_onnx -#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_ +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_ diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.h b/sherpa-onnx/csrc/offline-tts-vits-model.h index 30e4205d..90803005 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model.h @@ -10,7 +10,7 @@ #include "onnxruntime_cxx_api.h" // NOLINT #include "sherpa-onnx/csrc/offline-tts-model-config.h" -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 98226000..ec312d8d 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -155,6 +155,36 @@ static std::vector PiperPhonemesToIdsMatcha( return ans; } +static std::vector> PiperPhonemesToIdsKokoro( + const std::unordered_map &token2id, + const std::vector &phonemes, int32_t max_len) { + std::vector> ans; + + std::vector current; + current.reserve(phonemes.size()); + + for (auto p : phonemes) { + if (token2id.count(p)) { + if (current.size() > max_len - 1) { + current.push_back(0); + ans.push_back(std::move(current)); + + current.reserve(phonemes.size()); + current.push_back(0); + } + + current.push_back(token2id.at(p)); + } else { + SHERPA_ONNX_LOGE("Skip unknown phonemes. 
Unicode codepoint: \\U+%04x.", + static_cast(p)); + } + } + + current.push_back(0); + ans.push_back(std::move(current)); + return ans; +} + static std::vector CoquiPhonemesToIds( const std::unordered_map &token2id, const std::vector &phonemes, @@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( InitEspeak(data_dir); } +PiperPhonemizeLexicon::PiperPhonemizeLexicon( + const std::string &tokens, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data) + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) { + { + std::ifstream is(tokens); + token2id_ = ReadTokens(is); + } + + InitEspeak(data_dir); +} + template PiperPhonemizeLexicon::PiperPhonemizeLexicon( Manager *mgr, const std::string &tokens, const std::string &data_dir, @@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon( InitEspeak(data_dir); } +template +PiperPhonemizeLexicon::PiperPhonemizeLexicon( + Manager *mgr, const std::string &tokens, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data) + : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) { + { + auto buf = ReadFile(mgr, tokens); + std::istrstream is(buf.data(), buf.size()); + token2id_ = ReadTokens(is); + } + + // We should copy the directory of espeak-ng-data from the asset to + // some internal or external storage and then pass the directory to + // data_dir. 
+ InitEspeak(data_dir); +} + std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( const std::string &text, const std::string &voice /*= ""*/) const { if (is_matcha_) { return ConvertTextToTokenIdsMatcha(text, voice); + } else if (is_kokoro_) { + return ConvertTextToTokenIdsKokoro(text, voice); } else { return ConvertTextToTokenIdsVits(text, voice); } @@ -320,6 +381,32 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha( return ans; } +std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro( + const std::string &text, const std::string &voice /*= ""*/) const { + piper::eSpeakPhonemeConfig config; + + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices + // to list available voices + config.voice = voice; // e.g., voice is en-us + + std::vector> phonemes; + + CallPhonemizeEspeak(text, config, &phonemes); + + std::vector ans; + + for (const auto &p : phonemes) { + auto phoneme_ids = + PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len); + + for (auto &ids : phoneme_ids) { + ans.emplace_back(std::move(ids)); + } + } + + return ans; +} + std::vector PiperPhonemizeLexicon::ConvertTextToTokenIdsVits( const std::string &text, const std::string &voice /*= ""*/) const { piper::eSpeakPhonemeConfig config; @@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( template PiperPhonemizeLexicon::PiperPhonemizeLexicon( AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, const OfflineTtsMatchaModelMetaData &matcha_meta_data); + +template PiperPhonemizeLexicon::PiperPhonemizeLexicon( + AAssetManager *mgr, const std::string &tokens, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); #endif #if __OHOS__ @@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon( NativeResourceManager *mgr, const std::string &tokens, const std::string &data_dir, const OfflineTtsMatchaModelMetaData &matcha_meta_data); + +template 
PiperPhonemizeLexicon::PiperPhonemizeLexicon( + NativeResourceManager *mgr, const std::string &tokens, + const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); #endif } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h index f703f0b8..bb8c6e30 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -10,8 +10,9 @@ #include #include "sherpa-onnx/csrc/offline-tts-frontend.h" -#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h" -#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" +#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h" +#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h" namespace sherpa_onnx { @@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, const OfflineTtsMatchaModelMetaData &matcha_meta_data); + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); + template PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, const std::string &data_dir, @@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { const std::string &data_dir, const OfflineTtsMatchaModelMetaData &matcha_meta_data); + template + PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens, + const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &kokoro_meta_data); + std::vector ConvertTextToTokenIds( const std::string &text, const std::string &voice = "") const override; @@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend { std::vector ConvertTextToTokenIdsMatcha( const std::string &text, const std::string &voice = "") const; + std::vector ConvertTextToTokenIdsKokoro( + 
const std::string &text, const std::string &voice = "") const; + private: // map unicode codepoint to an integer ID std::unordered_map token2id_; OfflineTtsVitsModelMetaData vits_meta_data_; OfflineTtsMatchaModelMetaData matcha_meta_data_; + OfflineTtsKokoroModelMetaData kokoro_meta_data_; bool is_matcha_ = false; + bool is_kokoro_ = false; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/CMakeLists.txt b/sherpa-onnx/python/csrc/CMakeLists.txt index 38d32de5..a4c15713 100644 --- a/sherpa-onnx/python/csrc/CMakeLists.txt +++ b/sherpa-onnx/python/csrc/CMakeLists.txt @@ -54,6 +54,7 @@ endif() if(SHERPA_ONNX_ENABLE_TTS) list(APPEND srcs + offline-tts-kokoro-model-config.cc offline-tts-matcha-model-config.cc offline-tts-model-config.cc offline-tts-vits-model-config.cc diff --git a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc new file mode 100644 index 00000000..fbb24db5 --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc @@ -0,0 +1,31 @@ +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" + +#include + +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsKokoroModelConfig(py::module *m) { + using PyClass = OfflineTtsKokoroModelConfig; + + py::class_(*m, "OfflineTtsKokoroModelConfig") + .def(py::init<>()) + .def(py::init(), + py::arg("model"), py::arg("voices"), py::arg("tokens"), + py::arg("data_dir"), py::arg("length_scale") = 1.0) + .def_readwrite("model", &PyClass::model) + .def_readwrite("voices", &PyClass::voices) + .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("data_dir", &PyClass::data_dir) + .def_readwrite("length_scale", &PyClass::length_scale) + .def("__str__", &PyClass::ToString) + .def("validate", &PyClass::Validate); +} + +} // namespace 
sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h new file mode 100644 index 00000000..cc5f517a --- /dev/null +++ b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h @@ -0,0 +1,16 @@ +// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ +#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ + +#include "sherpa-onnx/python/csrc/sherpa-onnx.h" + +namespace sherpa_onnx { + +void PybindOfflineTtsKokoroModelConfig(py::module *m); + +} + +#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_ diff --git a/sherpa-onnx/python/csrc/offline-tts-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-model-config.cc index ed6a6e09..99769957 100644 --- a/sherpa-onnx/python/csrc/offline-tts-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-model-config.cc @@ -7,6 +7,7 @@ #include #include "sherpa-onnx/csrc/offline-tts-model-config.h" +#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h" #include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h" #include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h" @@ -15,20 +16,24 @@ namespace sherpa_onnx { void PybindOfflineTtsModelConfig(py::module *m) { PybindOfflineTtsVitsModelConfig(m); PybindOfflineTtsMatchaModelConfig(m); + PybindOfflineTtsKokoroModelConfig(m); using PyClass = OfflineTtsModelConfig; py::class_(*m, "OfflineTtsModelConfig") .def(py::init<>()) .def(py::init(), py::arg("vits") = OfflineTtsVitsModelConfig{}, py::arg("matcha") = OfflineTtsMatchaModelConfig{}, + py::arg("kokoro") = OfflineTtsKokoroModelConfig{}, py::arg("num_threads") = 1, py::arg("debug") = false, py::arg("provider") = "cpu") .def_readwrite("vits", &PyClass::vits) .def_readwrite("matcha", &PyClass::matcha) + .def_readwrite("kokoro", &PyClass::kokoro) 
.def_readwrite("num_threads", &PyClass::num_threads) .def_readwrite("debug", &PyClass::debug) .def_readwrite("provider", &PyClass::provider) diff --git a/sherpa-onnx/python/sherpa_onnx/__init__.py b/sherpa-onnx/python/sherpa_onnx/__init__.py index 330c8d2d..5eeeffa5 100644 --- a/sherpa-onnx/python/sherpa_onnx/__init__.py +++ b/sherpa-onnx/python/sherpa_onnx/__init__.py @@ -20,6 +20,7 @@ from _sherpa_onnx import ( OfflineStream, OfflineTts, OfflineTtsConfig, + OfflineTtsKokoroModelConfig, OfflineTtsMatchaModelConfig, OfflineTtsModelConfig, OfflineTtsVitsModelConfig,