From c84a8338635c8611826e99d3c27afb3dbcc71a6f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 6 Feb 2025 22:57:13 +0800 Subject: [PATCH] Add C++ and Python API for Kokoro 1.0 multilingual TTS model (#1795) --- .github/scripts/test-python.sh | 24 +- .github/workflows/export-kokoro.yaml | 16 +- .gitignore | 1 + c-api-examples/kws-c-api.c | 13 +- cxx-api-examples/kws-cxx-api.cc | 13 +- python-api-examples/offline-tts-play.py | 51 +- python-api-examples/offline-tts.py | 51 +- scripts/kokoro/v1.0/generate_voices_bin.py | 2 +- sherpa-onnx/csrc/CMakeLists.txt | 1 + sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc | 522 ++++++++++++++++++ sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h | 45 ++ sherpa-onnx/csrc/melo-tts-lexicon.cc | 2 + sherpa-onnx/csrc/melo-tts-lexicon.h | 1 - sherpa-onnx/csrc/offline-tts-frontend.h | 6 + sherpa-onnx/csrc/offline-tts-kokoro-impl.h | 42 ++ .../csrc/offline-tts-kokoro-model-config.cc | 39 ++ .../csrc/offline-tts-kokoro-model-config.h | 12 +- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 9 +- sherpa-onnx/jni/offline-tts.cc | 2 - .../csrc/offline-tts-kokoro-model-config.cc | 6 +- 20 files changed, 819 insertions(+), 39 deletions(-) create mode 100644 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc create mode 100644 sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index 39e6577a..dd4da512 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -267,6 +267,27 @@ log "Offline TTS test" # test waves are saved in ./tts mkdir ./tts +log "kokoro-multi-lang-v1_0 test" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 +tar xf kokoro-multi-lang-v1_0.tar.bz2 +rm kokoro-multi-lang-v1_0.tar.bz2 + +python3 ./python-api-examples/offline-tts.py \ + --debug=1 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=18 \ + --output-filename="./tts/kokoro-18-zh-en.wav" \ + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" + +rm -rf kokoro-multi-lang-v1_0 + log "kokoro-en-v0_19 test" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 @@ -580,13 +601,10 @@ if [[ x$OS != x'windows-latest' ]]; then repo=sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 log "Start testing ${repo}" - pushd $dir curl -LS -O https://github.com/pkufool/keyword-spotting-models/releases/download/v0.1/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz - popd - repo=$dir/$repo ls -lh $repo python3 ./python-api-examples/keyword-spotter.py diff --git a/.github/workflows/export-kokoro.yaml b/.github/workflows/export-kokoro.yaml index adbc8bb1..53b5120c 100644 --- a/.github/workflows/export-kokoro.yaml +++ b/.github/workflows/export-kokoro.yaml @@ -4,7 +4,6 @@ on: push: branches: - export-kokoro - - kokoro-1.0-2 workflow_dispatch: @@ -76,6 +75,14 @@ jobs: if: matrix.version == '1.0' shell: bash run: | + curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 + tar xvf dict.tar.bz2 + rm dict.tar.bz2 + + curl -SL -o date-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst + curl -SL -o number-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst + curl -SL -o phone-zh.fst https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst + src=scripts/kokoro/v1.0 d=kokoro-multi-lang-v1_0 @@ -87,7 +94,12 @@ jobs: cp -v $src/tokens.txt $d/ cp -v $src/lexicon*.txt $d/ cp -v $src/README.md $d/README.md + cp -av dict $d/ + cp -v ./*.fst $d/ ls -lh $d/ + echo "---" + ls -lh $d/dict + tar cjfv $d.tar.bz2 $d rm -rf $d @@ -180,6 +192,8 @@ jobs: cp -v ../scripts/kokoro/v1.0/lexicon*.txt . cp -v ../scripts/kokoro/v1.0/README.md ./README.md cp -v ../LICENSE ./ + cp -av ../dict ./ + cp -v ../*.fst $d/ git lfs track "*.onnx" git add . diff --git a/.gitignore b/.gitignore index 2840d8ff..ea356b06 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,4 @@ kokoro-en-v0_19 lexicon.txt us_gold.json us_silver.json +kokoro-multi-lang-v1_0 diff --git a/c-api-examples/kws-c-api.c b/c-api-examples/kws-c-api.c index 3ac42758..ecd70ccf 100644 --- a/c-api-examples/kws-c-api.c +++ b/c-api-examples/kws-c-api.c @@ -25,27 +25,28 @@ int32_t main() { memset(&config, 0, sizeof(config)); config.model_config.transducer.encoder = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.transducer.decoder = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.transducer.joiner = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.tokens = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt"; + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" + "tokens.txt"; config.model_config.provider = "cpu"; config.model_config.num_threads = 1; config.model_config.debug = 1; config.keywords_file = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/" - "test_keywords.txt"; + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" + "test_wavs/test_keywords.txt"; const SherpaOnnxKeywordSpotter *kws = SherpaOnnxCreateKeywordSpotter(&config); if (!kws) { diff --git a/cxx-api-examples/kws-cxx-api.cc b/cxx-api-examples/kws-cxx-api.cc index cdcb86ba..44f73438 100644 --- a/cxx-api-examples/kws-cxx-api.cc +++ b/cxx-api-examples/kws-cxx-api.cc @@ -24,27 +24,28 @@ int32_t main() { KeywordSpotterConfig config; config.model_config.transducer.encoder = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "encoder-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.transducer.decoder = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "decoder-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.transducer.joiner = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/" + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" "joiner-epoch-12-avg-2-chunk-16-left-64.onnx"; config.model_config.tokens = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt"; + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" + "tokens.txt"; config.model_config.provider = "cpu"; config.model_config.num_threads = 1; config.model_config.debug = 1; config.keywords_file = - "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/" - "test_keywords.txt"; + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01-mobile/" + "test_wavs/test_keywords.txt"; KeywordSpotter kws = KeywordSpotter::Create(config); if (!kws.Get()) { diff --git a/python-api-examples/offline-tts-play.py b/python-api-examples/offline-tts-play.py index 5ece997b..8ec419ac 100755 --- a/python-api-examples/offline-tts-play.py +++ b/python-api-examples/offline-tts-play.py @@ -11,7 +11,7 @@ while the model is still generating. Usage: -Example (1/6) +Example (1/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./generated.wav \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (2/6) +Example (2/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 tar xvf vits-zh-aishell3.tar.bz2 @@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./liubei-21.wav \ "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" -Example (3/6) +Example (3/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 @@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./test-2.wav \ "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" -Example (4/6) +Example (4/7) curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 tar xvf matcha-icefall-zh-baker.tar.bz2 @@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \ --output-filename=./test-matcha.wav \ "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" -Example (5/6) +Example (5/7) curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 @@ -88,7 +88,9 @@ python3 ./python-api-examples/offline-tts-play.py \ --num-threads=2 \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (6/6) +Example (6/7) + +(This version of kokoro supports only English) curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 tar xf kokoro-en-v0_19.tar.bz2 @@ -105,6 +107,27 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename="./kokoro-10.wav" \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." +Example (7/7) + +(This version of kokoro supports English, Chinese, etc.) + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 +tar xf kokoro-multi-lang-v1_0.tar.bz2 +rm kokoro-multi-lang-v1_0.tar.bz2 + +python3 ./python-api-examples/offline-tts-play.py \ + --debug=1 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=18 \ + --output-filename="./kokoro-18-zh-en.wav" \ + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" + You can find more models at https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models @@ -247,6 +270,20 @@ def add_kokoro_args(parser): help="Path to the dict directory of espeak-ng.", ) + parser.add_argument( + "--kokoro-dict-dir", + type=str, + default="", + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro", + ) + + parser.add_argument( + "--kokoro-lexicon", + type=str, + default="", + help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro", + ) + def get_args(): parser = argparse.ArgumentParser( @@ -459,6 +496,8 @@ def main(): voices=args.kokoro_voices, tokens=args.kokoro_tokens, data_dir=args.kokoro_data_dir, + dict_dir=args.kokoro_dict_dir, + lexicon=args.kokoro_lexicon, ), provider=args.provider, debug=args.debug, diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py index aace840f..c4e63b4f 100755 --- a/python-api-examples/offline-tts.py +++ b/python-api-examples/offline-tts.py @@ -12,7 +12,7 @@ generated audio. Usage: -Example (1/6) +Example (1/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 @@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./generated.wav \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (2/6) +Example (2/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2 @@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./liubei-21.wav \ "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334" -Example (3/6) +Example (3/7) wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 @@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./test-2.wav \ "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。" -Example (4/6) +Example (4/7) curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 tar xvf matcha-icefall-zh-baker.tar.bz2 @@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename=./test-matcha.wav \ "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" -Example (5/6) +Example (5/7) curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 @@ -89,7 +89,9 @@ python3 ./python-api-examples/offline-tts.py \ --num-threads=2 \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar." -Example (6/6) +Example (6/7) + +(This version of kokoro supports only English) curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2 tar xf kokoro-en-v0_19.tar.bz2 @@ -106,6 +108,27 @@ python3 ./python-api-examples/offline-tts.py \ --output-filename="./kokoro-10.wav" \ "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar." +Example (7/7) + +(This version of kokoro supports English, Chinese, etc.) + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-multi-lang-v1_0.tar.bz2 +tar xf kokoro-multi-lang-v1_0.tar.bz2 +rm kokoro-multi-lang-v1_0.tar.bz2 + +python3 ./python-api-examples/offline-tts.py \ + --debug=1 \ + --kokoro-model=./kokoro-multi-lang-v1_0/model.onnx \ + --kokoro-voices=./kokoro-multi-lang-v1_0/voices.bin \ + --kokoro-tokens=./kokoro-multi-lang-v1_0/tokens.txt \ + --kokoro-data-dir=./kokoro-multi-lang-v1_0/espeak-ng-data \ + --kokoro-dict-dir=./kokoro-multi-lang-v1_0/dict \ + --kokoro-lexicon=./kokoro-multi-lang-v1_0/lexicon-us-en.txt,./kokoro-multi-lang-v1_0/lexicon-zh.txt \ + --num-threads=2 \ + --sid=18 \ + --output-filename="./kokoro-18-zh-en.wav" \ + "中英文语音合成测试。This is generated by next generation Kaldi using Kokoro without Misaki. 你觉得中英文说的如何呢?" + You can find more models at https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models @@ -234,6 +257,20 @@ def add_kokoro_args(parser): help="Path to the dict directory of espeak-ng.", ) + parser.add_argument( + "--kokoro-dict-dir", + type=str, + default="", + help="Path to the dict directory for models using jieba. Needed only by multilingual kokoro", + ) + + parser.add_argument( + "--kokoro-lexicon", + type=str, + default="", + help="Path to lexicon.txt for kokoro. Needed only by multilingual kokoro", + ) + def get_args(): parser = argparse.ArgumentParser( @@ -342,6 +379,8 @@ def main(): voices=args.kokoro_voices, tokens=args.kokoro_tokens, data_dir=args.kokoro_data_dir, + dict_dir=args.kokoro_dict_dir, + lexicon=args.kokoro_lexicon, ), provider=args.provider, debug=args.debug, diff --git a/scripts/kokoro/v1.0/generate_voices_bin.py b/scripts/kokoro/v1.0/generate_voices_bin.py index 84d1d20d..c89ce243 100755 --- a/scripts/kokoro/v1.0/generate_voices_bin.py +++ b/scripts/kokoro/v1.0/generate_voices_bin.py @@ -71,7 +71,7 @@ def main(): with open("voices.bin", "wb") as f: for _, speaker in id2speaker.items(): m = torch.load( - f"{speaker}.pt", + f"voices/{speaker}.pt", weights_only=True, map_location="cpu", ).numpy() diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index d5303b75..4976f58f 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -153,6 +153,7 @@ if(SHERPA_ONNX_ENABLE_TTS) list(APPEND sources hifigan-vocoder.cc jieba-lexicon.cc + kokoro-multi-lang-lexicon.cc lexicon.cc melo-tts-lexicon.cc offline-tts-character-frontend.cc diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc new file mode 100644 index 00000000..1dab60c4 --- /dev/null +++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc @@ -0,0 +1,522 @@ +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc +// +// Copyright (c) 2025 Xiaomi Corporation + +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" + +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + +#include "cppjieba/Jieba.hpp" +#include "espeak-ng/speak_lib.h" +#include "phoneme_ids.hpp" +#include "phonemize.hpp" +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/symbol-table.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +void CallPhonemizeEspeak(const std::string &text, + piper::eSpeakPhonemeConfig &config, // NOLINT + std::vector> *phonemes); + +static std::wstring ToWideString(const std::string &s) { + // see + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t + std::wstring_convert> converter; + return converter.from_bytes(s); +} + +static std::string ToString(const std::wstring &s) { + // see + // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t + std::wstring_convert> converter; + return converter.to_bytes(s); +} + +class KokoroMultiLangLexicon::Impl { + public: + Impl(const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, bool debug) + : meta_data_(meta_data), debug_(debug) { + InitTokens(tokens); + + InitLexicon(lexicon); + + InitJieba(dict_dir); + + InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc + } + + template + Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, bool debug) + : meta_data_(meta_data), debug_(debug) { + InitTokens(mgr, tokens); + + InitLexicon(mgr, lexicon); + + // we assume you have copied dict_dir and data_dir from assets to some path + InitJieba(dict_dir); + + InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc + } + + std::vector ConvertTextToTokenIds(const std::string &_text) const { + std::string text = ToLowerCase(_text); + if (debug_) { + SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str()); + } + + std::vector> replace_str_pairs = { + {",", ","}, {":", ","}, {"、", ","}, {";", ";"}, {":", ":"}, + {"。", "."}, {"?", "?"}, {"!", "!"}, {"\\s+", " "}, + }; + for (const auto &p : replace_str_pairs) { + std::regex re(p.first); + text = std::regex_replace(text, re, p.second); + } + + if (debug_) { + SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s", + text.c_str()); + } + + // https://en.cppreference.com/w/cpp/regex + // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex + std::string expr = + "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)"; + + auto ws = ToWideString(text); + std::wstring wexpr = ToWideString(expr); + std::wregex we(wexpr); + + auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we); + auto end = std::wsregex_iterator(); + + std::vector ans; + + for (std::wsregex_iterator i = begin; i != end; ++i) { + std::wsmatch match = *i; + std::wstring match_str = match.str(); + auto ms = ToString(match_str); + uint8_t c = reinterpret_cast(ms.data())[0]; + + std::vector> ids_vec; + + if (c < 0x80) { + if (debug_) { + SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str()); + } + ids_vec = ConvertEnglishToTokenIDs(ms); + } else { + if (debug_) { + SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str()); + } + ids_vec = ConvertChineseToTokenIDs(ms); + } + + for (const auto &ids : ids_vec) { + if (ids.size() > 4) { + ans.emplace_back(ids); + } else { + if (ans.empty()) { + ans.emplace_back(ids); + } else { + ans.back().tokens.back() = ids[1]; + ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2, + ids.end()); + } + } + } + } + + if (debug_) { + for (const auto &v : ans) { + std::ostringstream os; + os << "\n"; + std::string sep; + for (auto i : v.tokens) { + os << sep << i; + sep = " "; + } + os << "\n"; + SHERPA_ONNX_LOGE("%s", os.str().c_str()); + } + } + + return ans; + } + + private: + bool IsPunctuation(const std::string &text) const { + if (text == ";" || text == ":" || text == "," || text == "." || + text == "!" || text == "?" || text == "—" || text == "…" || + text == "\"" || text == "(" || text == ")" || text == "“" || + text == "”") { + return true; + } + + return false; + } + + std::vector ConvertWordToIds(const std::string &w) const { + std::vector ans; + if (word2ids_.count(w)) { + ans = word2ids_.at(w); + return ans; + } + + std::vector words = SplitUtf8(w); + for (const auto &word : words) { + if (word2ids_.count(word)) { + auto ids = ConvertWordToIds(word); + ans.insert(ans.end(), ids.begin(), ids.end()); + } else { + SHERPA_ONNX_LOGE("Skip OOV: '%s'", word.c_str()); + } + } + + return ans; + } + + std::vector> ConvertChineseToTokenIDs( + const std::string &text) const { + bool is_hmm = true; + + std::vector words; + jieba_->Cut(text, words, is_hmm); + if (debug_) { + std::ostringstream os; + os << "After jieba processing:\n"; + + std::string sep; + for (const auto &w : words) { + os << sep << w; + sep = "_"; + } + SHERPA_ONNX_LOGE("%s", os.str().c_str()); + } + + std::vector> ans; + std::vector this_sentence; + int32_t max_len = meta_data_.max_token_len; + + this_sentence.push_back(0); + for (const auto &w : words) { + auto ids = ConvertWordToIds(w); + if (this_sentence.size() + ids.size() > max_len - 2) { + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + + this_sentence.push_back(0); + } + + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); + } + + if (this_sentence.size() > 1) { + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + } + + if (debug_) { + for (const auto &v : ans) { + std::ostringstream os; + os << "\n"; + std::string sep; + for (auto i : v) { + os << sep << i; + sep = " "; + } + os << "\n"; + SHERPA_ONNX_LOGE("%s", os.str().c_str()); + } + } + + return ans; + } + + std::vector> ConvertEnglishToTokenIDs( + const std::string &text) const { + std::vector words = SplitUtf8(text); + if (debug_) { + std::ostringstream os; + os << "After splitting to words: "; + std::string sep; + for (const auto &w : words) { + os << sep << w; + sep = "_"; + } + SHERPA_ONNX_LOGE("%s", os.str().c_str()); + } + + std::vector> ans; + int32_t max_len = meta_data_.max_token_len; + std::vector this_sentence; + + int32_t space_id = token2id_.at(" "); + + this_sentence.push_back(0); + + for (const auto &word : words) { + if (IsPunctuation(word)) { + this_sentence.push_back(token2id_.at(word)); + + if (this_sentence.size() > max_len - 2) { + // this sentence is too long, split it + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + + this_sentence.push_back(0); + continue; + } + + if (word == "." || word == "!" || word == "?" || word == ";") { + // Note: You can add more punctuations here to split the text + // into sentences. We just use four here: .!?; + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + + this_sentence.push_back(0); + } + } else if (word2ids_.count(word)) { + const auto &ids = word2ids_.at(word); + if (this_sentence.size() + ids.size() + 3 > max_len - 2) { + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + + this_sentence.push_back(0); + } + + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); + this_sentence.push_back(space_id); + } else { + SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str()); + + piper::eSpeakPhonemeConfig config; + + config.voice = "en-us"; + + std::vector> phonemes; + + CallPhonemizeEspeak(word, config, &phonemes); + // Note phonemes[i] contains a vector of unicode codepoints; + // we need to convert them to utf8 + + std::wstring_convert, char32_t> conv; + + std::vector ids; + for (const auto &v : phonemes) { + for (const auto p : v) { + auto token = conv.to_bytes(p); + if (token2id_.count(token)) { + ids.push_back(token2id_.at(token)); + } else { + SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(), + word.c_str()); + } + } + } + + if (this_sentence.size() + ids.size() + 3 > max_len - 2) { + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + + this_sentence.push_back(0); + } + + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); + this_sentence.push_back(space_id); + } + } + + if (this_sentence.size() > 1) { + this_sentence.push_back(0); + ans.push_back(std::move(this_sentence)); + } + + if (debug_) { + for (const auto &v : ans) { + std::ostringstream os; + os << "\n"; + std::string sep; + for (auto i : v) { + os << sep << i; + sep = " "; + } + os << "\n"; + SHERPA_ONNX_LOGE("%s", os.str().c_str()); + } + } + + return ans; + } + + void InitTokens(const std::string &tokens) { + std::ifstream is(tokens); + InitTokens(is); + } + + template + void InitTokens(Manager *mgr, const std::string &tokens) { + auto buf = ReadFile(mgr, tokens); + + std::istrstream is(buf.data(), buf.size()); + InitTokens(is); + } + + void InitTokens(std::istream &is) { + token2id_ = ReadTokens(is); // defined in ./symbol-table.cc + } + + void InitLexicon(const std::string &lexicon) { + std::vector files; + SplitStringToVector(lexicon, ",", false, &files); + for (const auto &f : files) { + std::ifstream is(f); + InitLexicon(is); + } + } + + template + void InitLexicon(Manager *mgr, const std::string &lexicon) { + std::vector files; + SplitStringToVector(lexicon, ",", false, &files); + for (const auto &f : files) { + auto buf = ReadFile(mgr, f); + + std::istrstream is(buf.data(), buf.size()); + InitLexicon(is); + } + } + + void InitLexicon(std::istream &is) { + std::string word; + std::vector token_list; + std::string token; + + std::string line; + int32_t line_num = 0; + int32_t num_warn = 0; + while (std::getline(is, line)) { + ++line_num; + std::istringstream iss(line); + + token_list.clear(); + iss >> word; + ToLowerCase(&word); + + if (word2ids_.count(word)) { + num_warn += 1; + if (num_warn < 10) { + SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", + word.c_str(), line_num, line.c_str()); + } + continue; + } + + while (iss >> token) { + token_list.push_back(std::move(token)); + } + + std::vector ids = ConvertTokensToIds(token2id_, token_list); + + if (ids.empty()) { + SHERPA_ONNX_LOGE( + "Invalid pronunciation for word '%s' at line %d:%s. Ignore it", + word.c_str(), line_num, line.c_str()); + continue; + } + + word2ids_.insert({std::move(word), std::move(ids)}); + } + } + + void InitJieba(const std::string &dict_dir) { + std::string dict = dict_dir + "/jieba.dict.utf8"; + std::string hmm = dict_dir + "/hmm_model.utf8"; + std::string user_dict = dict_dir + "/user.dict.utf8"; + std::string idf = dict_dir + "/idf.utf8"; + std::string stop_word = dict_dir + "/stop_words.utf8"; + + AssertFileExists(dict); + AssertFileExists(hmm); + AssertFileExists(user_dict); + AssertFileExists(idf); + AssertFileExists(stop_word); + + jieba_ = + std::make_unique(dict, hmm, user_dict, idf, stop_word); + } + + private: + OfflineTtsKokoroModelMetaData meta_data_; + + // word to token IDs + std::unordered_map> word2ids_; + + // tokens.txt is saved in token2id_ + std::unordered_map token2id_; + + std::unique_ptr jieba_; + bool debug_ = false; +}; + +KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default; + +KokoroMultiLangLexicon::KokoroMultiLangLexicon( + const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, bool debug) + : impl_(std::make_unique(tokens, lexicon, dict_dir, data_dir, + meta_data, debug)) {} + +template +KokoroMultiLangLexicon::KokoroMultiLangLexicon( + Manager *mgr, const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, bool debug) + : impl_(std::make_unique(mgr, tokens, lexicon, dict_dir, data_dir, + meta_data, debug)) {} + +std::vector KokoroMultiLangLexicon::ConvertTextToTokenIds( + const std::string &text, const std::string & /*unused_voice = ""*/) const { + return impl_->ConvertTextToTokenIds(text); +} + +#if __ANDROID_API__ >= 9 +template KokoroMultiLangLexicon::KokoroMultiLangLexicon( + AAssetManager *mgr, const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, bool debug); +#endif + +#if __OHOS__ +template KokoroMultiLangLexicon::KokoroMultiLangLexicon( + NativeResourceManager *mgr, const std::string &tokens, + const std::string &lexicon, const std::string &dict_dir, + const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data, + bool debug); +#endif + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h new file mode 100644 index 00000000..db066781 --- /dev/null +++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h @@ -0,0 +1,45 @@ +// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h +// +// Copyright (c) 2025 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ +#define SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ + +#include +#include +#include + +#include "sherpa-onnx/csrc/offline-tts-frontend.h" +#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h" + +namespace sherpa_onnx { + +class KokoroMultiLangLexicon : public OfflineTtsFrontend { + public: + ~KokoroMultiLangLexicon() override; + + KokoroMultiLangLexicon(const std::string &tokens, const std::string &lexicon, + const std::string &dict_dir, + const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, + bool debug); + + template + KokoroMultiLangLexicon(Manager *mgr, const std::string &tokens, + const std::string &lexicon, + const std::string &dict_dir, + const std::string &data_dir, + const OfflineTtsKokoroModelMetaData &meta_data, + bool debug); + + std::vector ConvertTextToTokenIds( + const std::string &text, const std::string &voice = "") const override; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_KOKORO_MULTI_LANG_LEXICON_H_ diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.cc b/sherpa-onnx/csrc/melo-tts-lexicon.cc index ec729cdb..48b854f8 100644 --- a/sherpa-onnx/csrc/melo-tts-lexicon.cc +++ b/sherpa-onnx/csrc/melo-tts-lexicon.cc @@ -6,7 +6,9 @@ #include #include // NOLINT +#include #include +#include #include #if __ANDROID_API__ >= 9 #include "android/asset_manager.h" diff --git a/sherpa-onnx/csrc/melo-tts-lexicon.h b/sherpa-onnx/csrc/melo-tts-lexicon.h index ef7dd029..96b68c7a 100644 --- a/sherpa-onnx/csrc/melo-tts-lexicon.h +++ b/sherpa-onnx/csrc/melo-tts-lexicon.h @@ -7,7 +7,6 @@ #include #include -#include #include #include "sherpa-onnx/csrc/offline-tts-frontend.h" diff --git a/sherpa-onnx/csrc/offline-tts-frontend.h b/sherpa-onnx/csrc/offline-tts-frontend.h index cba50e36..43c4501c 100644 --- a/sherpa-onnx/csrc/offline-tts-frontend.h +++ b/sherpa-onnx/csrc/offline-tts-frontend.h @@ -19,6 +19,9 @@ struct TokenIDs { /*implicit*/ TokenIDs(std::vector tokens) // NOLINT : tokens{std::move(tokens)} {} + /*implicit*/ TokenIDs(const std::vector &tokens) // NOLINT + : tokens{tokens.begin(), tokens.end()} {} + TokenIDs(std::vector tokens, // NOLINT std::vector tones) // NOLINT : tokens{std::move(tokens)}, tones{std::move(tones)} {} @@ -51,6 +54,9 @@ class OfflineTtsFrontend { const std::string &text, const std::string &voice = "") const = 0; }; +// implementation is in ./piper-phonemize-lexicon.cc +void InitEspeak(const std::string &data_dir); + } // namespace sherpa_onnx #endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h index 4c3efbf6..510f031c 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-impl.h +++ b/sherpa-onnx/csrc/offline-tts-kokoro-impl.h @@ -13,6 +13,7 @@ #include "fst/extensions/far/far.h" #include "kaldifst/csrc/kaldi-fst-io.h" #include "kaldifst/csrc/text-normalizer.h" +#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h" #include "sherpa-onnx/csrc/lexicon.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-tts-frontend.h" @@ -314,6 +315,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { template void InitFrontend(Manager *mgr) { const auto &meta_data = model_->GetMetaData(); + + if (meta_data.version >= 2) { + // this is a multi-lingual model, we require that you pass lexicon + // and dict_dir + if (config_.model.kokoro.lexicon.empty() || + config_.model.kokoro.dict_dir.empty()) { + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); + SHERPA_ONNX_LOGE( + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); + SHERPA_ONNX_EXIT(-1); + } + + frontend_ = std::make_unique( + mgr, config_.model.kokoro.tokens, config_.model.kokoro.lexicon, + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir, + meta_data, config_.model.debug); + + return; + } + frontend_ = std::make_unique( mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); @@ -321,7 +343,27 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl { void InitFrontend() { const auto &meta_data = model_->GetMetaData(); + if (meta_data.version >= 2) { + // this is a multi-lingual model, we require that you pass lexicon + // and dict_dir + if (config_.model.kokoro.lexicon.empty() || + config_.model.kokoro.dict_dir.empty()) { + SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version); + SHERPA_ONNX_LOGE( + "You are using a multi-lingual Kokoro model (e.g., Kokoro >= " + "v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir"); + SHERPA_ONNX_EXIT(-1); + } + frontend_ = std::make_unique( + config_.model.kokoro.tokens, config_.model.kokoro.lexicon, + config_.model.kokoro.dict_dir, config_.model.kokoro.data_dir, + meta_data, config_.model.debug); + + return; + } + + // this is for kokoro v0.19, which supports only English frontend_ = std::make_unique( config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data); } diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc index 3eb5ad7e..59645060 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc @@ -8,6 +8,7 @@ #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/text-utils.h" namespace sherpa_onnx { @@ -17,8 +18,16 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) { "Path to voices.bin for Kokoro models"); po->Register("kokoro-tokens", &tokens, "Path to tokens.txt for Kokoro models"); + po->Register( + "kokoro-lexicon", &lexicon, + "Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0" + "You can pass multiple files, separated by ','. Example: " + "./lexicon-us-en.txt,./lexicon-zh.txt"); po->Register("kokoro-data-dir", &data_dir, "Path to the directory containing dict for espeak-ng."); + po->Register("kokoro-dict-dir", &dict_dir, + "Path to the directory containing dict for jieba. " + "Used only for Kokoro >= v1.0"); po->Register("kokoro-length-scale", &length_scale, "Speech speed. Larger->Slower; Smaller->faster."); } @@ -44,6 +53,19 @@ bool OfflineTtsKokoroModelConfig::Validate() const { return false; } + if (!lexicon.empty()) { + std::vector files; + SplitStringToVector(lexicon, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE( + "lexicon '%s' does not exist. Please re-check --kokoro-lexicon", + f.c_str()); + return false; + } + } + } + if (data_dir.empty()) { SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir"); return false; @@ -77,6 +99,21 @@ bool OfflineTtsKokoroModelConfig::Validate() const { return false; } + if (!dict_dir.empty()) { + std::vector required_files = { + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8", + "idf.utf8", "stop_words.utf8", + }; + + for (const auto &f : required_files) { + if (!FileExists(dict_dir + "/" + f)) { + SHERPA_ONNX_LOGE("'%s/%s' does not exist. Please check kokoro-dict-dir", + dict_dir.c_str(), f.c_str()); + return false; + } + } + } + return true; } @@ -87,7 +124,9 @@ std::string OfflineTtsKokoroModelConfig::ToString() const { os << "model=\"" << model << "\", "; os << "voices=\"" << voices << "\", "; os << "tokens=\"" << tokens << "\", "; + os << "lexicon=\"" << lexicon << "\", "; os << "data_dir=\"" << data_dir << "\", "; + os << "dict_dir=\"" << dict_dir << "\", "; os << "length_scale=" << length_scale << ")"; return os.str(); diff --git a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h index a4a68aca..fae17927 100644 --- a/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-kokoro-model-config.h @@ -16,8 +16,14 @@ struct OfflineTtsKokoroModelConfig { std::string voices; std::string tokens; + // Note: You can pass multiple files, separated by ",", to lexicon + // Example: lexicon = "./lexicon-gb-en.txt,./lexicon-zh.txt"; + std::string lexicon; + std::string data_dir; + std::string dict_dir; + // speed = 1 / length_scale float length_scale = 1.0; @@ -26,11 +32,15 @@ struct OfflineTtsKokoroModelConfig { OfflineTtsKokoroModelConfig(const std::string &model, const std::string &voices, const std::string &tokens, - const std::string &data_dir, float length_scale) + const std::string &lexicon, + const std::string &data_dir, + const std::string &dict_dir, float length_scale) : model(model), voices(voices), tokens(tokens), + lexicon(lexicon), data_dir(data_dir), + dict_dir(dict_dir), length_scale(length_scale) {} void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 70ca3775..608a1ccd 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -32,10 +32,9 @@ namespace sherpa_onnx { -static void CallPhonemizeEspeak( - const std::string &text, - piper::eSpeakPhonemeConfig &config, // NOLINT - std::vector> *phonemes) { +void CallPhonemizeEspeak(const std::string &text, + piper::eSpeakPhonemeConfig &config, // NOLINT + std::vector> *phonemes) { static std::mutex espeak_mutex; std::lock_guard lock(espeak_mutex); @@ -245,7 +244,7 @@ static std::vector CoquiPhonemesToIds( return ans; } -static void InitEspeak(const std::string &data_dir) { +void InitEspeak(const std::string &data_dir) { static std::once_flag init_flag; std::call_once(init_flag, [data_dir]() { int32_t result = diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index 6af10788..e80b90a5 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -241,7 +241,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateImpl(JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid, jfloat speed) { const char *p_text = env->GetStringUTFChars(text, nullptr); - SHERPA_ONNX_LOGE("string is: %s", p_text); auto audio = reinterpret_cast(ptr)->Generate( p_text, sid, speed); @@ -267,7 +266,6 @@ Java_com_k2fsa_sherpa_onnx_OfflineTts_generateWithCallbackImpl( JNIEnv *env, jobject /*obj*/, jlong ptr, jstring text, jint sid, jfloat speed, jobject callback) { const char *p_text = env->GetStringUTFChars(text, nullptr); - SHERPA_ONNX_LOGE("string is: %s", p_text); std::function callback_wrapper = [env, callback](const float *samples, int32_t n, diff --git a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc index fbb24db5..d9a00ca4 100644 --- a/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc @@ -16,13 +16,17 @@ void PybindOfflineTtsKokoroModelConfig(py::module *m) { py::class_(*m, "OfflineTtsKokoroModelConfig") .def(py::init<>()) .def(py::init(), py::arg("model"), py::arg("voices"), py::arg("tokens"), - py::arg("data_dir"), py::arg("length_scale") = 1.0) + py::arg("lexicon") = "", py::arg("data_dir"), + py::arg("dict_dir") = "", py::arg("length_scale") = 1.0) .def_readwrite("model", &PyClass::model) .def_readwrite("voices", &PyClass::voices) .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("lexicon", &PyClass::lexicon) .def_readwrite("data_dir", &PyClass::data_dir) + .def_readwrite("dict_dir", &PyClass::dict_dir) .def_readwrite("length_scale", &PyClass::length_scale) .def("__str__", &PyClass::ToString) .def("validate", &PyClass::Validate);