Add C++ and Python API for Kokoro TTS models. (#1715)
This commit is contained in:
25
.github/scripts/test-offline-tts.sh
vendored
25
.github/scripts/test-offline-tts.sh
vendored
@@ -18,6 +18,31 @@ which $EXE
|
||||
# test waves are saved in ./tts
|
||||
mkdir ./tts
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "kokoro-en-v0_19"
|
||||
log "------------------------------------------------------------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
rm kokoro-en-v0_19.tar.bz2
|
||||
|
||||
# mapping of sid to voice name
|
||||
# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
|
||||
# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis
|
||||
|
||||
for sid in $(seq 0 10); do
|
||||
$EXE \
|
||||
--debug=1 \
|
||||
--kokoro-model=./kokoro-en-v0_19/model.onnx \
|
||||
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
|
||||
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
|
||||
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
|
||||
--num-threads=2 \
|
||||
--sid=$sid \
|
||||
--output-filename="./tts/kokoro-$sid.wav" \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||
done
|
||||
rm -rf kokoro-en-v0_19
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-icefall-en_US-ljspeech"
|
||||
log "------------------------------------------------------------"
|
||||
|
||||
19
.github/scripts/test-python.sh
vendored
19
.github/scripts/test-python.sh
vendored
@@ -267,6 +267,25 @@ log "Offline TTS test"
|
||||
# test waves are saved in ./tts
|
||||
mkdir ./tts
|
||||
|
||||
log "kokoro-en-v0_19 test"
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
rm kokoro-en-v0_19.tar.bz2
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--debug=1 \
|
||||
--kokoro-model=./kokoro-en-v0_19/model.onnx \
|
||||
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
|
||||
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
|
||||
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
|
||||
--num-threads=2 \
|
||||
--sid=10 \
|
||||
--output-filename="./tts/kokoro-10.wav" \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
rm -rf kokoro-en-v0_19
|
||||
|
||||
log "matcha-ljspeech-en test"
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
|
||||
@@ -11,7 +11,7 @@ while the model is still generating.
|
||||
|
||||
Usage:
|
||||
|
||||
Example (1/5)
|
||||
Example (1/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./generated.wav \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (2/5)
|
||||
Example (2/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xvf vits-zh-aishell3.tar.bz2
|
||||
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./liubei-21.wav \
|
||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||
|
||||
Example (3/5)
|
||||
Example (3/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./test-2.wav \
|
||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (4/5)
|
||||
Example (4/6)
|
||||
|
||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||
@@ -71,7 +71,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./test-matcha.wav \
|
||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (5/5)
|
||||
Example (5/6)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
@@ -88,6 +88,22 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--num-threads=2 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (6/6)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
rm kokoro-en-v0_19.tar.bz2
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--debug=1 \
|
||||
--kokoro-model=./kokoro-en-v0_19/model.onnx \
|
||||
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
|
||||
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
|
||||
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
|
||||
--num-threads=2 \
|
||||
--sid=10 \
|
||||
--output-filename="./kokoro-10.wav" \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
You can find more models at
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
@@ -202,6 +218,36 @@ def add_matcha_args(parser):
|
||||
)
|
||||
|
||||
|
||||
def add_kokoro_args(parser):
    """Register the command-line options used to select a Kokoro TTS model.

    Four string options are added to ``parser``, each defaulting to the
    empty string: ``--kokoro-model``, ``--kokoro-voices``,
    ``--kokoro-tokens`` and ``--kokoro-data-dir``.

    Args:
      parser:
        The ``argparse`` parser (or any object exposing ``add_argument``)
        that receives the options.
    """
    # (flag, help text) pairs, registered in this order.
    kokoro_options = (
        ("--kokoro-model", "Path to model.onnx for kokoro"),
        ("--kokoro-voices", "Path to voices.bin for kokoro"),
        ("--kokoro-tokens", "Path to tokens.txt for kokoro"),
        ("--kokoro-data-dir", "Path to the dict directory of espeak-ng."),
    )

    for flag, description in kokoro_options:
        parser.add_argument(flag, type=str, default="", help=description)
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
@@ -209,6 +255,7 @@ def get_args():
|
||||
|
||||
add_vits_args(parser)
|
||||
add_matcha_args(parser)
|
||||
add_kokoro_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
"--tts-rule-fsts",
|
||||
@@ -407,6 +454,12 @@ def main():
|
||||
data_dir=args.matcha_data_dir,
|
||||
dict_dir=args.matcha_dict_dir,
|
||||
),
|
||||
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
|
||||
model=args.kokoro_model,
|
||||
voices=args.kokoro_voices,
|
||||
tokens=args.kokoro_tokens,
|
||||
data_dir=args.kokoro_data_dir,
|
||||
),
|
||||
provider=args.provider,
|
||||
debug=args.debug,
|
||||
num_threads=args.num_threads,
|
||||
|
||||
@@ -12,7 +12,7 @@ generated audio.
|
||||
|
||||
Usage:
|
||||
|
||||
Example (1/5)
|
||||
Example (1/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./generated.wav \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (2/5)
|
||||
Example (2/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./liubei-21.wav \
|
||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||
|
||||
Example (3/5)
|
||||
Example (3/6)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./test-2.wav \
|
||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (4/5)
|
||||
Example (4/6)
|
||||
|
||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||
@@ -72,7 +72,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./test-matcha.wav \
|
||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (5/5)
|
||||
Example (5/6)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
@@ -89,6 +89,23 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--num-threads=2 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (6/6)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
|
||||
tar xf kokoro-en-v0_19.tar.bz2
|
||||
rm kokoro-en-v0_19.tar.bz2
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--debug=1 \
|
||||
--kokoro-model=./kokoro-en-v0_19/model.onnx \
|
||||
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
|
||||
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
|
||||
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
|
||||
--num-threads=2 \
|
||||
--sid=10 \
|
||||
--output-filename="./kokoro-10.wav" \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
You can find more models at
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
|
||||
@@ -188,6 +205,36 @@ def add_matcha_args(parser):
|
||||
)
|
||||
|
||||
|
||||
def add_kokoro_args(parser):
    """Add the Kokoro model options to ``parser``.

    Each option is a string that defaults to ``""``:

      --kokoro-model     model.onnx
      --kokoro-voices    voices.bin
      --kokoro-tokens    tokens.txt
      --kokoro-data-dir  espeak-ng dict directory

    Args:
      parser:
        Object exposing ``add_argument`` (normally an
        ``argparse.ArgumentParser``).
    """
    help_by_flag = {
        "--kokoro-model": "Path to model.onnx for kokoro",
        "--kokoro-voices": "Path to voices.bin for kokoro",
        "--kokoro-tokens": "Path to tokens.txt for kokoro",
        "--kokoro-data-dir": "Path to the dict directory of espeak-ng.",
    }

    # dicts preserve insertion order, so the options are registered in the
    # order listed above.
    for flag, help_text in help_by_flag.items():
        parser.add_argument(flag, type=str, default="", help=help_text)
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
@@ -195,6 +242,7 @@ def get_args():
|
||||
|
||||
add_vits_args(parser)
|
||||
add_matcha_args(parser)
|
||||
add_kokoro_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
"--tts-rule-fsts",
|
||||
@@ -206,7 +254,7 @@ def get_args():
|
||||
parser.add_argument(
|
||||
"--max-num-sentences",
|
||||
type=int,
|
||||
default=2,
|
||||
default=1,
|
||||
help="""Max number of sentences in a batch to avoid OOM if the input
|
||||
text is very long. Set it to -1 to process all the sentences in a
|
||||
single batch. A smaller value does not mean it is slower compared
|
||||
@@ -289,6 +337,12 @@ def main():
|
||||
data_dir=args.matcha_data_dir,
|
||||
dict_dir=args.matcha_dict_dir,
|
||||
),
|
||||
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
|
||||
model=args.kokoro_model,
|
||||
voices=args.kokoro_voices,
|
||||
tokens=args.kokoro_tokens,
|
||||
data_dir=args.kokoro_data_dir,
|
||||
),
|
||||
provider=args.provider,
|
||||
debug=args.debug,
|
||||
num_threads=args.num_threads,
|
||||
|
||||
@@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
|
||||
offline-tts-character-frontend.cc
|
||||
offline-tts-frontend.cc
|
||||
offline-tts-impl.cc
|
||||
offline-tts-kokoro-model-config.cc
|
||||
offline-tts-kokoro-model.cc
|
||||
offline-tts-matcha-model-config.cc
|
||||
offline-tts-matcha-model.cc
|
||||
offline-tts-model-config.cc
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "rawfile/raw_file_manager.h"
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-impl.h"
|
||||
|
||||
@@ -37,8 +38,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
|
||||
const OfflineTtsConfig &config) {
|
||||
if (!config.model.vits.model.empty()) {
|
||||
return std::make_unique<OfflineTtsVitsImpl>(config);
|
||||
} else if (!config.model.matcha.acoustic_model.empty()) {
|
||||
return std::make_unique<OfflineTtsMatchaImpl>(config);
|
||||
}
|
||||
return std::make_unique<OfflineTtsMatchaImpl>(config);
|
||||
|
||||
return std::make_unique<OfflineTtsKokoroImpl>(config);
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
@@ -46,9 +50,11 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
|
||||
Manager *mgr, const OfflineTtsConfig &config) {
|
||||
if (!config.model.vits.model.empty()) {
|
||||
return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
|
||||
} else if (!config.model.matcha.acoustic_model.empty()) {
|
||||
return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
|
||||
}
|
||||
|
||||
return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
|
||||
return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
|
||||
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
376
sherpa-onnx/csrc/offline-tts-kokoro-impl.h
Normal file
376
sherpa-onnx/csrc/offline-tts-kokoro-impl.h
Normal file
@@ -0,0 +1,376 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-impl.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <strstream>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "fst/extensions/far/far.h"
|
||||
#include "kaldifst/csrc/kaldi-fst-io.h"
|
||||
#include "kaldifst/csrc/text-normalizer.h"
|
||||
#include "sherpa-onnx/csrc/lexicon.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-impl.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
public:
|
||||
explicit OfflineTtsKokoroImpl(const OfflineTtsConfig &config)
|
||||
: config_(config),
|
||||
model_(std::make_unique<OfflineTtsKokoroModel>(config.model)) {
|
||||
InitFrontend();
|
||||
|
||||
if (!config.rule_fsts.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fsts, ",", false, &files);
|
||||
tn_list_.reserve(files.size());
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
|
||||
}
|
||||
}
|
||||
|
||||
if (!config.rule_fars.empty()) {
|
||||
if (config.model.debug) {
|
||||
SHERPA_ONNX_LOGE("Loading FST archives");
|
||||
}
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fars, ",", false, &files);
|
||||
|
||||
tn_list_.reserve(files.size() + tn_list_.size());
|
||||
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
|
||||
fst::FarReader<fst::StdArc>::Open(f));
|
||||
for (; !reader->Done(); reader->Next()) {
|
||||
std::unique_ptr<fst::StdConstFst> r(
|
||||
fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
|
||||
|
||||
tn_list_.push_back(
|
||||
std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
|
||||
}
|
||||
}
|
||||
|
||||
if (config.model.debug) {
|
||||
SHERPA_ONNX_LOGE("FST archives loaded!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
OfflineTtsKokoroImpl(Manager *mgr, const OfflineTtsConfig &config)
|
||||
: config_(config),
|
||||
model_(std::make_unique<OfflineTtsKokoroModel>(mgr, config.model)) {
|
||||
InitFrontend(mgr);
|
||||
|
||||
if (!config.rule_fsts.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fsts, ",", false, &files);
|
||||
tn_list_.reserve(files.size());
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule fst: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
auto buf = ReadFile(mgr, f);
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
tn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
|
||||
}
|
||||
}
|
||||
|
||||
if (!config.rule_fars.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(config.rule_fars, ",", false, &files);
|
||||
tn_list_.reserve(files.size() + tn_list_.size());
|
||||
|
||||
for (const auto &f : files) {
|
||||
if (config.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("rule far: %{public}s", f.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
auto buf = ReadFile(mgr, f);
|
||||
|
||||
std::unique_ptr<std::istream> s(
|
||||
new std::istrstream(buf.data(), buf.size()));
|
||||
|
||||
std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
|
||||
fst::FarReader<fst::StdArc>::Open(std::move(s)));
|
||||
|
||||
for (; !reader->Done(); reader->Next()) {
|
||||
std::unique_ptr<fst::StdConstFst> r(
|
||||
fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));
|
||||
|
||||
tn_list_.push_back(
|
||||
std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
|
||||
} // for (; !reader->Done(); reader->Next())
|
||||
} // for (const auto &f : files)
|
||||
} // if (!config.rule_fars.empty())
|
||||
}
|
||||
|
||||
int32_t SampleRate() const override {
|
||||
return model_->GetMetaData().sample_rate;
|
||||
}
|
||||
|
||||
int32_t NumSpeakers() const override {
|
||||
return model_->GetMetaData().num_speakers;
|
||||
}
|
||||
|
||||
GeneratedAudio Generate(
|
||||
const std::string &_text, int64_t sid = 0, float speed = 1.0,
|
||||
GeneratedAudioCallback callback = nullptr) const override {
|
||||
const auto &meta_data = model_->GetMetaData();
|
||||
int32_t num_speakers = meta_data.num_speakers;
|
||||
|
||||
if (num_speakers == 0 && sid != 0) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"This is a single-speaker model and supports only sid 0. Given sid: "
|
||||
"%{public}d. sid is ignored",
|
||||
static_cast<int32_t>(sid));
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"This is a single-speaker model and supports only sid 0. Given sid: "
|
||||
"%d. sid is ignored",
|
||||
static_cast<int32_t>(sid));
|
||||
#endif
|
||||
}
|
||||
|
||||
if (num_speakers != 0 && (sid >= num_speakers || sid < 0)) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"This model contains only %{public}d speakers. sid should be in the "
|
||||
"range [%{public}d, %{public}d]. Given: %{public}d. Use sid=0",
|
||||
num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"This model contains only %d speakers. sid should be in the range "
|
||||
"[%d, %d]. Given: %d. Use sid=0",
|
||||
num_speakers, 0, num_speakers - 1, static_cast<int32_t>(sid));
|
||||
#endif
|
||||
sid = 0;
|
||||
}
|
||||
|
||||
std::string text = _text;
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("Raw text: %{public}s", text.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("Raw text: %s", text.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!tn_list_.empty()) {
|
||||
for (const auto &tn : tn_list_) {
|
||||
text = tn->Normalize(text);
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("After normalizing: %{public}s", text.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("After normalizing: %s", text.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> token_ids =
|
||||
frontend_->ConvertTextToTokenIds(text, "en-us");
|
||||
|
||||
if (token_ids.empty() ||
|
||||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("Failed to convert '%{public}s' to token IDs",
|
||||
text.c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("Failed to convert '%s' to token IDs", text.c_str());
|
||||
#endif
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> x;
|
||||
|
||||
x.reserve(token_ids.size());
|
||||
|
||||
for (auto &i : token_ids) {
|
||||
x.push_back(std::move(i.tokens));
|
||||
}
|
||||
|
||||
int32_t x_size = static_cast<int32_t>(x.size());
|
||||
|
||||
if (config_.max_num_sentences != 1) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"max_num_sentences (%{public}d) != 1 is ignored for Kokoro TTS "
|
||||
"models",
|
||||
config_.max_num_sentences);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"max_num_sentences (%d) != 1 is ignored for Kokoro TTS models",
|
||||
config_.max_num_sentences);
|
||||
#endif
|
||||
}
|
||||
|
||||
// the input text is too long, we process sentences within it in batches
|
||||
// to avoid OOM. Batch size is config_.max_num_sentences
|
||||
std::vector<std::vector<int64_t>> batch_x;
|
||||
|
||||
int32_t batch_size = 1;
|
||||
batch_x.reserve(config_.max_num_sentences);
|
||||
int32_t num_batches = x_size / batch_size;
|
||||
|
||||
if (config_.model.debug) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Split it into %{public}d batches. batch size: "
|
||||
"%{public}d. Number of sentences: %{public}d",
|
||||
num_batches, batch_size, x_size);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Split it into %d batches. batch size: %d. Number "
|
||||
"of sentences: %d",
|
||||
num_batches, batch_size, x_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
GeneratedAudio ans;
|
||||
|
||||
int32_t should_continue = 1;
|
||||
|
||||
int32_t k = 0;
|
||||
|
||||
for (int32_t b = 0; b != num_batches && should_continue; ++b) {
|
||||
batch_x.clear();
|
||||
for (int32_t i = 0; i != batch_size; ++i, ++k) {
|
||||
batch_x.push_back(std::move(x[k]));
|
||||
}
|
||||
|
||||
auto audio = Process(batch_x, sid, speed);
|
||||
ans.sample_rate = audio.sample_rate;
|
||||
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
||||
audio.samples.end());
|
||||
if (callback) {
|
||||
should_continue = callback(audio.samples.data(), audio.samples.size(),
|
||||
(b + 1) * 1.0 / num_batches);
|
||||
// Caution(fangjun): audio is freed when the callback returns, so users
|
||||
// should copy the data if they want to access the data after
|
||||
// the callback returns to avoid segmentation fault.
|
||||
}
|
||||
}
|
||||
|
||||
batch_x.clear();
|
||||
while (k < static_cast<int32_t>(x.size()) && should_continue) {
|
||||
batch_x.push_back(std::move(x[k]));
|
||||
|
||||
++k;
|
||||
}
|
||||
|
||||
if (!batch_x.empty()) {
|
||||
auto audio = Process(batch_x, sid, speed);
|
||||
ans.sample_rate = audio.sample_rate;
|
||||
ans.samples.insert(ans.samples.end(), audio.samples.begin(),
|
||||
audio.samples.end());
|
||||
if (callback) {
|
||||
callback(audio.samples.data(), audio.samples.size(), 1.0);
|
||||
// Caution(fangjun): audio is freed when the callback returns, so users
|
||||
// should copy the data if they want to access the data after
|
||||
// the callback returns to avoid segmentation fault.
|
||||
}
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename Manager>
|
||||
void InitFrontend(Manager *mgr) {
|
||||
const auto &meta_data = model_->GetMetaData();
|
||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||
mgr, config_.model.kokoro.tokens, config_.model.kokoro.data_dir,
|
||||
meta_data);
|
||||
}
|
||||
|
||||
void InitFrontend() {
|
||||
const auto &meta_data = model_->GetMetaData();
|
||||
|
||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||
config_.model.kokoro.tokens, config_.model.kokoro.data_dir, meta_data);
|
||||
}
|
||||
|
||||
GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
|
||||
int32_t sid, float speed) const {
|
||||
int32_t num_tokens = 0;
|
||||
for (const auto &k : tokens) {
|
||||
num_tokens += k.size();
|
||||
}
|
||||
|
||||
std::vector<int64_t> x;
|
||||
x.reserve(num_tokens);
|
||||
for (const auto &k : tokens) {
|
||||
x.insert(x.end(), k.begin(), k.end());
|
||||
}
|
||||
|
||||
auto memory_info =
|
||||
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
|
||||
|
||||
std::array<int64_t, 2> x_shape = {1, static_cast<int32_t>(x.size())};
|
||||
Ort::Value x_tensor = Ort::Value::CreateTensor(
|
||||
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
|
||||
|
||||
Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
|
||||
|
||||
std::vector<int64_t> audio_shape =
|
||||
audio.GetTensorTypeAndShapeInfo().GetShape();
|
||||
|
||||
int64_t total = 1;
|
||||
// The output shape may be (1, 1, total) or (1, total) or (total,)
|
||||
for (auto i : audio_shape) {
|
||||
total *= i;
|
||||
}
|
||||
|
||||
const float *p = audio.GetTensorData<float>();
|
||||
|
||||
GeneratedAudio ans;
|
||||
ans.sample_rate = model_->GetMetaData().sample_rate;
|
||||
ans.samples = std::vector<float>(p, p + total);
|
||||
return ans;
|
||||
}
|
||||
|
||||
private:
|
||||
OfflineTtsConfig config_;
|
||||
std::unique_ptr<OfflineTtsKokoroModel> model_;
|
||||
std::vector<std::unique_ptr<kaldifst::TextNormalizer>> tn_list_;
|
||||
std::unique_ptr<OfflineTtsFrontend> frontend_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_IMPL_H_
|
||||
96
sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
Normal file
96
sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
Normal file
@@ -0,0 +1,96 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the --kokoro-* command-line options with `po`, binding each one
// to the corresponding member of this config.
void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
  ParseOptions &options = *po;

  // Files that make up the model.
  options.Register("kokoro-model", &model, "Path to Kokoro model");
  options.Register("kokoro-voices", &voices,
                   "Path to voices.bin for Kokoro models");
  options.Register("kokoro-tokens", &tokens,
                   "Path to tokens.txt for Kokoro models");
  options.Register("kokoro-data-dir", &data_dir,
                   "Path to the directory containing dict for espeak-ng.");

  // Synthesis parameter.
  options.Register("kokoro-length-scale", &length_scale,
                   "Speech speed. Larger->Slower; Smaller->faster.");
}
|
||||
|
||||
bool OfflineTtsKokoroModelConfig::Validate() const {
|
||||
if (model.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --kokoro-model");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(model)) {
|
||||
SHERPA_ONNX_LOGE("--kokoro-model: '%s' does not exist", model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (tokens.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --kokoro-tokens");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(tokens)) {
|
||||
SHERPA_ONNX_LOGE("--kokoro-tokens: '%s' does not exist", tokens.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (data_dir.empty()) {
|
||||
SHERPA_ONNX_LOGE("Please provide --kokoro-data-dir");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/phontab")) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"'%s/phontab' does not exist. Please check --kokoro-data-dir",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/phonindex")) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"'%s/phonindex' does not exist. Please check --kokoro-data-dir",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/phondata")) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"'%s/phondata' does not exist. Please check --kokoro-data-dir",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(data_dir + "/intonations")) {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"'%s/intonations' does not exist. Please check --kokoro-data-dir",
|
||||
data_dir.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a single-line, human-readable description of this config in the
// form OfflineTtsKokoroModelConfig(model="...", ..., length_scale=...).
std::string OfflineTtsKokoroModelConfig::ToString() const {
  std::ostringstream oss;

  oss << "OfflineTtsKokoroModelConfig("
      << "model=\"" << model << "\", "
      << "voices=\"" << voices << "\", "
      << "tokens=\"" << tokens << "\", "
      << "data_dir=\"" << data_dir << "\", "
      << "length_scale=" << length_scale << ")";

  return oss.str();
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
44
sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
Normal file
44
sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
Normal file
@@ -0,0 +1,44 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-model-config.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/parse-options.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
struct OfflineTtsKokoroModelConfig {
|
||||
std::string model;
|
||||
std::string voices;
|
||||
std::string tokens;
|
||||
|
||||
std::string data_dir;
|
||||
|
||||
// speed = 1 / length_scale
|
||||
float length_scale = 1.0;
|
||||
|
||||
OfflineTtsKokoroModelConfig() = default;
|
||||
|
||||
OfflineTtsKokoroModelConfig(const std::string &model,
|
||||
const std::string &voices,
|
||||
const std::string &tokens,
|
||||
const std::string &data_dir, float length_scale)
|
||||
: model(model),
|
||||
voices(voices),
|
||||
tokens(tokens),
|
||||
data_dir(data_dir),
|
||||
length_scale(length_scale) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
25
sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
Normal file
25
sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
Normal file
@@ -0,0 +1,25 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// please refer to
|
||||
// https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/kokoro/add-meta-data.py
|
||||
// Values describing a Kokoro model. Most of them are filled in from the
// metadata stored inside the ONNX file when the model is loaded.
struct OfflineTtsKokoroModelMetaData {
  // Sampling rate, read from the "sample_rate" metadata entry.
  int32_t sample_rate{0};

  // Number of voices, read from the "n_speakers" metadata entry.
  int32_t num_speakers{0};

  // Model version, read from the "version" metadata entry; defaults to 1.
  int32_t version{1};

  // Read from the "has_espeak" metadata entry.
  int32_t has_espeak{1};

  // Upper bound on the number of tokens per model invocation; set by the
  // model loader from the first entry of the "style_dim" metadata.
  int32_t max_token_len{0};
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_META_DATA_H_
|
||||
251
sherpa-onnx/csrc/offline-tts-kokoro-model.cc
Normal file
251
sherpa-onnx/csrc/offline-tts-kokoro-model.cc
Normal file
@@ -0,0 +1,251 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-model.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#if __OHOS__
|
||||
#include "rawfile/raw_file_manager.h"
|
||||
#endif
|
||||
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/session.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class OfflineTtsKokoroModel::Impl {
|
||||
public:
|
||||
explicit Impl(const OfflineTtsModelConfig &config)
|
||||
: config_(config),
|
||||
env_(ORT_LOGGING_LEVEL_ERROR),
|
||||
sess_opts_(GetSessionOptions(config)),
|
||||
allocator_{} {
|
||||
auto model_buf = ReadFile(config.kokoro.model);
|
||||
auto voices_buf = ReadFile(config.kokoro.voices);
|
||||
Init(model_buf.data(), model_buf.size(), voices_buf.data(),
|
||||
voices_buf.size());
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
Impl(Manager *mgr, const OfflineTtsModelConfig &config)
|
||||
: config_(config),
|
||||
env_(ORT_LOGGING_LEVEL_ERROR),
|
||||
sess_opts_(GetSessionOptions(config)),
|
||||
allocator_{} {
|
||||
auto model_buf = ReadFile(mgr, config.kokoro.model);
|
||||
auto voices_buf = ReadFile(mgr, config.kokoro.voices);
|
||||
Init(model_buf.data(), model_buf.size(), voices_buf.data(),
|
||||
voices_buf.size());
|
||||
}
|
||||
|
||||
// Read-only access to the metadata collected by Init().
const OfflineTtsKokoroModelMetaData &GetMetaData() const { return meta_data_; }
|
||||
|
||||
// Runs the model on a single utterance and returns its first output tensor.
//
// @param x     Token tensor of shape (1, num_tokens). The sequence is
//              expected to carry a padding 0 at the front and at the end.
// @param sid   Speaker (voice) index; must be in [0, num_speakers).
// @param speed Passed to the model as a 1-element tensor.
//
// The style embedding fed to the model is the row of styles_ selected by the
// speaker and by the number of non-padding tokens.
//
// Any invalid input (wrong rank, batch size != 1, bad sid, too many tokens)
// is logged and terminates the process via SHERPA_ONNX_EXIT.
Ort::Value Run(Ort::Value x, int32_t sid, float speed) {
  auto memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

  std::vector<int64_t> x_shape = x.GetTensorTypeAndShapeInfo().GetShape();
  if (x_shape.size() != 2) {
    SHERPA_ONNX_LOGE("Expect a 2-d input tensor. Given: %d-d",
                     static_cast<int32_t>(x_shape.size()));
    SHERPA_ONNX_EXIT(-1);
  }

  if (x_shape[0] != 1) {
    SHERPA_ONNX_LOGE("Support only batch_size == 1. Given: %d",
                     static_cast<int32_t>(x_shape[0]));
    SHERPA_ONNX_EXIT(-1);
  }

  // Without this check an out-of-range sid would read outside of styles_.
  int32_t num_speakers = meta_data_.num_speakers;
  if (sid < 0 || sid >= num_speakers) {
    SHERPA_ONNX_LOGE("sid should be in the range [0, %d). Given: %d",
                     num_speakers, sid);
    SHERPA_ONNX_EXIT(-1);
  }

  // there is a 0 at the front and end of x
  //
  // Clamp to 0 so that an input with fewer than 2 entries cannot produce a
  // negative row index.
  int32_t len = std::max(static_cast<int32_t>(x_shape[1]) - 2, 0);
  int32_t dim0 = style_dim_[0];
  int32_t dim1 = style_dim_[2];
  if (len >= dim0) {
    SHERPA_ONNX_LOGE("Bad things happened! %d vs %d", len, dim0);
    SHERPA_ONNX_EXIT(-1);
  }

  // styles_ is laid out as (num_speakers, dim0, dim1); pick row `len` of
  // speaker `sid`. The offset is computed in 64 bits to avoid overflow.
  const int64_t offset = (static_cast<int64_t>(sid) * dim0 + len) * dim1;

  // CreateTensor() requires a non-const pointer.
  /*const*/ float *p = styles_.data() + offset;

  std::array<int64_t, 2> style_embedding_shape = {1, dim1};
  Ort::Value style_embedding = Ort::Value::CreateTensor(
      memory_info, p, dim1, style_embedding_shape.data(),
      style_embedding_shape.size());

  int64_t speed_shape = 1;

  // The tensor only references `speed`, which stays alive until we return.
  Ort::Value speed_tensor =
      Ort::Value::CreateTensor(memory_info, &speed, 1, &speed_shape, 1);

  std::array<Ort::Value, 3> inputs = {
      std::move(x), std::move(style_embedding), std::move(speed_tensor)};

  auto out =
      sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
                 output_names_ptr_.data(), output_names_ptr_.size());

  return std::move(out[0]);
}
|
||||
|
||||
private:
|
||||
void Init(void *model_data, size_t model_data_length, const char *voices_data,
|
||||
size_t voices_data_length) {
|
||||
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
|
||||
sess_opts_);
|
||||
|
||||
GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
|
||||
|
||||
GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
|
||||
// get meta data
|
||||
Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
|
||||
if (config_.debug) {
|
||||
std::ostringstream os;
|
||||
os << "---kokoro model---\n";
|
||||
PrintModelMetadata(os, meta_data);
|
||||
|
||||
os << "----------input names----------\n";
|
||||
int32_t i = 0;
|
||||
for (const auto &s : input_names_) {
|
||||
os << i << " " << s << "\n";
|
||||
++i;
|
||||
}
|
||||
os << "----------output names----------\n";
|
||||
i = 0;
|
||||
for (const auto &s : output_names_) {
|
||||
os << i << " " << s << "\n";
|
||||
++i;
|
||||
}
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.sample_rate, "sample_rate");
|
||||
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
|
||||
|
||||
if (config_.debug) {
|
||||
std::vector<std::string> speaker_names;
|
||||
SHERPA_ONNX_READ_META_DATA_VEC_STRING(speaker_names, "speaker_names");
|
||||
std::ostringstream os;
|
||||
os << "\n";
|
||||
for (int32_t i = 0; i != speaker_names.size(); ++i) {
|
||||
os << i << "->" << speaker_names[i] << ", ";
|
||||
}
|
||||
os << "\n";
|
||||
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE("%{public}s\n", os.str().c_str());
|
||||
#else
|
||||
SHERPA_ONNX_LOGE("%s\n", os.str().c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
SHERPA_ONNX_READ_META_DATA_VEC(style_dim_, "style_dim");
|
||||
if (style_dim_.size() != 3) {
|
||||
SHERPA_ONNX_LOGE("style_dim should be 3-d, given: %d",
|
||||
static_cast<int32_t>(style_dim_.size()));
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
if (style_dim_[1] != 1) {
|
||||
SHERPA_ONNX_LOGE("style_dim[0] should be 1, given: %d", style_dim_[1]);
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
int32_t actual_num_floats = voices_data_length / sizeof(float);
|
||||
int32_t expected_num_floats =
|
||||
style_dim_[0] * style_dim_[2] * meta_data_.num_speakers;
|
||||
|
||||
if (actual_num_floats != expected_num_floats) {
|
||||
#if __OHOS__
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Corrupted --kokoro-voices '%{public}s'. Expected #floats: "
|
||||
"%{public}d, actual: %{public}d",
|
||||
config_.kokoro.voices.c_str(), expected_num_floats,
|
||||
actual_num_floats);
|
||||
#else
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Corrupted --kokoro-voices '%s'. Expected #floats: %d, actual: %d",
|
||||
config_.kokoro.voices.c_str(), expected_num_floats,
|
||||
actual_num_floats);
|
||||
#endif
|
||||
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
styles_ = std::vector<float>(
|
||||
reinterpret_cast<const float *>(voices_data),
|
||||
reinterpret_cast<const float *>(voices_data) + expected_num_floats);
|
||||
|
||||
meta_data_.max_token_len = style_dim_[0];
|
||||
}
|
||||
|
||||
private:
|
||||
OfflineTtsModelConfig config_;
|
||||
Ort::Env env_;
|
||||
Ort::SessionOptions sess_opts_;
|
||||
Ort::AllocatorWithDefaultOptions allocator_;
|
||||
|
||||
std::unique_ptr<Ort::Session> sess_;
|
||||
|
||||
std::vector<std::string> input_names_;
|
||||
std::vector<const char *> input_names_ptr_;
|
||||
|
||||
std::vector<std::string> output_names_;
|
||||
std::vector<const char *> output_names_ptr_;
|
||||
|
||||
OfflineTtsKokoroModelMetaData meta_data_;
|
||||
std::vector<int32_t> style_dim_;
|
||||
|
||||
// (num_speakers, style_dim_[0], style_dim_[2])
|
||||
std::vector<float> styles_;
|
||||
};
|
||||
|
||||
OfflineTtsKokoroModel::OfflineTtsKokoroModel(
|
||||
const OfflineTtsModelConfig &config)
|
||||
: impl_(std::make_unique<Impl>(config)) {}
|
||||
|
||||
template <typename Manager>
|
||||
OfflineTtsKokoroModel::OfflineTtsKokoroModel(
|
||||
Manager *mgr, const OfflineTtsModelConfig &config)
|
||||
: impl_(std::make_unique<Impl>(mgr, config)) {}
|
||||
|
||||
// Defined out of line, in the translation unit where Impl is a complete
// type, because the class owns its implementation via std::unique_ptr<Impl>.
OfflineTtsKokoroModel::~OfflineTtsKokoroModel() = default;
|
||||
|
||||
// Returns the metadata held by the implementation object.
const OfflineTtsKokoroModelMetaData &OfflineTtsKokoroModel::GetMetaData()
    const {
  const auto &meta_data = impl_->GetMetaData();
  return meta_data;
}
|
||||
|
||||
// Forwards to the implementation object. Note that the implementation takes
// a 32-bit speaker ID, so sid is narrowed from int64_t to int32_t here.
Ort::Value OfflineTtsKokoroModel::Run(Ort::Value x, int64_t sid /*= 0*/,
                                      float speed /*= 1.0*/) const {
  const auto speaker_id = static_cast<int32_t>(sid);
  return impl_->Run(std::move(x), speaker_id, speed);
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
template OfflineTtsKokoroModel::OfflineTtsKokoroModel(
|
||||
AAssetManager *mgr, const OfflineTtsModelConfig &config);
|
||||
#endif
|
||||
|
||||
#if __OHOS__
|
||||
template OfflineTtsKokoroModel::OfflineTtsKokoroModel(
|
||||
NativeResourceManager *mgr, const OfflineTtsModelConfig &config);
|
||||
#endif
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
39
sherpa-onnx/csrc/offline-tts-kokoro-model.h
Normal file
39
sherpa-onnx/csrc/offline-tts-kokoro-model.h
Normal file
@@ -0,0 +1,39 @@
|
||||
// sherpa-onnx/csrc/offline-tts-kokoro-model.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "onnxruntime_cxx_api.h" // NOLINT
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
class OfflineTtsKokoroModel {
|
||||
public:
|
||||
~OfflineTtsKokoroModel();
|
||||
|
||||
explicit OfflineTtsKokoroModel(const OfflineTtsModelConfig &config);
|
||||
|
||||
template <typename Manager>
|
||||
OfflineTtsKokoroModel(Manager *mgr, const OfflineTtsModelConfig &config);
|
||||
|
||||
// Return a float32 tensor containing the mel
|
||||
// of shape (batch_size, mel_dim, num_frames)
|
||||
Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0) const;
|
||||
|
||||
const OfflineTtsKokoroModelMetaData &GetMetaData() const;
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_KOKORO_MODEL_H_
|
||||
@@ -1,9 +1,9 @@
|
||||
// sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
|
||||
// sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
@@ -25,4 +25,4 @@ struct OfflineTtsMatchaModelMetaData {
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_METADATA_H_
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_MATCHA_MODEL_META_DATA_H_
|
||||
@@ -9,7 +9,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "onnxruntime_cxx_api.h" // NOLINT
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
@@ -11,6 +11,7 @@ namespace sherpa_onnx {
|
||||
void OfflineTtsModelConfig::Register(ParseOptions *po) {
|
||||
vits.Register(po);
|
||||
matcha.Register(po);
|
||||
kokoro.Register(po);
|
||||
|
||||
po->Register("num-threads", &num_threads,
|
||||
"Number of threads to run the neural network");
|
||||
@@ -32,7 +33,11 @@ bool OfflineTtsModelConfig::Validate() const {
|
||||
return vits.Validate();
|
||||
}
|
||||
|
||||
return matcha.Validate();
|
||||
if (!matcha.acoustic_model.empty()) {
|
||||
return matcha.Validate();
|
||||
}
|
||||
|
||||
return kokoro.Validate();
|
||||
}
|
||||
|
||||
std::string OfflineTtsModelConfig::ToString() const {
|
||||
@@ -41,6 +46,7 @@ std::string OfflineTtsModelConfig::ToString() const {
|
||||
os << "OfflineTtsModelConfig(";
|
||||
os << "vits=" << vits.ToString() << ", ";
|
||||
os << "matcha=" << matcha.ToString() << ", ";
|
||||
os << "kokoro=" << kokoro.ToString() << ", ";
|
||||
os << "num_threads=" << num_threads << ", ";
|
||||
os << "debug=" << (debug ? "True" : "False") << ", ";
|
||||
os << "provider=\"" << provider << "\")";
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-config.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-config.h"
|
||||
#include "sherpa-onnx/csrc/parse-options.h"
|
||||
@@ -16,6 +17,7 @@ namespace sherpa_onnx {
|
||||
struct OfflineTtsModelConfig {
|
||||
OfflineTtsVitsModelConfig vits;
|
||||
OfflineTtsMatchaModelConfig matcha;
|
||||
OfflineTtsKokoroModelConfig kokoro;
|
||||
|
||||
int32_t num_threads = 1;
|
||||
bool debug = false;
|
||||
@@ -25,10 +27,12 @@ struct OfflineTtsModelConfig {
|
||||
|
||||
OfflineTtsModelConfig(const OfflineTtsVitsModelConfig &vits,
|
||||
const OfflineTtsMatchaModelConfig &matcha,
|
||||
const OfflineTtsKokoroModelConfig &kokoro,
|
||||
int32_t num_threads, bool debug,
|
||||
const std::string &provider)
|
||||
: vits(vits),
|
||||
matcha(matcha),
|
||||
kokoro(kokoro),
|
||||
num_threads(num_threads),
|
||||
debug(debug),
|
||||
provider(provider) {}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
// sherpa-onnx/csrc/offline-tts-vits-model-metadata.h
|
||||
// sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h
|
||||
//
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
|
||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
@@ -46,4 +46,4 @@ struct OfflineTtsVitsModelMetaData {
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_METADATA_H_
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_VITS_MODEL_META_DATA_H_
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#include "onnxruntime_cxx_api.h" // NOLINT
|
||||
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
|
||||
@@ -155,6 +155,36 @@ static std::vector<int64_t> PiperPhonemesToIdsMatcha(
|
||||
return ans;
|
||||
}
|
||||
|
||||
static std::vector<std::vector<int64_t>> PiperPhonemesToIdsKokoro(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes, int32_t max_len) {
|
||||
std::vector<std::vector<int64_t>> ans;
|
||||
|
||||
std::vector<int64_t> current;
|
||||
current.reserve(phonemes.size());
|
||||
|
||||
for (auto p : phonemes) {
|
||||
if (token2id.count(p)) {
|
||||
if (current.size() > max_len - 1) {
|
||||
current.push_back(0);
|
||||
ans.push_back(std::move(current));
|
||||
|
||||
current.reserve(phonemes.size());
|
||||
current.push_back(0);
|
||||
}
|
||||
|
||||
current.push_back(token2id.at(p));
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
|
||||
static_cast<uint32_t>(p));
|
||||
}
|
||||
}
|
||||
|
||||
current.push_back(0);
|
||||
ans.push_back(std::move(current));
|
||||
return ans;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> CoquiPhonemesToIds(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes,
|
||||
@@ -269,6 +299,18 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
InitEspeak(data_dir);
|
||||
}
|
||||
|
||||
// Kokoro flavour of the frontend: remembers the Kokoro metadata, loads the
// token table from the file `tokens` and initializes espeak with data_dir.
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    const std::string &tokens, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &kokoro_meta_data)
    : kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) {
  {
    // The extra scope destroys the stream before espeak is initialized.
    std::ifstream token_stream(tokens);
    token2id_ = ReadTokens(token_stream);
  }

  InitEspeak(data_dir);
}
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
Manager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
@@ -286,10 +328,29 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
InitEspeak(data_dir);
|
||||
}
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
Manager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsKokoroModelMetaData &kokoro_meta_data)
|
||||
: kokoro_meta_data_(kokoro_meta_data), is_kokoro_(true) {
|
||||
{
|
||||
auto buf = ReadFile(mgr, tokens);
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
token2id_ = ReadTokens(is);
|
||||
}
|
||||
|
||||
// We should copy the directory of espeak-ng-data from the asset to
|
||||
// some internal or external storage and then pass the directory to
|
||||
// data_dir.
|
||||
InitEspeak(data_dir);
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
if (is_matcha_) {
|
||||
return ConvertTextToTokenIdsMatcha(text, voice);
|
||||
} else if (is_kokoro_) {
|
||||
return ConvertTextToTokenIdsKokoro(text, voice);
|
||||
} else {
|
||||
return ConvertTextToTokenIdsVits(text, voice);
|
||||
}
|
||||
@@ -320,6 +381,32 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
// to list available voices
|
||||
config.voice = voice; // e.g., voice is en-us
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
|
||||
CallPhonemizeEspeak(text, config, &phonemes);
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
|
||||
for (const auto &p : phonemes) {
|
||||
auto phoneme_ids =
|
||||
PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
|
||||
|
||||
for (auto &ids : phoneme_ids) {
|
||||
ans.emplace_back(std::move(ids));
|
||||
}
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
@@ -363,6 +450,10 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
|
||||
#endif
|
||||
|
||||
#if __OHOS__
|
||||
@@ -375,6 +466,11 @@ template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
NativeResourceManager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
NativeResourceManager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
|
||||
#endif
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -10,8 +10,9 @@
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-meta-data.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-meta-data.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
@@ -23,6 +24,9 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
@@ -33,6 +37,11 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsKokoroModelMetaData &kokoro_meta_data);
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice = "") const override;
|
||||
|
||||
@@ -43,12 +52,17 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
private:
|
||||
// map unicode codepoint to an integer ID
|
||||
std::unordered_map<char32_t, int32_t> token2id_;
|
||||
OfflineTtsVitsModelMetaData vits_meta_data_;
|
||||
OfflineTtsMatchaModelMetaData matcha_meta_data_;
|
||||
OfflineTtsKokoroModelMetaData kokoro_meta_data_;
|
||||
bool is_matcha_ = false;
|
||||
bool is_kokoro_ = false;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -54,6 +54,7 @@ endif()
|
||||
|
||||
if(SHERPA_ONNX_ENABLE_TTS)
|
||||
list(APPEND srcs
|
||||
offline-tts-kokoro-model-config.cc
|
||||
offline-tts-matcha-model-config.cc
|
||||
offline-tts-model-config.cc
|
||||
offline-tts-vits-model-config.cc
|
||||
|
||||
31
sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
Normal file
31
sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
Normal file
@@ -0,0 +1,31 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-kokoro-model-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Exposes OfflineTtsKokoroModelConfig to Python under the same name: a
// default constructor, a keyword constructor (length_scale defaults to 1.0),
// read/write access to every field, __str__ and validate().
void PybindOfflineTtsKokoroModelConfig(py::module *m) {
  using PyClass = OfflineTtsKokoroModelConfig;

  py::class_<PyClass> cls(*m, "OfflineTtsKokoroModelConfig");

  // Constructors
  cls.def(py::init<>());
  cls.def(py::init<const std::string &, const std::string &,
                   const std::string &, const std::string &, float>(),
          py::arg("model"), py::arg("voices"), py::arg("tokens"),
          py::arg("data_dir"), py::arg("length_scale") = 1.0);

  // Fields
  cls.def_readwrite("model", &PyClass::model);
  cls.def_readwrite("voices", &PyClass::voices);
  cls.def_readwrite("tokens", &PyClass::tokens);
  cls.def_readwrite("data_dir", &PyClass::data_dir);
  cls.def_readwrite("length_scale", &PyClass::length_scale);

  // Methods
  cls.def("__str__", &PyClass::ToString);
  cls.def("validate", &PyClass::Validate);
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
16
sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h
Normal file
16
sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h
Normal file
@@ -0,0 +1,16 @@
|
||||
// sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#ifndef SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
#define SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
|
||||
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
void PybindOfflineTtsKokoroModelConfig(py::module *m);
|
||||
|
||||
}
|
||||
|
||||
#endif // SHERPA_ONNX_PYTHON_CSRC_OFFLINE_TTS_KOKORO_MODEL_CONFIG_H_
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-kokoro-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-matcha-model-config.h"
|
||||
#include "sherpa-onnx/python/csrc/offline-tts-vits-model-config.h"
|
||||
|
||||
@@ -15,20 +16,24 @@ namespace sherpa_onnx {
|
||||
void PybindOfflineTtsModelConfig(py::module *m) {
|
||||
PybindOfflineTtsVitsModelConfig(m);
|
||||
PybindOfflineTtsMatchaModelConfig(m);
|
||||
PybindOfflineTtsKokoroModelConfig(m);
|
||||
|
||||
using PyClass = OfflineTtsModelConfig;
|
||||
|
||||
py::class_<PyClass>(*m, "OfflineTtsModelConfig")
|
||||
.def(py::init<>())
|
||||
.def(py::init<const OfflineTtsVitsModelConfig &,
|
||||
const OfflineTtsMatchaModelConfig &, int32_t, bool,
|
||||
const OfflineTtsMatchaModelConfig &,
|
||||
const OfflineTtsKokoroModelConfig &, int32_t, bool,
|
||||
const std::string &>(),
|
||||
py::arg("vits") = OfflineTtsVitsModelConfig{},
|
||||
py::arg("matcha") = OfflineTtsMatchaModelConfig{},
|
||||
py::arg("kokoro") = OfflineTtsKokoroModelConfig{},
|
||||
py::arg("num_threads") = 1, py::arg("debug") = false,
|
||||
py::arg("provider") = "cpu")
|
||||
.def_readwrite("vits", &PyClass::vits)
|
||||
.def_readwrite("matcha", &PyClass::matcha)
|
||||
.def_readwrite("kokoro", &PyClass::kokoro)
|
||||
.def_readwrite("num_threads", &PyClass::num_threads)
|
||||
.def_readwrite("debug", &PyClass::debug)
|
||||
.def_readwrite("provider", &PyClass::provider)
|
||||
|
||||
@@ -20,6 +20,7 @@ from _sherpa_onnx import (
|
||||
OfflineStream,
|
||||
OfflineTts,
|
||||
OfflineTtsConfig,
|
||||
OfflineTtsKokoroModelConfig,
|
||||
OfflineTtsMatchaModelConfig,
|
||||
OfflineTtsModelConfig,
|
||||
OfflineTtsVitsModelConfig,
|
||||
|
||||
Reference in New Issue
Block a user