Support Matcha-TTS models using espeak-ng (#1672)
This commit is contained in:
22
.github/scripts/test-offline-tts.sh
vendored
22
.github/scripts/test-offline-tts.sh
vendored
@@ -18,6 +18,28 @@ which $EXE
|
||||
# test waves are saved in ./tts
|
||||
mkdir ./tts
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-icefall-en_US-ljspeech"
|
||||
log "------------------------------------------------------------"
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||
|
||||
$EXE \
|
||||
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||
--matcha-vocoder=./hifigan_v2.onnx \
|
||||
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||
--num-threads=2 \
|
||||
--output-filename=./tts/matcha-ljspeech-1.wav \
|
||||
--debug=1 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
rm hifigan_v2.onnx
|
||||
rm -rf matcha-icefall-en_US-ljspeech
|
||||
|
||||
log "------------------------------------------------------------"
|
||||
log "matcha-icefall-zh-baker"
|
||||
log "------------------------------------------------------------"
|
||||
|
||||
25
.github/scripts/test-python.sh
vendored
25
.github/scripts/test-python.sh
vendored
@@ -267,7 +267,27 @@ log "Offline TTS test"
|
||||
# test waves are saved in ./tts
|
||||
mkdir ./tts
|
||||
|
||||
log "vits-ljs test"
|
||||
log "matcha-ljspeech-en test"
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||
--matcha-vocoder=./hifigan_v2.onnx \
|
||||
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||
--output-filename=./tts/test-matcha-ljspeech-en.wav \
|
||||
--num-threads=2 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
rm hifigan_v2.onnx
|
||||
rm -rf matcha-icefall-en_US-ljspeech
|
||||
|
||||
log "matcha-baker-zh test"
|
||||
|
||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||
@@ -282,12 +302,13 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
|
||||
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
|
||||
--matcha-dict-dir=./matcha-icefall-zh-baker/dict \
|
||||
--output-filename=./tts/test-matcha.wav \
|
||||
--output-filename=./tts/test-matcha-baker-zh.wav \
|
||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
rm -rf matcha-icefall-zh-baker
|
||||
rm hifigan_v2.onnx
|
||||
|
||||
log "vits-ljs test"
|
||||
|
||||
curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
|
||||
curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
|
||||
|
||||
@@ -11,7 +11,7 @@ while the model is still generating.
|
||||
|
||||
Usage:
|
||||
|
||||
Example (1/4)
|
||||
Example (1/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
@@ -23,7 +23,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./generated.wav \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (2/4)
|
||||
Example (2/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
|
||||
tar xvf vits-zh-aishell3.tar.bz2
|
||||
@@ -37,7 +37,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./liubei-21.wav \
|
||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||
|
||||
Example (3/4)
|
||||
Example (3/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
@@ -53,7 +53,7 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./test-2.wav \
|
||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (4/4)
|
||||
Example (4/5)
|
||||
|
||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||
@@ -71,6 +71,23 @@ python3 ./python-api-examples/offline-tts-play.py \
|
||||
--output-filename=./test-matcha.wav \
|
||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (5/5)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||
|
||||
python3 ./python-api-examples/offline-tts-play.py \
|
||||
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||
--matcha-vocoder=./hifigan_v2.onnx \
|
||||
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||
--output-filename=./test-matcha-ljspeech-en.wav \
|
||||
--num-threads=2 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
|
||||
You can find more models at
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
|
||||
@@ -12,7 +12,7 @@ generated audio.
|
||||
|
||||
Usage:
|
||||
|
||||
Example (1/4)
|
||||
Example (1/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
|
||||
tar xf vits-piper-en_US-amy-low.tar.bz2
|
||||
@@ -24,7 +24,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./generated.wav \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
Example (2/4)
|
||||
Example (2/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
|
||||
tar xvf vits-icefall-zh-aishell3.tar.bz2
|
||||
@@ -38,7 +38,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./liubei-21.wav \
|
||||
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
|
||||
|
||||
Example (3/4)
|
||||
Example (3/5)
|
||||
|
||||
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
|
||||
@@ -54,7 +54,7 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./test-2.wav \
|
||||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (4/4)
|
||||
Example (4/5)
|
||||
|
||||
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
|
||||
tar xvf matcha-icefall-zh-baker.tar.bz2
|
||||
@@ -72,6 +72,23 @@ python3 ./python-api-examples/offline-tts.py \
|
||||
--output-filename=./test-matcha.wav \
|
||||
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
|
||||
|
||||
Example (5/5)
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
rm matcha-icefall-en_US-ljspeech.tar.bz2
|
||||
|
||||
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
|
||||
|
||||
python3 ./python-api-examples/offline-tts.py \
|
||||
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
|
||||
--matcha-vocoder=./hifigan_v2.onnx \
|
||||
--matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
|
||||
--matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
|
||||
--output-filename=./test-matcha-ljspeech-en.wav \
|
||||
--num-threads=2 \
|
||||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
|
||||
|
||||
You can find more models at
|
||||
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
|
||||
|
||||
|
||||
@@ -49,19 +49,21 @@
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define SHERPA_ONNX_EXIT(code) exit(code)
|
||||
|
||||
// Read an integer
|
||||
#define SHERPA_ONNX_READ_META_DATA(dst, src_key) \
|
||||
do { \
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
\
|
||||
dst = atoi(value.c_str()); \
|
||||
if (dst < 0) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -74,7 +76,7 @@
|
||||
dst = atoi(value.c_str()); \
|
||||
if (dst < 0) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
@@ -85,13 +87,13 @@
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
\
|
||||
bool ret = SplitStringToIntegers(value.c_str(), ",", true, &dst); \
|
||||
if (!ret) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -101,13 +103,13 @@
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
\
|
||||
bool ret = SplitStringToFloats(value.c_str(), ",", true, &dst); \
|
||||
if (!ret) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -117,14 +119,14 @@
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
SplitStringToVector(value.c_str(), ",", false, &dst); \
|
||||
\
|
||||
if (dst.empty()) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
|
||||
value.c_str(), src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -134,14 +136,14 @@
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
SplitStringToVector(value.c_str(), sep, false, &dst); \
|
||||
\
|
||||
if (dst.empty()) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
|
||||
value.c_str(), src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -151,13 +153,13 @@
|
||||
auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
|
||||
if (value.empty()) { \
|
||||
SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
\
|
||||
dst = std::move(value); \
|
||||
if (dst.empty()) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -178,11 +180,9 @@
|
||||
dst = std::move(value); \
|
||||
if (dst.empty()) { \
|
||||
SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
|
||||
exit(-1); \
|
||||
SHERPA_ONNX_EXIT(-1); \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SHERPA_ONNX_EXIT(code) exit(code)
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_MACROS_H_
|
||||
|
||||
@@ -321,12 +321,45 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
|
||||
|
||||
private:
|
||||
template <typename Manager>
|
||||
void InitFrontend(Manager *mgr) {}
|
||||
void InitFrontend(Manager *mgr) {
|
||||
// for piper phonemizer
|
||||
// we require that you copy espeak_ng_data
|
||||
// from assets to disk
|
||||
//
|
||||
// for jieba
|
||||
// we require that you copy tokens.txt, lexicon.txt and dict
|
||||
// from assets to disk
|
||||
const auto &meta_data = model_->GetMetaData();
|
||||
|
||||
if (meta_data.jieba && !meta_data.has_espeak) {
|
||||
frontend_ = std::make_unique<JiebaLexicon>(
|
||||
config_.model.matcha.lexicon, config_.model.matcha.tokens,
|
||||
config_.model.matcha.dict_dir, config_.model.debug);
|
||||
} else if (meta_data.has_espeak && !meta_data.jieba) {
|
||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||
mgr, config_.model.matcha.tokens, config_.model.matcha.data_dir,
|
||||
meta_data);
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE("jieba + espeaker-ng is not supported yet");
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void InitFrontend() {
|
||||
frontend_ = std::make_unique<JiebaLexicon>(
|
||||
config_.model.matcha.lexicon, config_.model.matcha.tokens,
|
||||
config_.model.matcha.dict_dir, config_.model.debug);
|
||||
const auto &meta_data = model_->GetMetaData();
|
||||
|
||||
if (meta_data.jieba && !meta_data.has_espeak) {
|
||||
frontend_ = std::make_unique<JiebaLexicon>(
|
||||
config_.model.matcha.lexicon, config_.model.matcha.tokens,
|
||||
config_.model.matcha.dict_dir, config_.model.debug);
|
||||
} else if (meta_data.has_espeak && !meta_data.jieba) {
|
||||
frontend_ = std::make_unique<PiperPhonemizeLexicon>(
|
||||
config_.model.matcha.tokens, config_.model.matcha.data_dir,
|
||||
meta_data);
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE("jieba + espeaker-ng is not supported yet");
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
}
|
||||
|
||||
GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
|
||||
|
||||
@@ -18,7 +18,7 @@ struct OfflineTtsMatchaModelMetaData {
|
||||
int32_t num_speakers = 0;
|
||||
int32_t version = 1;
|
||||
int32_t jieba = 0;
|
||||
int32_t espeak = 0;
|
||||
int32_t has_espeak = 0;
|
||||
int32_t use_eos_bos = 0;
|
||||
int32_t pad_id = 0;
|
||||
};
|
||||
|
||||
@@ -142,7 +142,7 @@ class OfflineTtsMatchaModel::Impl {
|
||||
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
|
||||
SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
|
||||
}
|
||||
|
||||
@@ -32,6 +32,18 @@
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
static void CallPhonemizeEspeak(
|
||||
const std::string &text,
|
||||
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||||
std::vector<std::vector<piper::Phoneme>> *phonemes) {
|
||||
static std::mutex espeak_mutex;
|
||||
|
||||
std::lock_guard<std::mutex> lock(espeak_mutex);
|
||||
|
||||
// keep multi threads from calling into piper::phonemize_eSpeak
|
||||
piper::phonemize_eSpeak(text, config, *phonemes);
|
||||
}
|
||||
|
||||
static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||||
std::unordered_map<char32_t, int32_t> token2id;
|
||||
@@ -87,7 +99,7 @@ static std::unordered_map<char32_t, int32_t> ReadTokens(std::istream &is) {
|
||||
|
||||
// see the function "phonemes_to_ids" from
|
||||
// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb
|
||||
static std::vector<int64_t> PiperPhonemesToIds(
|
||||
static std::vector<int64_t> PiperPhonemesToIdsVits(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes) {
|
||||
// see
|
||||
@@ -114,17 +126,46 @@ static std::vector<int64_t> PiperPhonemesToIds(
|
||||
return ans;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> PiperPhonemesToIdsMatcha(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes, bool use_eos_bos) {
|
||||
std::vector<int64_t> ans;
|
||||
ans.reserve(phonemes.size());
|
||||
|
||||
int32_t bos = token2id.at(U'^');
|
||||
int32_t eos = token2id.at(U'$');
|
||||
|
||||
if (use_eos_bos) {
|
||||
ans.push_back(bos);
|
||||
}
|
||||
|
||||
for (auto p : phonemes) {
|
||||
if (token2id.count(p)) {
|
||||
ans.push_back(token2id.at(p));
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE("Skip unknown phonemes. Unicode codepoint: \\U+%04x.",
|
||||
static_cast<uint32_t>(p));
|
||||
}
|
||||
}
|
||||
|
||||
if (use_eos_bos) {
|
||||
ans.push_back(eos);
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
static std::vector<int64_t> CoquiPhonemesToIds(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
const std::vector<piper::Phoneme> &phonemes,
|
||||
const OfflineTtsVitsModelMetaData &meta_data) {
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data) {
|
||||
// see
|
||||
// https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87
|
||||
int32_t use_eos_bos = meta_data.use_eos_bos;
|
||||
int32_t bos_id = meta_data.bos_id;
|
||||
int32_t eos_id = meta_data.eos_id;
|
||||
int32_t blank_id = meta_data.blank_id;
|
||||
int32_t add_blank = meta_data.add_blank;
|
||||
int32_t use_eos_bos = vits_meta_data.use_eos_bos;
|
||||
int32_t bos_id = vits_meta_data.bos_id;
|
||||
int32_t eos_id = vits_meta_data.eos_id;
|
||||
int32_t blank_id = vits_meta_data.blank_id;
|
||||
int32_t add_blank = vits_meta_data.add_blank;
|
||||
int32_t comma_id = token2id.at(',');
|
||||
|
||||
std::vector<int64_t> ans;
|
||||
@@ -189,8 +230,8 @@ static void InitEspeak(const std::string &data_dir) {
|
||||
|
||||
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data)
|
||||
: meta_data_(meta_data) {
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data)
|
||||
: vits_meta_data_(vits_meta_data) {
|
||||
{
|
||||
std::ifstream is(tokens);
|
||||
token2id_ = ReadTokens(is);
|
||||
@@ -202,8 +243,37 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
Manager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data)
|
||||
: meta_data_(meta_data) {
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data)
|
||||
: vits_meta_data_(vits_meta_data) {
|
||||
{
|
||||
auto buf = ReadFile(mgr, tokens);
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
token2id_ = ReadTokens(is);
|
||||
}
|
||||
|
||||
// We should copy the directory of espeak-ng-data from the asset to
|
||||
// some internal or external storage and then pass the directory to
|
||||
// data_dir.
|
||||
InitEspeak(data_dir);
|
||||
}
|
||||
|
||||
// Build a frontend for a Matcha-TTS model from files on disk.
//
// `tokens` is the path of the token table, which is parsed by ReadTokens().
// `data_dir` is forwarded to InitEspeak(). The metadata is copied and the
// instance is marked as a Matcha frontend.
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    const std::string &tokens, const std::string &data_dir,
    const OfflineTtsMatchaModelMetaData &matcha_meta_data)
    : matcha_meta_data_(matcha_meta_data), is_matcha_(true) {
  {
    // The stream goes out of scope before espeak is initialized.
    std::ifstream token_stream(tokens);
    token2id_ = ReadTokens(token_stream);
  }

  InitEspeak(data_dir);
}
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
Manager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data)
|
||||
: matcha_meta_data_(matcha_meta_data), is_matcha_(true) {
|
||||
{
|
||||
auto buf = ReadFile(mgr, tokens);
|
||||
std::istrstream is(buf.data(), buf.size());
|
||||
@@ -218,6 +288,15 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
if (is_matcha_) {
|
||||
return ConvertTextToTokenIdsMatcha(text, voice);
|
||||
} else {
|
||||
return ConvertTextToTokenIdsVits(text, voice);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
@@ -226,26 +305,45 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
|
||||
static std::mutex espeak_mutex;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(espeak_mutex);
|
||||
|
||||
// keep multi threads from calling into piper::phonemize_eSpeak
|
||||
piper::phonemize_eSpeak(text, config, phonemes);
|
||||
}
|
||||
CallPhonemizeEspeak(text, config, &phonemes);
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
|
||||
std::vector<int64_t> phoneme_ids;
|
||||
|
||||
if (meta_data_.is_piper || meta_data_.is_icefall) {
|
||||
for (const auto &p : phonemes) {
|
||||
phoneme_ids =
|
||||
PiperPhonemesToIdsMatcha(token2id_, p, matcha_meta_data_.use_eos_bos);
|
||||
ans.emplace_back(std::move(phoneme_ids));
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
// to list available voices
|
||||
config.voice = voice; // e.g., voice is en-us
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
|
||||
CallPhonemizeEspeak(text, config, &phonemes);
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
|
||||
std::vector<int64_t> phoneme_ids;
|
||||
|
||||
if (vits_meta_data_.is_piper || vits_meta_data_.is_icefall) {
|
||||
for (const auto &p : phonemes) {
|
||||
phoneme_ids = PiperPhonemesToIds(token2id_, p);
|
||||
phoneme_ids = PiperPhonemesToIdsVits(token2id_, p);
|
||||
ans.emplace_back(std::move(phoneme_ids));
|
||||
}
|
||||
} else if (meta_data_.is_coqui) {
|
||||
} else if (vits_meta_data_.is_coqui) {
|
||||
for (const auto &p : phonemes) {
|
||||
phoneme_ids = CoquiPhonemesToIds(token2id_, p, meta_data_);
|
||||
phoneme_ids = CoquiPhonemesToIds(token2id_, p, vits_meta_data_);
|
||||
ans.emplace_back(std::move(phoneme_ids));
|
||||
}
|
||||
|
||||
@@ -260,13 +358,18 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
#if __ANDROID_API__ >= 9
|
||||
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data);
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data);
|
||||
|
||||
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
|
||||
AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
#endif
|
||||
|
||||
#if __OHOS__
// Explicitly instantiate both templated constructors for
// NativeResourceManager, as the Android block does for AAssetManager.
// The VITS instantiation is kept so that code constructing a VITS frontend
// with a NativeResourceManager still links.
template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &data_dir,
    const OfflineTtsVitsModelMetaData &vits_meta_data);

template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &data_dir,
    const OfflineTtsMatchaModelMetaData &matcha_meta_data);
#endif
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/offline-tts-frontend.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
|
||||
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
@@ -17,20 +18,37 @@ namespace sherpa_onnx {
|
||||
class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
public:
|
||||
PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data);
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data);
|
||||
|
||||
PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsVitsModelMetaData &meta_data);
|
||||
const OfflineTtsVitsModelMetaData &vits_meta_data);
|
||||
|
||||
template <typename Manager>
|
||||
PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
|
||||
const std::string &data_dir,
|
||||
const OfflineTtsMatchaModelMetaData &matcha_meta_data);
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string &voice = "") const override;
|
||||
|
||||
private:
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsVits(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
private:
|
||||
// map unicode codepoint to an integer ID
|
||||
std::unordered_map<char32_t, int32_t> token2id_;
|
||||
OfflineTtsVitsModelMetaData meta_data_;
|
||||
OfflineTtsVitsModelMetaData vits_meta_data_;
|
||||
OfflineTtsMatchaModelMetaData matcha_meta_data_;
|
||||
bool is_matcha_ = false;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
Reference in New Issue
Block a user