Support specifying the espeak-ng voice for Kokoro TTS models (#1836)
This commit is contained in:
@@ -104,7 +104,8 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
// https://en.cppreference.com/w/cpp/regex
|
// https://en.cppreference.com/w/cpp/regex
|
||||||
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
||||||
std::string expr =
|
std::string expr =
|
||||||
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
|
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
|
||||||
|
")";
|
||||||
|
|
||||||
auto ws = ToWideString(text);
|
auto ws = ToWideString(text);
|
||||||
std::wstring wexpr = ToWideString(expr);
|
std::wstring wexpr = ToWideString(expr);
|
||||||
@@ -127,7 +128,7 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
if (debug_) {
|
if (debug_) {
|
||||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||||
}
|
}
|
||||||
ids_vec = ConvertEnglishToTokenIDs(ms);
|
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||||
} else {
|
} else {
|
||||||
if (debug_) {
|
if (debug_) {
|
||||||
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
||||||
@@ -257,7 +258,7 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
||||||
const std::string &text) const {
|
const std::string &text, const std::string &voice) const {
|
||||||
std::vector<std::string> words = SplitUtf8(text);
|
std::vector<std::string> words = SplitUtf8(text);
|
||||||
if (debug_) {
|
if (debug_) {
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
@@ -315,7 +316,7 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
|
|
||||||
piper::eSpeakPhonemeConfig config;
|
piper::eSpeakPhonemeConfig config;
|
||||||
|
|
||||||
config.voice = "en-us";
|
config.voice = voice;
|
||||||
|
|
||||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||||
|
|
||||||
|
|||||||
@@ -221,7 +221,7 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<TokenIDs> token_ids =
|
std::vector<TokenIDs> token_ids =
|
||||||
frontend_->ConvertTextToTokenIds(text, "en-us");
|
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
|
||||||
|
|
||||||
if (token_ids.empty() ||
|
if (token_ids.empty() ||
|
||||||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ struct OfflineTtsKokoroModelMetaData {
|
|||||||
int32_t version = 1;
|
int32_t version = 1;
|
||||||
int32_t has_espeak = 1;
|
int32_t has_espeak = 1;
|
||||||
int32_t max_token_len = 0;
|
int32_t max_token_len = 0;
|
||||||
|
|
||||||
|
std::string voice;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
@@ -138,6 +138,8 @@ class OfflineTtsKokoroModel::Impl {
|
|||||||
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
|
SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
|
||||||
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
|
SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
|
||||||
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
|
SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
|
||||||
|
SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.voice, "voice",
|
||||||
|
"en-us");
|
||||||
|
|
||||||
if (config_.debug) {
|
if (config_.debug) {
|
||||||
std::vector<std::string> speaker_names;
|
std::vector<std::string> speaker_names;
|
||||||
|
|||||||
Reference in New Issue
Block a user