Support extra languages in multi-lang kokoro tts (#2303)
This commit is contained in:
@@ -67,7 +67,8 @@ class KokoroMultiLangLexicon::Impl {
|
||||
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text,
|
||||
const std::string &voice) const {
|
||||
std::string text = ToLowerCase(_text);
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
|
||||
@@ -124,7 +125,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||
}
|
||||
|
||||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||
ids_vec = ConvertNonChineseToTokenIDs(ms, voice);
|
||||
}
|
||||
|
||||
for (const auto &ids : ids_vec) {
|
||||
@@ -255,8 +256,30 @@ class KokoroMultiLangLexicon::Impl {
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
||||
std::vector<std::vector<int32_t>> ConvertTextToTokenIDsWithEspeak(
|
||||
const std::string &text, const std::string &voice) const {
|
||||
auto temp = ConvertTextToTokenIdsKokoro(
|
||||
phoneme2id_, meta_data_.max_token_len, text, voice);
|
||||
std::vector<std::vector<int32_t>> ans;
|
||||
ans.reserve(temp.size());
|
||||
|
||||
for (const auto &i : temp) {
|
||||
ans.emplace_back(i.tokens.begin(), i.tokens.end());
|
||||
}
|
||||
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<std::vector<int32_t>> ConvertNonChineseToTokenIDs(
|
||||
const std::string &text, const std::string &voice) const {
|
||||
if (!voice.empty()) {
|
||||
return ConvertTextToTokenIDsWithEspeak(text, voice);
|
||||
}
|
||||
|
||||
// If voice is empty, we split the text into words and use the lexicon
|
||||
// to lookup the pronunciation of each word, fallback to espeak if
|
||||
// a word is not in the lexicon.
|
||||
|
||||
std::vector<std::string> words = SplitUtf8(text);
|
||||
if (debug_) {
|
||||
std::ostringstream os;
|
||||
@@ -317,7 +340,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
config.voice = voice;
|
||||
config.voice = meta_data_.voice;
|
||||
|
||||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||||
|
||||
@@ -391,9 +414,28 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
void InitTokens(std::istream &is) {
|
||||
token2id_ = ReadTokens(is); // defined in ./symbol-table.cc
|
||||
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||||
std::u32string s;
|
||||
for (const auto &p : token2id_) {
|
||||
s = conv.from_bytes(p.first);
|
||||
|
||||
if (s.size() != 1) {
|
||||
SHERPA_ONNX_LOGE("Error for token %s with id %d", p.first.c_str(),
|
||||
p.second);
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
char32_t c = s[0];
|
||||
phoneme2id_.insert({c, p.second});
|
||||
}
|
||||
}
|
||||
|
||||
void InitLexicon(const std::string &lexicon) {
|
||||
if (lexicon.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(lexicon, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
@@ -404,6 +446,10 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
template <typename Manager>
|
||||
void InitLexicon(Manager *mgr, const std::string &lexicon) {
|
||||
if (lexicon.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(lexicon, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
@@ -445,7 +491,7 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
|
||||
|
||||
if (ids.empty()) {
|
||||
if (ids.empty() && word != "呣") {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
|
||||
word.c_str(), line_num, line.c_str());
|
||||
@@ -465,6 +511,8 @@ class KokoroMultiLangLexicon::Impl {
|
||||
// tokens.txt is saved in token2id_
|
||||
std::unordered_map<std::string, int32_t> token2id_;
|
||||
|
||||
std::unordered_map<char32_t, int32_t> phoneme2id_;
|
||||
|
||||
std::unique_ptr<cppjieba::Jieba> jieba_;
|
||||
bool debug_ = false;
|
||||
};
|
||||
@@ -487,8 +535,8 @@ KokoroMultiLangLexicon::KokoroMultiLangLexicon(
|
||||
meta_data, debug)) {}
|
||||
|
||||
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
|
||||
const std::string &text, const std::string & /*unused_voice = ""*/) const {
|
||||
return impl_->ConvertTextToTokenIds(text);
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
return impl_->ConvertTextToTokenIds(text, voice);
|
||||
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
|
||||
@@ -20,9 +20,9 @@ struct OfflineSpeechDenoiserModelConfig {
|
||||
|
||||
OfflineSpeechDenoiserModelConfig() = default;
|
||||
|
||||
OfflineSpeechDenoiserModelConfig(OfflineSpeechDenoiserGtcrnModelConfig gtcrn,
|
||||
int32_t num_threads, bool debug,
|
||||
const std::string &provider)
|
||||
OfflineSpeechDenoiserModelConfig(
|
||||
const OfflineSpeechDenoiserGtcrnModelConfig &gtcrn, int32_t num_threads,
|
||||
bool debug, const std::string &provider)
|
||||
: gtcrn(gtcrn),
|
||||
num_threads(num_threads),
|
||||
debug(debug),
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@@ -57,6 +58,12 @@ class OfflineTtsFrontend {
|
||||
// implementation is in ./piper-phonemize-lexicon.cc
|
||||
void InitEspeak(const std::string &data_dir);
|
||||
|
||||
// implementation in ./piper-phonemize-lexicon.cc
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
int32_t max_token_len, const std::string &text,
|
||||
const std::string &voice = "");
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
|
||||
|
||||
@@ -220,8 +220,9 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> token_ids =
|
||||
frontend_->ConvertTextToTokenIds(text, meta_data.voice);
|
||||
std::vector<TokenIDs> token_ids = frontend_->ConvertTextToTokenIds(
|
||||
text, config_.model.kokoro.lang.empty() ? meta_data.voice
|
||||
: config_.model.kokoro.lang);
|
||||
|
||||
if (token_ids.empty() ||
|
||||
(token_ids.size() == 1 && token_ids[0].tokens.empty())) {
|
||||
@@ -335,12 +336,14 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
if (meta_data.version >= 2) {
|
||||
// this is a multi-lingual model, we require that you pass lexicon
|
||||
// and dict_dir
|
||||
if (config_.model.kokoro.lexicon.empty() ||
|
||||
if ((config_.model.kokoro.lexicon.empty() &&
|
||||
config_.model.kokoro.lang.empty()) ||
|
||||
config_.model.kokoro.dict_dir.empty()) {
|
||||
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||
SHERPA_ONNX_LOGE(
|
||||
"You are using a multi-lingual Kokoro model (e.g., Kokoro >= "
|
||||
"v1.0). please pass --kokoro-lexicon and --kokoro-dict-dir");
|
||||
"v1.0). Please pass --kokoro-lexicon and --kokoro-dict-dir or "
|
||||
"provide --kokoro-lang and --kokoro-dict-dir");
|
||||
SHERPA_ONNX_EXIT(-1);
|
||||
}
|
||||
|
||||
@@ -362,7 +365,8 @@ class OfflineTtsKokoroImpl : public OfflineTtsImpl {
|
||||
if (meta_data.version >= 2) {
|
||||
// this is a multi-lingual model, we require that you pass lexicon
|
||||
// and dict_dir
|
||||
if (config_.model.kokoro.lexicon.empty() ||
|
||||
if ((config_.model.kokoro.lexicon.empty() &&
|
||||
config_.model.kokoro.lang.empty()) ||
|
||||
config_.model.kokoro.dict_dir.empty()) {
|
||||
SHERPA_ONNX_LOGE("Current model version: '%d'", meta_data.version);
|
||||
SHERPA_ONNX_LOGE(
|
||||
|
||||
@@ -18,6 +18,13 @@ void OfflineTtsKokoroModelConfig::Register(ParseOptions *po) {
|
||||
"Path to voices.bin for Kokoro models");
|
||||
po->Register("kokoro-tokens", &tokens,
|
||||
"Path to tokens.txt for Kokoro models");
|
||||
po->Register("kokoro-lang", &lang,
|
||||
"Used only by kokoro >= 1.0. Example values: "
|
||||
"en (English), "
|
||||
"es (Spanish), fr (French), hi (hindi), it (Italian), "
|
||||
"pt-br (Brazilian Portuguese)."
|
||||
"You can leave it empty, in which case you need to provide "
|
||||
"--kokoro-lexicon.");
|
||||
po->Register(
|
||||
"kokoro-lexicon", &lexicon,
|
||||
"Path to lexicon.txt for Kokoro models. Used only for Kokoro >= v1.0"
|
||||
@@ -127,7 +134,8 @@ std::string OfflineTtsKokoroModelConfig::ToString() const {
|
||||
os << "lexicon=\"" << lexicon << "\", ";
|
||||
os << "data_dir=\"" << data_dir << "\", ";
|
||||
os << "dict_dir=\"" << dict_dir << "\", ";
|
||||
os << "length_scale=" << length_scale << ")";
|
||||
os << "length_scale=" << length_scale << ", ";
|
||||
os << "lang=\"" << lang << "\")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -27,6 +27,13 @@ struct OfflineTtsKokoroModelConfig {
|
||||
// speed = 1 / length_scale
|
||||
float length_scale = 1.0;
|
||||
|
||||
// Used only for Kokoro >= 1.0.
|
||||
//
|
||||
// If it is not empty, meta_data.voice is ignored.
|
||||
// Example values: es (Spanish), fr (French), pt (Portuguese)
|
||||
// See https://hf-mirror.com/hexgrad/Kokoro-82M/blob/main/VOICES.md
|
||||
std::string lang;
|
||||
|
||||
OfflineTtsKokoroModelConfig() = default;
|
||||
|
||||
OfflineTtsKokoroModelConfig(const std::string &model,
|
||||
@@ -34,14 +41,16 @@ struct OfflineTtsKokoroModelConfig {
|
||||
const std::string &tokens,
|
||||
const std::string &lexicon,
|
||||
const std::string &data_dir,
|
||||
const std::string &dict_dir, float length_scale)
|
||||
const std::string &dict_dir, float length_scale,
|
||||
const std::string &lang)
|
||||
: model(model),
|
||||
voices(voices),
|
||||
tokens(tokens),
|
||||
lexicon(lexicon),
|
||||
data_dir(data_dir),
|
||||
dict_dir(dict_dir),
|
||||
length_scale(length_scale) {}
|
||||
length_scale(length_scale),
|
||||
lang(lang) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
@@ -351,7 +351,8 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
|
||||
if (is_matcha_) {
|
||||
return ConvertTextToTokenIdsMatcha(text, voice);
|
||||
} else if (is_kokoro_) {
|
||||
return ConvertTextToTokenIdsKokoro(text, voice);
|
||||
return ConvertTextToTokenIdsKokoro(
|
||||
token2id_, kokoro_meta_data_.max_token_len, text, voice);
|
||||
} else {
|
||||
return ConvertTextToTokenIdsVits(text, voice);
|
||||
}
|
||||
@@ -382,8 +383,10 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
|
||||
return ans;
|
||||
}
|
||||
|
||||
std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice /*= ""*/) const {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::unordered_map<char32_t, int32_t> &token2id,
|
||||
int32_t max_token_len, const std::string &text,
|
||||
const std::string &voice /*= ""*/) {
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
// ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
|
||||
@@ -397,8 +400,7 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsKokoro(
|
||||
std::vector<TokenIDs> ans;
|
||||
|
||||
for (const auto &p : phonemes) {
|
||||
auto phoneme_ids =
|
||||
PiperPhonemesToIdsKokoro(token2id_, p, kokoro_meta_data_.max_token_len);
|
||||
auto phoneme_ids = PiperPhonemesToIdsKokoro(token2id, p, max_token_len);
|
||||
|
||||
for (auto &ids : phoneme_ids) {
|
||||
ans.emplace_back(std::move(ids));
|
||||
|
||||
@@ -52,9 +52,6 @@ class PiperPhonemizeLexicon : public OfflineTtsFrontend {
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
std::vector<TokenIDs> ConvertTextToTokenIdsKokoro(
|
||||
const std::string &text, const std::string &voice = "") const;
|
||||
|
||||
private:
|
||||
// map unicode codepoint to an integer ID
|
||||
std::unordered_map<char32_t, int32_t> token2id_;
|
||||
|
||||
Reference in New Issue
Block a user