524 lines
15 KiB
C++
524 lines
15 KiB
C++
// sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
|
||
//
|
||
// Copyright (c) 2025 Xiaomi Corporation
|
||
|
||
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
||
|
||
#include <codecvt>
|
||
#include <fstream>
|
||
#include <locale>
|
||
#include <regex> // NOLINT
|
||
#include <sstream>
|
||
#include <strstream>
|
||
#include <unordered_map>
|
||
#include <utility>
|
||
|
||
#if __ANDROID_API__ >= 9
|
||
#include "android/asset_manager.h"
|
||
#include "android/asset_manager_jni.h"
|
||
#endif
|
||
|
||
#if __OHOS__
|
||
#include "rawfile/raw_file_manager.h"
|
||
#endif
|
||
|
||
#include "cppjieba/Jieba.hpp"
|
||
#include "espeak-ng/speak_lib.h"
|
||
#include "phoneme_ids.hpp"
|
||
#include "phonemize.hpp"
|
||
#include "sherpa-onnx/csrc/file-utils.h"
|
||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||
#include "sherpa-onnx/csrc/symbol-table.h"
|
||
#include "sherpa-onnx/csrc/text-utils.h"
|
||
|
||
namespace sherpa_onnx {
|
||
|
||
void CallPhonemizeEspeak(const std::string &text,
|
||
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||
std::vector<std::vector<piper::Phoneme>> *phonemes);
|
||
|
||
// Converts the UTF-8 encoded string `s` to a std::wstring.
//
// The conversion goes through std::codecvt_utf8_utf16, i.e. the elements of
// the returned string are UTF-16 code units. No error string is passed to the
// converter, so from_bytes() throws std::range_error if `s` is not valid UTF-8.
//
// See
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::wstring ToWideString(const std::string &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.from_bytes(s);
}
|
||
|
||
// Converts the wide string `s` back to a UTF-8 encoded std::string.
//
// This is the inverse of ToWideString(): the elements of `s` are interpreted
// as UTF-16 code units. No error string is passed to the converter, so
// to_bytes() throws std::range_error if the conversion fails.
//
// See
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::string ToString(const std::wstring &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.to_bytes(s);
}
|
||
|
||
class KokoroMultiLangLexicon::Impl {
|
||
public:
|
||
Impl(const std::string &tokens, const std::string &lexicon,
|
||
const std::string &dict_dir, const std::string &data_dir,
|
||
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||
: meta_data_(meta_data), debug_(debug) {
|
||
InitTokens(tokens);
|
||
|
||
InitLexicon(lexicon);
|
||
|
||
InitJieba(dict_dir);
|
||
|
||
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||
}
|
||
|
||
template <typename Manager>
|
||
Impl(Manager *mgr, const std::string &tokens, const std::string &lexicon,
|
||
const std::string &dict_dir, const std::string &data_dir,
|
||
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||
: meta_data_(meta_data), debug_(debug) {
|
||
InitTokens(mgr, tokens);
|
||
|
||
InitLexicon(mgr, lexicon);
|
||
|
||
// we assume you have copied dict_dir and data_dir from assets to some path
|
||
InitJieba(dict_dir);
|
||
|
||
InitEspeak(data_dir); // See ./piper-phonemize-lexicon.cc
|
||
}
|
||
|
||
std::vector<TokenIDs> ConvertTextToTokenIds(const std::string &_text) const {
|
||
std::string text = ToLowerCase(_text);
|
||
if (debug_) {
|
||
SHERPA_ONNX_LOGE("After converting to lowercase:\n%s", text.c_str());
|
||
}
|
||
|
||
std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
|
||
{",", ","}, {":", ","}, {"、", ","}, {";", ";"}, {":", ":"},
|
||
{"。", "."}, {"?", "?"}, {"!", "!"}, {"\\s+", " "},
|
||
};
|
||
for (const auto &p : replace_str_pairs) {
|
||
std::regex re(p.first);
|
||
text = std::regex_replace(text, re, p.second);
|
||
}
|
||
|
||
if (debug_) {
|
||
SHERPA_ONNX_LOGE("After replacing punctuations and merging spaces:\n%s",
|
||
text.c_str());
|
||
}
|
||
|
||
// https://en.cppreference.com/w/cpp/regex
|
||
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
||
std::string expr =
|
||
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
|
||
")";
|
||
|
||
auto ws = ToWideString(text);
|
||
std::wstring wexpr = ToWideString(expr);
|
||
std::wregex we(wexpr);
|
||
|
||
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
||
auto end = std::wsregex_iterator();
|
||
|
||
std::vector<TokenIDs> ans;
|
||
|
||
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||
std::wsmatch match = *i;
|
||
std::wstring match_str = match.str();
|
||
auto ms = ToString(match_str);
|
||
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
|
||
|
||
std::vector<std::vector<int32_t>> ids_vec;
|
||
|
||
if (c < 0x80) {
|
||
if (debug_) {
|
||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||
}
|
||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||
} else {
|
||
if (debug_) {
|
||
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
||
}
|
||
ids_vec = ConvertChineseToTokenIDs(ms);
|
||
}
|
||
|
||
for (const auto &ids : ids_vec) {
|
||
if (ids.size() > 4) {
|
||
ans.emplace_back(ids);
|
||
} else {
|
||
if (ans.empty()) {
|
||
ans.emplace_back(ids);
|
||
} else {
|
||
ans.back().tokens.back() = ids[1];
|
||
ans.back().tokens.insert(ans.back().tokens.end(), ids.begin() + 2,
|
||
ids.end());
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
if (debug_) {
|
||
for (const auto &v : ans) {
|
||
std::ostringstream os;
|
||
os << "\n";
|
||
std::string sep;
|
||
for (auto i : v.tokens) {
|
||
os << sep << i;
|
||
sep = " ";
|
||
}
|
||
os << "\n";
|
||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||
}
|
||
}
|
||
|
||
return ans;
|
||
}
|
||
|
||
private:
|
||
// Returns true if `text` is exactly one of the punctuation marks that
// ConvertEnglishToTokenIDs() looks up in token2id_ directly:
//   ; : , . ! ? — … " ( ) “ ”
bool IsPunctuation(const std::string &text) const {
  return text == ";" || text == ":" || text == "," || text == "." ||
         text == "!" || text == "?" || text == "—" || text == "…" ||
         text == "\"" || text == "(" || text == ")" || text == "“" ||
         text == "”";
}
|
||
|
||
std::vector<int32_t> ConvertWordToIds(const std::string &w) const {
|
||
std::vector<int32_t> ans;
|
||
if (word2ids_.count(w)) {
|
||
ans = word2ids_.at(w);
|
||
return ans;
|
||
}
|
||
|
||
std::vector<std::string> words = SplitUtf8(w);
|
||
for (const auto &word : words) {
|
||
if (word2ids_.count(word)) {
|
||
auto ids = ConvertWordToIds(word);
|
||
ans.insert(ans.end(), ids.begin(), ids.end());
|
||
} else {
|
||
SHERPA_ONNX_LOGE("Skip OOV: '%s'", word.c_str());
|
||
}
|
||
}
|
||
|
||
return ans;
|
||
}
|
||
|
||
std::vector<std::vector<int32_t>> ConvertChineseToTokenIDs(
|
||
const std::string &text) const {
|
||
bool is_hmm = true;
|
||
|
||
std::vector<std::string> words;
|
||
jieba_->Cut(text, words, is_hmm);
|
||
if (debug_) {
|
||
std::ostringstream os;
|
||
os << "After jieba processing:\n";
|
||
|
||
std::string sep;
|
||
for (const auto &w : words) {
|
||
os << sep << w;
|
||
sep = "_";
|
||
}
|
||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||
}
|
||
|
||
std::vector<std::vector<int32_t>> ans;
|
||
std::vector<int32_t> this_sentence;
|
||
int32_t max_len = meta_data_.max_token_len;
|
||
|
||
this_sentence.push_back(0);
|
||
for (const auto &w : words) {
|
||
auto ids = ConvertWordToIds(w);
|
||
if (this_sentence.size() + ids.size() > max_len - 2) {
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
|
||
this_sentence.push_back(0);
|
||
}
|
||
|
||
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||
}
|
||
|
||
if (this_sentence.size() > 1) {
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
}
|
||
|
||
if (debug_) {
|
||
for (const auto &v : ans) {
|
||
std::ostringstream os;
|
||
os << "\n";
|
||
std::string sep;
|
||
for (auto i : v) {
|
||
os << sep << i;
|
||
sep = " ";
|
||
}
|
||
os << "\n";
|
||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||
}
|
||
}
|
||
|
||
return ans;
|
||
}
|
||
|
||
std::vector<std::vector<int32_t>> ConvertEnglishToTokenIDs(
|
||
const std::string &text, const std::string &voice) const {
|
||
std::vector<std::string> words = SplitUtf8(text);
|
||
if (debug_) {
|
||
std::ostringstream os;
|
||
os << "After splitting to words: ";
|
||
std::string sep;
|
||
for (const auto &w : words) {
|
||
os << sep << w;
|
||
sep = "_";
|
||
}
|
||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||
}
|
||
|
||
std::vector<std::vector<int32_t>> ans;
|
||
int32_t max_len = meta_data_.max_token_len;
|
||
std::vector<int32_t> this_sentence;
|
||
|
||
int32_t space_id = token2id_.at(" ");
|
||
|
||
this_sentence.push_back(0);
|
||
|
||
for (const auto &word : words) {
|
||
if (IsPunctuation(word)) {
|
||
this_sentence.push_back(token2id_.at(word));
|
||
|
||
if (this_sentence.size() > max_len - 2) {
|
||
// this sentence is too long, split it
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
|
||
this_sentence.push_back(0);
|
||
continue;
|
||
}
|
||
|
||
if (word == "." || word == "!" || word == "?" || word == ";") {
|
||
// Note: You can add more punctuations here to split the text
|
||
// into sentences. We just use four here: .!?;
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
|
||
this_sentence.push_back(0);
|
||
}
|
||
} else if (word2ids_.count(word)) {
|
||
const auto &ids = word2ids_.at(word);
|
||
if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
|
||
this_sentence.push_back(0);
|
||
}
|
||
|
||
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||
this_sentence.push_back(space_id);
|
||
} else {
|
||
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
|
||
|
||
piper::eSpeakPhonemeConfig config;
|
||
|
||
config.voice = voice;
|
||
|
||
std::vector<std::vector<piper::Phoneme>> phonemes;
|
||
|
||
CallPhonemizeEspeak(word, config, &phonemes);
|
||
// Note phonemes[i] contains a vector of unicode codepoints;
|
||
// we need to convert them to utf8
|
||
|
||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
|
||
|
||
std::vector<int32_t> ids;
|
||
for (const auto &v : phonemes) {
|
||
for (const auto p : v) {
|
||
auto token = conv.to_bytes(p);
|
||
if (token2id_.count(token)) {
|
||
ids.push_back(token2id_.at(token));
|
||
} else {
|
||
SHERPA_ONNX_LOGE("Skip OOV token '%s' from '%s'", token.c_str(),
|
||
word.c_str());
|
||
}
|
||
}
|
||
}
|
||
|
||
if (this_sentence.size() + ids.size() + 3 > max_len - 2) {
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
|
||
this_sentence.push_back(0);
|
||
}
|
||
|
||
this_sentence.insert(this_sentence.end(), ids.begin(), ids.end());
|
||
this_sentence.push_back(space_id);
|
||
}
|
||
}
|
||
|
||
if (this_sentence.size() > 1) {
|
||
this_sentence.push_back(0);
|
||
ans.push_back(std::move(this_sentence));
|
||
}
|
||
|
||
if (debug_) {
|
||
for (const auto &v : ans) {
|
||
std::ostringstream os;
|
||
os << "\n";
|
||
std::string sep;
|
||
for (auto i : v) {
|
||
os << sep << i;
|
||
sep = " ";
|
||
}
|
||
os << "\n";
|
||
SHERPA_ONNX_LOGE("%s", os.str().c_str());
|
||
}
|
||
}
|
||
|
||
return ans;
|
||
}
|
||
|
||
void InitTokens(const std::string &tokens) {
|
||
std::ifstream is(tokens);
|
||
InitTokens(is);
|
||
}
|
||
|
||
template <typename Manager>
|
||
void InitTokens(Manager *mgr, const std::string &tokens) {
|
||
auto buf = ReadFile(mgr, tokens);
|
||
|
||
std::istrstream is(buf.data(), buf.size());
|
||
InitTokens(is);
|
||
}
|
||
|
||
// Replaces token2id_ with the token table read from `is`.
void InitTokens(std::istream &is) {
  token2id_ = ReadTokens(is);  // defined in ./symbol-table.cc
}
|
||
|
||
void InitLexicon(const std::string &lexicon) {
|
||
std::vector<std::string> files;
|
||
SplitStringToVector(lexicon, ",", false, &files);
|
||
for (const auto &f : files) {
|
||
std::ifstream is(f);
|
||
InitLexicon(is);
|
||
}
|
||
}
|
||
|
||
template <typename Manager>
|
||
void InitLexicon(Manager *mgr, const std::string &lexicon) {
|
||
std::vector<std::string> files;
|
||
SplitStringToVector(lexicon, ",", false, &files);
|
||
for (const auto &f : files) {
|
||
auto buf = ReadFile(mgr, f);
|
||
|
||
std::istrstream is(buf.data(), buf.size());
|
||
InitLexicon(is);
|
||
}
|
||
}
|
||
|
||
void InitLexicon(std::istream &is) {
|
||
std::string word;
|
||
std::vector<std::string> token_list;
|
||
std::string token;
|
||
|
||
std::string line;
|
||
int32_t line_num = 0;
|
||
int32_t num_warn = 0;
|
||
while (std::getline(is, line)) {
|
||
++line_num;
|
||
std::istringstream iss(line);
|
||
|
||
token_list.clear();
|
||
iss >> word;
|
||
ToLowerCase(&word);
|
||
|
||
if (word2ids_.count(word)) {
|
||
num_warn += 1;
|
||
if (num_warn < 10) {
|
||
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.",
|
||
word.c_str(), line_num, line.c_str());
|
||
}
|
||
continue;
|
||
}
|
||
|
||
while (iss >> token) {
|
||
token_list.push_back(std::move(token));
|
||
}
|
||
|
||
std::vector<int32_t> ids = ConvertTokensToIds(token2id_, token_list);
|
||
|
||
if (ids.empty()) {
|
||
SHERPA_ONNX_LOGE(
|
||
"Invalid pronunciation for word '%s' at line %d:%s. Ignore it",
|
||
word.c_str(), line_num, line.c_str());
|
||
continue;
|
||
}
|
||
|
||
word2ids_.insert({std::move(word), std::move(ids)});
|
||
}
|
||
}
|
||
|
||
void InitJieba(const std::string &dict_dir) {
|
||
std::string dict = dict_dir + "/jieba.dict.utf8";
|
||
std::string hmm = dict_dir + "/hmm_model.utf8";
|
||
std::string user_dict = dict_dir + "/user.dict.utf8";
|
||
std::string idf = dict_dir + "/idf.utf8";
|
||
std::string stop_word = dict_dir + "/stop_words.utf8";
|
||
|
||
AssertFileExists(dict);
|
||
AssertFileExists(hmm);
|
||
AssertFileExists(user_dict);
|
||
AssertFileExists(idf);
|
||
AssertFileExists(stop_word);
|
||
|
||
jieba_ =
|
||
std::make_unique<cppjieba::Jieba>(dict, hmm, user_dict, idf, stop_word);
|
||
}
|
||
|
||
private:
|
||
OfflineTtsKokoroModelMetaData meta_data_;
|
||
|
||
// word to token IDs
|
||
std::unordered_map<std::string, std::vector<int32_t>> word2ids_;
|
||
|
||
// tokens.txt is saved in token2id_
|
||
std::unordered_map<std::string, int32_t> token2id_;
|
||
|
||
std::unique_ptr<cppjieba::Jieba> jieba_;
|
||
bool debug_ = false;
|
||
};
|
||
|
||
// Defaulted in this file, after the definition of Impl.
// NOTE(review): presumably required because impl_ is a std::unique_ptr<Impl>
// and Impl is incomplete in the header - confirm against the header.
KokoroMultiLangLexicon::~KokoroMultiLangLexicon() = default;
|
||
|
||
// Creates the lexicon from files on the local file system. All arguments are
// forwarded unchanged to the constructor of Impl.
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
    : impl_(std::make_unique<Impl>(tokens, lexicon, dict_dir, data_dir,
                                   meta_data, debug)) {}
|
||
|
||
template <typename Manager>
|
||
KokoroMultiLangLexicon::KokoroMultiLangLexicon(
|
||
Manager *mgr, const std::string &tokens, const std::string &lexicon,
|
||
const std::string &dict_dir, const std::string &data_dir,
|
||
const OfflineTtsKokoroModelMetaData &meta_data, bool debug)
|
||
: impl_(std::make_unique<Impl>(mgr, tokens, lexicon, dict_dir, data_dir,
|
||
meta_data, debug)) {}
|
||
|
||
std::vector<TokenIDs> KokoroMultiLangLexicon::ConvertTextToTokenIds(
|
||
const std::string &text, const std::string & /*unused_voice = ""*/) const {
|
||
return impl_->ConvertTextToTokenIds(text);
|
||
}
|
||
|
||
// Explicit instantiation of the templated constructor for the Android asset
// manager, so that its definition can stay in this file.
#if __ANDROID_API__ >= 9
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    AAssetManager *mgr, const std::string &tokens, const std::string &lexicon,
    const std::string &dict_dir, const std::string &data_dir,
    const OfflineTtsKokoroModelMetaData &meta_data, bool debug);
#endif
|
||
|
||
// Explicit instantiation of the templated constructor for the OHOS resource
// manager, so that its definition can stay in this file.
#if __OHOS__
template KokoroMultiLangLexicon::KokoroMultiLangLexicon(
    NativeResourceManager *mgr, const std::string &tokens,
    const std::string &lexicon, const std::string &dict_dir,
    const std::string &data_dir, const OfflineTtsKokoroModelMetaData &meta_data,
    bool debug);
#endif
|
||
|
||
} // namespace sherpa_onnx
|