Fix spliting text by languages for kokoro tts. (#1849)

This commit is contained in:
Fangjun Kuang
2025-02-13 18:19:34 +08:00
committed by GitHub
parent 115e9c2247
commit 944400e399
7 changed files with 204 additions and 36 deletions

View File

@@ -4,9 +4,7 @@
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
#include <codecvt>
#include <fstream>
#include <locale>
#include <regex> // NOLINT
#include <sstream>
#include <strstream>
@@ -22,6 +20,8 @@
#include "rawfile/raw_file_manager.h"
#endif
#include <codecvt>
#include "cppjieba/Jieba.hpp"
#include "espeak-ng/speak_lib.h"
#include "phoneme_ids.hpp"
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
piper::eSpeakPhonemeConfig &config, // NOLINT
std::vector<std::vector<piper::Phoneme>> *phonemes);
static std::wstring ToWideString(const std::string &s) {
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(s);
}
static std::string ToString(const std::wstring &s) {
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.to_bytes(s);
}
class KokoroMultiLangLexicon::Impl {
public:
Impl(const std::string &tokens, const std::string &lexicon,
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {
// https://en.cppreference.com/w/cpp/regex
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
std::string expr =
"([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
")";
std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";
std::string expr_both = expr_chinese + "|" + expr_not_chinese;
auto ws = ToWideString(text);
std::wstring wexpr = ToWideString(expr);
std::wregex we(wexpr);
std::wstring wexpr_both = ToWideString(expr_both);
std::wregex we_both(wexpr_both);
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
std::wstring wexpr_zh = ToWideString(expr_chinese);
std::wregex we_zh(wexpr_zh);
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
auto end = std::wsregex_iterator();
std::vector<TokenIDs> ans;
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
for (std::wsregex_iterator i = begin; i != end; ++i) {
std::wsmatch match = *i;
std::wstring match_str = match.str();
auto ms = ToString(match_str);
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
std::vector<std::vector<int32_t>> ids_vec;
if (c < 0x80) {
if (debug_) {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
} else {
if (std::regex_match(match_str, we_zh)) {
if (debug_) {
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
}
ids_vec = ConvertChineseToTokenIDs(ms);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
}
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
}
for (const auto &ids : ids_vec) {
@@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
this_sentence.push_back(space_id);
} else {
if (debug_) {
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
word.c_str());
}
piper::eSpeakPhonemeConfig config;
config.voice = voice;