Fix splitting text by languages for kokoro tts. (#1849)
This commit is contained in:
@@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const {
|
||||
return OfflineStream{s};
|
||||
}
|
||||
|
||||
// Creates an offline stream that is configured with the given hotwords.
//
// `hotwords` is forwarded as a C string, together with p_, to
// SherpaOnnxCreateOfflineStreamWithHotwords(); whatever that call returns is
// wrapped in an OfflineStream and handed back to the caller.
OfflineStream OfflineRecognizer::CreateStream(
    const std::string &hotwords) const {
  auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
  return OfflineStream{s};
}
|
||||
|
||||
@@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
|
||||
context-graph-test.cc
|
||||
packed-sequence-test.cc
|
||||
pad-sequence-test.cc
|
||||
regex-lang-test.cc
|
||||
slice-test.cc
|
||||
stack-test.cc
|
||||
text-utils-test.cc
|
||||
|
||||
@@ -4,9 +4,7 @@
|
||||
|
||||
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
||||
|
||||
#include <codecvt>
|
||||
#include <fstream>
|
||||
#include <locale>
|
||||
#include <regex> // NOLINT
|
||||
#include <sstream>
|
||||
#include <strstream>
|
||||
@@ -22,6 +20,8 @@
|
||||
#include "rawfile/raw_file_manager.h"
|
||||
#endif
|
||||
|
||||
#include <codecvt>
|
||||
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "espeak-ng/speak_lib.h"
|
||||
#include "phoneme_ids.hpp"
|
||||
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
|
||||
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||||
std::vector<std::vector<piper::Phoneme>> *phonemes);
|
||||
|
||||
// Converts the UTF-8 encoded string `s` to a std::wstring.
//
// The conversion goes through std::codecvt_utf8_utf16<wchar_t>, i.e. the
// result holds UTF-16 code units. No error string is passed to the converter,
// so per the std::wstring_convert contract from_bytes() throws
// std::range_error when `s` cannot be converted.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::wstring ToWideString(const std::string &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.from_bytes(s);
}
|
||||
|
||||
// Converts the wide string `s` back to a UTF-8 encoded std::string.
//
// This is the inverse direction of the std::codecvt_utf8_utf16<wchar_t>
// conversion: the wide characters are read as UTF-16 code units. No error
// string is passed to the converter, so per the std::wstring_convert contract
// to_bytes() throws std::range_error when the conversion fails.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
static std::string ToString(const std::wstring &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.to_bytes(s);
}
|
||||
|
||||
class KokoroMultiLangLexicon::Impl {
|
||||
public:
|
||||
Impl(const std::string &tokens, const std::string &lexicon,
|
||||
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {
|
||||
|
||||
// https://en.cppreference.com/w/cpp/regex
|
||||
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
||||
std::string expr =
|
||||
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
|
||||
")";
|
||||
std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
|
||||
std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";
|
||||
|
||||
std::string expr_both = expr_chinese + "|" + expr_not_chinese;
|
||||
|
||||
auto ws = ToWideString(text);
|
||||
std::wstring wexpr = ToWideString(expr);
|
||||
std::wregex we(wexpr);
|
||||
std::wstring wexpr_both = ToWideString(expr_both);
|
||||
std::wregex we_both(wexpr_both);
|
||||
|
||||
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
||||
std::wstring wexpr_zh = ToWideString(expr_chinese);
|
||||
std::wregex we_zh(wexpr_zh);
|
||||
|
||||
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
|
||||
auto end = std::wsregex_iterator();
|
||||
|
||||
std::vector<TokenIDs> ans;
|
||||
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
|
||||
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||||
std::wsmatch match = *i;
|
||||
std::wstring match_str = match.str();
|
||||
|
||||
auto ms = ToString(match_str);
|
||||
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
|
||||
|
||||
std::vector<std::vector<int32_t>> ids_vec;
|
||||
|
||||
if (c < 0x80) {
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||
}
|
||||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||
} else {
|
||||
if (std::regex_match(match_str, we_zh)) {
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
||||
}
|
||||
ids_vec = ConvertChineseToTokenIDs(ms);
|
||||
} else {
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||
}
|
||||
|
||||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||
}
|
||||
|
||||
for (const auto &ids : ids_vec) {
|
||||
@@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
|
||||
this_sentence.push_back(space_id);
|
||||
} else {
|
||||
if (debug_) {
|
||||
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
|
||||
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
|
||||
word.c_str());
|
||||
}
|
||||
|
||||
|
||||
piper::eSpeakPhonemeConfig config;
|
||||
|
||||
config.voice = voice;
|
||||
|
||||
86
sherpa-onnx/csrc/regex-lang-test.cc
Normal file
86
sherpa-onnx/csrc/regex-lang-test.cc
Normal file
@@ -0,0 +1,86 @@
|
||||
// sherpa-onnx/csrc/regex-lang-test.cc
|
||||
//
|
||||
// Copyright (c) 2025 Xiaomi Corporation
|
||||
|
||||
#include <regex> // NOLINT
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.cc"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
static void TestLang(const std::string &expr, const std::string &text,
|
||||
const std::vector<std::string> &expected) {
|
||||
auto ws = ToWideString(text);
|
||||
std::wstring wexpr = ToWideString(expr);
|
||||
std::wregex we(wexpr);
|
||||
|
||||
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
||||
auto end = std::wsregex_iterator();
|
||||
int32_t k = 0;
|
||||
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||||
std::wsmatch match = *i;
|
||||
std::wstring match_str = match.str();
|
||||
auto ms = ToString(match_str);
|
||||
std::cout << ms << "\n";
|
||||
EXPECT_EQ(ms, expected[k]);
|
||||
k++;
|
||||
}
|
||||
EXPECT_EQ(k, expected.size());
|
||||
}
|
||||
|
||||
// German text embedded between Chinese characters: the character class below
// must pick out the three non-Chinese runs, including umlauts, sharp s and
// the euro sign, and leave the Chinese characters unmatched.
TEST(German, Case1) {
  std::cout << "----------Test German----------";
  // see https://character-table.netlify.app/german/
  //
  // Every piece of the literal ends on a complete \uXXXX escape so that no
  // escape is split across two adjacent string literals.
  std::string expr =
      "([\\u0020-\\u005f\\u0061-"
      "\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df"
      "\\u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-"
      "\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";

  std::string text =
      "开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";

  std::vector<std::string> expected = {"Übeltäter übergibt Ärzten ",
                                       "öfters äußerst ätzende Öle", "3€"};

  TestLang(expr, text, expected);
}
|
||||
|
||||
// French text with single Chinese characters inserted between phrases: the
// character class below must return the four French runs (accented letters
// and trailing spaces included) and skip the Chinese characters.
TEST(French, Case1) {
  std::string expr =
      "([\\u0020-\\u005f\\u0061-"
      "\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-"
      "\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-"
      "\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-"
      "\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-"
      "\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-"
      "\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-"
      "\\u2030\\u20ac\\u2212]+)";
  std::string text =
      "L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";
  std::vector<std::string> expected = {
      "L'été, ",
      "avec son ciel bleuâtre, ",
      "est un moment où, ",
      "Noël, maçon",
  };
  TestLang(expr, text, expected);
}
|
||||
|
||||
// English text with single Chinese characters inserted before each sentence:
// the character class below must return the two English runs and skip the
// Chinese characters.
TEST(English, Case1) {
  // https://character-table.netlify.app/english/
  std::string expr =
      "([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-"
      "\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-"
      "\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";
  std::string text = "一how are you doing? 二Thank you!";

  std::vector<std::string> expected = {
      "how are you doing? ",
      "Thank you!",
  };
  TestLang(expr, text, expected);
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
@@ -8,6 +8,14 @@
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Lower-casing UTF-8 text that contains German umlauts: the upper-case
// letters H, Ü, Ä and Ö must be lowered while every other character,
// including the sharp s and the euro sign, is left unchanged.
TEST(ToLowerCase, WideString) {
  std::string text =
      "Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";
  auto t = ToLowerCase(text);
  std::cout << text << "\n";
  std::cout << t << "\n";

  // Pin the result so that the test fails on a regression instead of only
  // printing the converted text.
  EXPECT_EQ(t,
            "hallo! übeltäter übergibt ärzten öfters äußerst ätzende öle 3€");
}
|
||||
|
||||
TEST(RemoveInvalidUtf8Sequences, Case1) {
|
||||
std::vector<uint8_t> v = {
|
||||
0xe4, 0xbb, 0x8a, // 今
|
||||
|
||||
@@ -8,8 +8,11 @@
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <codecvt>
|
||||
#include <cstdint>
|
||||
#include <cwctype>
|
||||
#include <limits>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
|
||||
}
|
||||
|
||||
std::string ToLowerCase(const std::string &s) {
|
||||
std::string ans(s.size(), 0);
|
||||
std::transform(s.begin(), s.end(), ans.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
return ans;
|
||||
return ToString(ToLowerCase(ToWideString(s)));
|
||||
}
|
||||
|
||||
void ToLowerCase(std::string *in_out) {
|
||||
@@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) {
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
}
|
||||
|
||||
// Returns a lower-cased copy of the wide string `s`.
//
// Accented upper-case letters are mapped explicitly by code point so that the
// result does not rely on std::towlower() handling them:
//   - U+00C0..U+00DE (À..Þ, Latin-1 Supplement) map to the letter 0x20 above
//     them, except U+00D7 (the multiplication sign), which is not a letter.
//     This covers the French, German and Spanish letters such as
//     À Â Æ Ç È É Ê Ë Î Ï Ô Ù Û Ü Á Í Ó Ú Ñ Ì Ò Ä Ö.
//   - U+0152 (Œ) maps to U+0153 (œ) and U+0178 (Ÿ) maps to U+00FF (ÿ).
// Every other character is passed to std::towlower().
//
// Code points are written numerically so the mapping does not depend on how
// the compiler decodes non-ASCII characters in this source file.
std::wstring ToLowerCase(const std::wstring &s) {
  std::wstring ans(s.size(), 0);
  std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t {
    // Latin-1 Supplement upper-case letters.
    if (c >= 0x00C0 && c <= 0x00DE && c != 0x00D7) {
      return static_cast<wchar_t>(c + 0x20);
    }

    switch (c) {
      case 0x0152:  // Œ
        return static_cast<wchar_t>(0x0153);
      case 0x0178:  // Ÿ
        return static_cast<wchar_t>(0x00FF);
        // TODO(fangjun): Add more

      default:
        return static_cast<wchar_t>(std::towlower(c));
    }
  });
  return ans;
}
|
||||
|
||||
// Returns true if `x` lies in the closed interval [low, high].
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
  return low <= x && x <= high;
}
|
||||
@@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Converts the UTF-8 encoded string `s` to a std::wstring.
//
// The conversion goes through std::codecvt_utf8_utf16<wchar_t>, i.e. the
// result holds UTF-16 code units. No error string is passed to the converter,
// so per the std::wstring_convert contract from_bytes() throws
// std::range_error when `s` cannot be converted.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::wstring ToWideString(const std::string &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.from_bytes(s);
}
|
||||
|
||||
// Converts the wide string `s` back to a UTF-8 encoded std::string.
//
// This is the inverse direction of the std::codecvt_utf8_utf16<wchar_t>
// conversion: the wide characters are read as UTF-16 code units. No error
// string is passed to the converter, so per the std::wstring_convert contract
// to_bytes() throws std::range_error when the conversion fails.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::string ToString(const std::wstring &s) {
  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
  return converter.to_bytes(s);
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text);
|
||||
std::string ToLowerCase(const std::string &s);
|
||||
void ToLowerCase(std::string *in_out);
|
||||
|
||||
std::wstring ToLowerCase(const std::wstring &s);
|
||||
|
||||
std::string RemoveInvalidUtf8Sequences(const std::string &text,
|
||||
bool show_debug_msg = false);
|
||||
|
||||
@@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text);
|
||||
std::string Gb2312ToUtf8(const std::string &text);
|
||||
#endif
|
||||
|
||||
std::wstring ToWideString(const std::string &s);
|
||||
|
||||
std::string ToString(const std::wstring &s);
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
|
||||
|
||||
Reference in New Issue
Block a user