Fix splitting text by languages for kokoro tts. (#1849)
This commit is contained in:
@@ -270,7 +270,8 @@ OfflineStream OfflineRecognizer::CreateStream() const {
|
|||||||
return OfflineStream{s};
|
return OfflineStream{s};
|
||||||
}
|
}
|
||||||
|
|
||||||
OfflineStream OfflineRecognizer::CreateStream(const std::string &hotwords) const {
|
OfflineStream OfflineRecognizer::CreateStream(
|
||||||
|
const std::string &hotwords) const {
|
||||||
auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
|
auto s = SherpaOnnxCreateOfflineStreamWithHotwords(p_, hotwords.c_str());
|
||||||
return OfflineStream{s};
|
return OfflineStream{s};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -549,6 +549,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
|
|||||||
context-graph-test.cc
|
context-graph-test.cc
|
||||||
packed-sequence-test.cc
|
packed-sequence-test.cc
|
||||||
pad-sequence-test.cc
|
pad-sequence-test.cc
|
||||||
|
regex-lang-test.cc
|
||||||
slice-test.cc
|
slice-test.cc
|
||||||
stack-test.cc
|
stack-test.cc
|
||||||
text-utils-test.cc
|
text-utils-test.cc
|
||||||
|
|||||||
@@ -4,9 +4,7 @@
|
|||||||
|
|
||||||
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
#include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"
|
||||||
|
|
||||||
#include <codecvt>
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <locale>
|
|
||||||
#include <regex> // NOLINT
|
#include <regex> // NOLINT
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <strstream>
|
#include <strstream>
|
||||||
@@ -22,6 +20,8 @@
|
|||||||
#include "rawfile/raw_file_manager.h"
|
#include "rawfile/raw_file_manager.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <codecvt>
|
||||||
|
|
||||||
#include "cppjieba/Jieba.hpp"
|
#include "cppjieba/Jieba.hpp"
|
||||||
#include "espeak-ng/speak_lib.h"
|
#include "espeak-ng/speak_lib.h"
|
||||||
#include "phoneme_ids.hpp"
|
#include "phoneme_ids.hpp"
|
||||||
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
|
|||||||
piper::eSpeakPhonemeConfig &config, // NOLINT
|
piper::eSpeakPhonemeConfig &config, // NOLINT
|
||||||
std::vector<std::vector<piper::Phoneme>> *phonemes);
|
std::vector<std::vector<piper::Phoneme>> *phonemes);
|
||||||
|
|
||||||
static std::wstring ToWideString(const std::string &s) {
|
|
||||||
// see
|
|
||||||
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
|
||||||
return converter.from_bytes(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string ToString(const std::wstring &s) {
|
|
||||||
// see
|
|
||||||
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
|
||||||
return converter.to_bytes(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
class KokoroMultiLangLexicon::Impl {
|
class KokoroMultiLangLexicon::Impl {
|
||||||
public:
|
public:
|
||||||
Impl(const std::string &tokens, const std::string &lexicon,
|
Impl(const std::string &tokens, const std::string &lexicon,
|
||||||
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
|
|
||||||
// https://en.cppreference.com/w/cpp/regex
|
// https://en.cppreference.com/w/cpp/regex
|
||||||
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
// https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
|
||||||
std::string expr =
|
std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
|
||||||
"([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
|
std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";
|
||||||
")";
|
|
||||||
|
std::string expr_both = expr_chinese + "|" + expr_not_chinese;
|
||||||
|
|
||||||
auto ws = ToWideString(text);
|
auto ws = ToWideString(text);
|
||||||
std::wstring wexpr = ToWideString(expr);
|
std::wstring wexpr_both = ToWideString(expr_both);
|
||||||
std::wregex we(wexpr);
|
std::wregex we_both(wexpr_both);
|
||||||
|
|
||||||
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
std::wstring wexpr_zh = ToWideString(expr_chinese);
|
||||||
|
std::wregex we_zh(wexpr_zh);
|
||||||
|
|
||||||
|
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
|
||||||
auto end = std::wsregex_iterator();
|
auto end = std::wsregex_iterator();
|
||||||
|
|
||||||
std::vector<TokenIDs> ans;
|
std::vector<TokenIDs> ans;
|
||||||
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||||||
std::wsmatch match = *i;
|
std::wsmatch match = *i;
|
||||||
std::wstring match_str = match.str();
|
std::wstring match_str = match.str();
|
||||||
|
|
||||||
auto ms = ToString(match_str);
|
auto ms = ToString(match_str);
|
||||||
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
|
uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
|
||||||
|
|
||||||
std::vector<std::vector<int32_t>> ids_vec;
|
std::vector<std::vector<int32_t>> ids_vec;
|
||||||
|
if (std::regex_match(match_str, we_zh)) {
|
||||||
if (c < 0x80) {
|
|
||||||
if (debug_) {
|
|
||||||
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
|
||||||
}
|
|
||||||
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
|
||||||
} else {
|
|
||||||
if (debug_) {
|
if (debug_) {
|
||||||
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
|
||||||
}
|
}
|
||||||
ids_vec = ConvertChineseToTokenIDs(ms);
|
ids_vec = ConvertChineseToTokenIDs(ms);
|
||||||
|
} else {
|
||||||
|
if (debug_) {
|
||||||
|
SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto &ids : ids_vec) {
|
for (const auto &ids : ids_vec) {
|
||||||
@@ -315,7 +306,8 @@ class KokoroMultiLangLexicon::Impl {
|
|||||||
this_sentence.push_back(space_id);
|
this_sentence.push_back(space_id);
|
||||||
} else {
|
} else {
|
||||||
if (debug_) {
|
if (debug_) {
|
||||||
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
|
SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
|
||||||
|
word.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
piper::eSpeakPhonemeConfig config;
|
piper::eSpeakPhonemeConfig config;
|
||||||
|
|||||||
86
sherpa-onnx/csrc/regex-lang-test.cc
Normal file
86
sherpa-onnx/csrc/regex-lang-test.cc
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
// sherpa-onnx/csrc/regex-lang-test.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2025 Xiaomi Corporation
|
||||||
|
|
||||||
|
#include <regex> // NOLINT
|
||||||
|
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
#include "sherpa-onnx/csrc/text-utils.cc"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
static void TestLang(const std::string &expr, const std::string &text,
|
||||||
|
const std::vector<std::string> &expected) {
|
||||||
|
auto ws = ToWideString(text);
|
||||||
|
std::wstring wexpr = ToWideString(expr);
|
||||||
|
std::wregex we(wexpr);
|
||||||
|
|
||||||
|
auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
|
||||||
|
auto end = std::wsregex_iterator();
|
||||||
|
int32_t k = 0;
|
||||||
|
for (std::wsregex_iterator i = begin; i != end; ++i) {
|
||||||
|
std::wsmatch match = *i;
|
||||||
|
std::wstring match_str = match.str();
|
||||||
|
auto ms = ToString(match_str);
|
||||||
|
std::cout << ms << "\n";
|
||||||
|
EXPECT_EQ(ms, expected[k]);
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
EXPECT_EQ(k, expected.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(German, Case1) {
|
||||||
|
std::cout << "----------Test German----------";
|
||||||
|
// see https://character-table.netlify.app/german/
|
||||||
|
std::string expr =
|
||||||
|
"([\\u0020-\\u005f\\u0061-"
|
||||||
|
"\\u007d\\u00a0\\u00a7\\u00a9\\u00ab\\u00bb\\u00c4\\u00d6\\u00dc\\u00df\\"
|
||||||
|
"u00e4\\u00f6\\u00fc\\u2010-\\u2011\\u2013-"
|
||||||
|
"\\u2014\\u2018\\u201a\\u201c\\u201e\\u2026\\u2030\\u20ac]+)";
|
||||||
|
|
||||||
|
std::string text =
|
||||||
|
"开始Übeltäter übergibt Ärzten 中间öfters äußerst ätzende Öle结束3€";
|
||||||
|
|
||||||
|
std::vector<std::string> expected = {"Übeltäter übergibt Ärzten ",
|
||||||
|
"öfters äußerst ätzende Öle", "3€"};
|
||||||
|
|
||||||
|
TestLang(expr, text, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(French, Case1) {
|
||||||
|
std::string expr =
|
||||||
|
"([\\u0020-\\u005f\\u0061-"
|
||||||
|
"\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u00ab\\u00b2-"
|
||||||
|
"\\u00b3\\u00bb\\u00c0\\u00c2\\u00c6-\\u00cb\\u00ce-"
|
||||||
|
"\\u00cf\\u00d4\\u00d9\\u00db-\\u00dc\\u00e0\\u00e2\\u00e6-"
|
||||||
|
"\\u00eb\\u00ee-\\u00ef\\u00f4\\u00f9\\u00fb-\\u00fc\\u00ff\\u0152-"
|
||||||
|
"\\u0153\\u0178\\u02b3\\u02e2\\u1d48-\\u1d49\\u2010-\\u2011\\u2013-"
|
||||||
|
"\\u2014\\u2019\\u201c-\\u201d\\u2020-\\u2021\\u2026\\u202f-"
|
||||||
|
"\\u2030\\u20ac\\u2212]+)";
|
||||||
|
std::string text =
|
||||||
|
"L'été, 一avec son ciel bleuâtre, 二est un moment où, 三Noël, maçon";
|
||||||
|
std::vector<std::string> expected = {
|
||||||
|
"L'été, ",
|
||||||
|
"avec son ciel bleuâtre, ",
|
||||||
|
"est un moment où, ",
|
||||||
|
"Noël, maçon",
|
||||||
|
};
|
||||||
|
TestLang(expr, text, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(English, Case1) {
|
||||||
|
// https://character-table.netlify.app/english/
|
||||||
|
std::string expr =
|
||||||
|
"([\\u0020-\\u005f\\u0061-\\u007a\\u007c\\u00a0\\u00a7\\u00a9\\u2010-"
|
||||||
|
"\\u2011\\u2013-\\u2014\\u2018-\\u2019\\u201c-\\u201d\\u2020-"
|
||||||
|
"\\u2021\\u2026\\u2030\\u2032-\\u2033\\u20ac]+)";
|
||||||
|
std::string text = "一how are you doing? 二Thank you!";
|
||||||
|
|
||||||
|
std::vector<std::string> expected = {
|
||||||
|
"how are you doing? ",
|
||||||
|
"Thank you!",
|
||||||
|
};
|
||||||
|
TestLang(expr, text, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
@@ -8,6 +8,14 @@
|
|||||||
|
|
||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
TEST(ToLowerCase, WideString) {
|
||||||
|
std::string text =
|
||||||
|
"Hallo! Übeltäter übergibt Ärzten öfters äußerst ätzende Öle 3€";
|
||||||
|
auto t = ToLowerCase(text);
|
||||||
|
std::cout << text << "\n";
|
||||||
|
std::cout << t << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
TEST(RemoveInvalidUtf8Sequences, Case1) {
|
TEST(RemoveInvalidUtf8Sequences, Case1) {
|
||||||
std::vector<uint8_t> v = {
|
std::vector<uint8_t> v = {
|
||||||
0xe4, 0xbb, 0x8a, // 今
|
0xe4, 0xbb, 0x8a, // 今
|
||||||
|
|||||||
@@ -8,8 +8,11 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cctype>
|
#include <cctype>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <cwctype>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#include <locale>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
@@ -389,10 +392,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string ToLowerCase(const std::string &s) {
|
std::string ToLowerCase(const std::string &s) {
|
||||||
std::string ans(s.size(), 0);
|
return ToString(ToLowerCase(ToWideString(s)));
|
||||||
std::transform(s.begin(), s.end(), ans.begin(),
|
|
||||||
[](unsigned char c) { return std::tolower(c); });
|
|
||||||
return ans;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ToLowerCase(std::string *in_out) {
|
void ToLowerCase(std::string *in_out) {
|
||||||
@@ -400,6 +400,66 @@ void ToLowerCase(std::string *in_out) {
|
|||||||
[](unsigned char c) { return std::tolower(c); });
|
[](unsigned char c) { return std::tolower(c); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::wstring ToLowerCase(const std::wstring &s) {
|
||||||
|
std::wstring ans(s.size(), 0);
|
||||||
|
std::transform(s.begin(), s.end(), ans.begin(), [](wchar_t c) -> wchar_t {
|
||||||
|
switch (c) {
|
||||||
|
// French
|
||||||
|
case L'À':
|
||||||
|
return L'à';
|
||||||
|
case L'Â':
|
||||||
|
return L'â';
|
||||||
|
case L'Æ':
|
||||||
|
return L'æ';
|
||||||
|
case L'Ç':
|
||||||
|
return L'ç';
|
||||||
|
case L'È':
|
||||||
|
return L'è';
|
||||||
|
case L'É':
|
||||||
|
return L'é';
|
||||||
|
case L'Ë':
|
||||||
|
return L'ë';
|
||||||
|
case L'Î':
|
||||||
|
return L'î';
|
||||||
|
case L'Ï':
|
||||||
|
return L'ï';
|
||||||
|
case L'Ô':
|
||||||
|
return L'ô';
|
||||||
|
case L'Ù':
|
||||||
|
return L'ù';
|
||||||
|
case L'Û':
|
||||||
|
return L'û';
|
||||||
|
case L'Ü':
|
||||||
|
return L'ü';
|
||||||
|
|
||||||
|
// others
|
||||||
|
case L'Á':
|
||||||
|
return L'á';
|
||||||
|
case L'Í':
|
||||||
|
return L'í';
|
||||||
|
case L'Ó':
|
||||||
|
return L'ó';
|
||||||
|
case L'Ú':
|
||||||
|
return L'ú';
|
||||||
|
case L'Ñ':
|
||||||
|
return L'ñ';
|
||||||
|
case L'Ì':
|
||||||
|
return L'ì';
|
||||||
|
case L'Ò':
|
||||||
|
return L'ò';
|
||||||
|
case L'Ä':
|
||||||
|
return L'ä';
|
||||||
|
case L'Ö':
|
||||||
|
return L'ö';
|
||||||
|
// TODO(fangjun): Add more
|
||||||
|
|
||||||
|
default:
|
||||||
|
return std::towlower(c);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
|
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
|
||||||
return low <= x && x <= high;
|
return low <= x && x <= high;
|
||||||
}
|
}
|
||||||
@@ -625,4 +685,18 @@ std::string Gb2312ToUtf8(const std::string &text) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
std::wstring ToWideString(const std::string &s) {
|
||||||
|
// see
|
||||||
|
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||||
|
return converter.from_bytes(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ToString(const std::wstring &s) {
|
||||||
|
// see
|
||||||
|
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||||
|
return converter.to_bytes(s);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|||||||
@@ -124,6 +124,8 @@ std::vector<std::string> SplitUtf8(const std::string &text);
|
|||||||
std::string ToLowerCase(const std::string &s);
|
std::string ToLowerCase(const std::string &s);
|
||||||
void ToLowerCase(std::string *in_out);
|
void ToLowerCase(std::string *in_out);
|
||||||
|
|
||||||
|
std::wstring ToLowerCase(const std::wstring &s);
|
||||||
|
|
||||||
std::string RemoveInvalidUtf8Sequences(const std::string &text,
|
std::string RemoveInvalidUtf8Sequences(const std::string &text,
|
||||||
bool show_debug_msg = false);
|
bool show_debug_msg = false);
|
||||||
|
|
||||||
@@ -139,6 +141,10 @@ bool IsGB2312(const std::string &text);
|
|||||||
std::string Gb2312ToUtf8(const std::string &text);
|
std::string Gb2312ToUtf8(const std::string &text);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
std::wstring ToWideString(const std::string &s);
|
||||||
|
|
||||||
|
std::string ToString(const std::wstring &s);
|
||||||
|
|
||||||
} // namespace sherpa_onnx
|
} // namespace sherpa_onnx
|
||||||
|
|
||||||
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
|
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_
|
||||||
|
|||||||
Reference in New Issue
Block a user