Fix splitting text by languages for kokoro tts. (#1849)

2025-02-13 18:19:34 +08:00
parent 115e9c2247
commit 944400e399
7 changed files with 204 additions and 36 deletions
--- a/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
+++ b/sherpa-onnx/csrc/kokoro-multi-lang-lexicon.cc
@@ -4,9 +4,7 @@

 #include "sherpa-onnx/csrc/kokoro-multi-lang-lexicon.h"

-#include <codecvt>
 #include <fstream>
-#include <locale>
 #include <regex>  // NOLINT
 #include <sstream>
 #include <strstream>
@@ -22,6 +20,8 @@
 #include "rawfile/raw_file_manager.h"
 #endif

+#include <codecvt>
+
 #include "cppjieba/Jieba.hpp"
 #include "espeak-ng/speak_lib.h"
 #include "phoneme_ids.hpp"
@@ -37,20 +37,6 @@ void CallPhonemizeEspeak(const std::string &text,
                         piper::eSpeakPhonemeConfig &config,  // NOLINT
                         std::vector<std::vector<piper::Phoneme>> *phonemes);

-static std::wstring ToWideString(const std::string &s) {
-  // see
-  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-  return converter.from_bytes(s);
-}
-
-static std::string ToString(const std::wstring &s) {
-  // see
-  // https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-  return converter.to_bytes(s);
-}
-
 class KokoroMultiLangLexicon::Impl {
 public:
  Impl(const std::string &tokens, const std::string &lexicon,
@@ -103,15 +89,19 @@ class KokoroMultiLangLexicon::Impl {

    // https://en.cppreference.com/w/cpp/regex
    // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
-    std::string expr =
-        "([;:,.?!'\"…\\(\\)“”])|([\\u4e00-\\u9fff]+)|([äöüßÄÖÜ\\u0000-\\u007f]+"
-        ")";
+    std::string expr_chinese = "([\\u4e00-\\u9fff]+)";
+    std::string expr_not_chinese = "([^\\u4e00-\\u9fff]+)";
+
+    std::string expr_both = expr_chinese + "|" + expr_not_chinese;

    auto ws = ToWideString(text);
-    std::wstring wexpr = ToWideString(expr);
-    std::wregex we(wexpr);
+    std::wstring wexpr_both = ToWideString(expr_both);
+    std::wregex we_both(wexpr_both);

-    auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
+    std::wstring wexpr_zh = ToWideString(expr_chinese);
+    std::wregex we_zh(wexpr_zh);
+
+    auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we_both);
    auto end = std::wsregex_iterator();

    std::vector<TokenIDs> ans;
@@ -119,21 +109,22 @@ class KokoroMultiLangLexicon::Impl {
    for (std::wsregex_iterator i = begin; i != end; ++i) {
      std::wsmatch match = *i;
      std::wstring match_str = match.str();
+
      auto ms = ToString(match_str);
      uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];

      std::vector<std::vector<int32_t>> ids_vec;
-
-      if (c < 0x80) {
-        if (debug_) {
-          SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
-        }
-        ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
-      } else {
+      if (std::regex_match(match_str, we_zh)) {
        if (debug_) {
          SHERPA_ONNX_LOGE("Chinese: %s", ms.c_str());
        }
        ids_vec = ConvertChineseToTokenIDs(ms);
+      } else {
+        if (debug_) {
+          SHERPA_ONNX_LOGE("Non-Chinese: %s", ms.c_str());
+        }
+
+        ids_vec = ConvertEnglishToTokenIDs(ms, meta_data_.voice);
      }

      for (const auto &ids : ids_vec) {
@@ -315,9 +306,10 @@ class KokoroMultiLangLexicon::Impl {
        this_sentence.push_back(space_id);
      } else {
        if (debug_) {
-          SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'", word.c_str());
+          SHERPA_ONNX_LOGE("Use espeak-ng to handle the OOV: '%s'",
+                           word.c_str());
        }
-        
+
        piper::eSpeakPhonemeConfig config;

        config.voice = voice;