Fix utf8 splitting for English (#386)

2023-10-25 14:55:27 +08:00
parent 6e5efa48c5
commit 29a5d06691
1 changed files with 54 additions and 1 deletions
--- a/sherpa-onnx/csrc/text-utils.cc
+++ b/sherpa-onnx/csrc/text-utils.cc
@@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
                                  bool omit_empty_strings,
                                  std::vector<double> *out);
 // Merge runs of consecutive single-byte tokens into one word.
 //
 // @param words  Tokens to regroup. Each element is inspected only by its
 //               size and its first byte.
 // @return A new token list in which
 //   - every maximal run of single-byte tokens that are neither punctuation
 //     nor whitespace is concatenated into one string,
 //   - multi-byte tokens and single-byte punctuation are copied through
 //     unchanged and terminate the current run,
 //   - tokens whose first byte is whitespace terminate the current run and
 //     are dropped,
 //   - empty tokens are logged and skipped.
 static std::vector<std::string> MergeCharactersIntoWords(
    const std::vector<std::string> &words) {
  std::vector<std::string> ans;

  int32_t n = static_cast<int32_t>(words.size());

  // Index of the first token of the run being accumulated; -1 if none.
  int32_t prev = -1;

  // Concatenate words[prev, end) into a single entry of `ans` and reset the
  // pending run. Does nothing when no run is pending.
  auto flush = [&words, &ans, &prev](int32_t end) {
    if (prev == -1) {
      return;
    }

    std::string t;
    for (; prev < end; ++prev) {
      t.append(words[prev]);
    }
    prev = -1;
    ans.push_back(std::move(t));
  };

  for (int32_t i = 0; i < n; ++i) {
    const auto &w = words[i];

    if (w.empty()) {
      SHERPA_ONNX_LOGE("Ignore %s", w.c_str());
      continue;
    }

    // The <cctype> classifiers have undefined behaviour for negative
    // arguments other than EOF. The first byte of a multi-byte token is
    // >= 0x80, which is negative when char is signed, so convert to
    // unsigned char before classifying.
    const auto c = static_cast<unsigned char>(w[0]);
    const bool is_space = std::isspace(c) != 0;
    const bool is_delimiter =
        w.size() > 1 || std::ispunct(c) != 0 || is_space;

    if (!is_delimiter) {
      // A single-byte token that is part of a word: start a run if needed.
      if (prev == -1) {
        prev = i;
      }
      continue;
    }

    // A delimiter ends the pending run; whitespace itself is not emitted.
    flush(i);
    if (!is_space) {
      ans.push_back(w);
    }
  }

  // Emit the run that reaches the end of the input, if any.
  flush(n);

  return ans;
 }
 std::vector<std::string> SplitUtf8(const std::string &text) {
  const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
  const uint8_t *end = begin + text.size();
  // Note that English words are split into single characters.
  // We need to invoke MergeCharactersIntoWords() to merge them
  std::vector<std::string> ans;
  auto start = begin;
@@ -195,7 +248,7 @@ std::vector<std::string> SplitUtf8(const std::string &text) {
    }
  }
-  return ans;
+  return MergeCharactersIntoWords(ans);
 }
 }  // namespace sherpa_onnx