From 29a5d06691fe9d9e8cd12896d425ccbf91b73408 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 25 Oct 2023 14:55:27 +0800 Subject: [PATCH] Fix utf8 spliting for English (#386) --- sherpa-onnx/csrc/text-utils.cc | 55 +++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index 3aefbf9e..a1843ce5 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -162,10 +162,63 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, bool omit_empty_strings, std::vector *out); +static std::vector MergeCharactersIntoWords( + const std::vector &words) { + std::vector ans; + + int32_t n = static_cast(words.size()); + int32_t i = 0; + int32_t prev = -1; + + while (i < n) { + const auto &w = words[i]; + if (w.size() > 1 || + (w.size() == 1 && (std::ispunct(w[0]) || std::isspace(w[0])))) { + if (prev != -1) { + std::string t; + for (; prev < i; ++prev) { + t.append(words[prev]); + } + prev = -1; + ans.push_back(std::move(t)); + } + + if (!std::isspace(w[0])) { + ans.push_back(w); + } + ++i; + continue; + } + + if (w.size() == 1) { + if (prev == -1) { + prev = i; + } + ++i; + continue; + } + + SHERPA_ONNX_LOGE("Ignore %s", w.c_str()); + ++i; + } + + if (prev != -1) { + std::string t; + for (; prev < i; ++prev) { + t.append(words[prev]); + } + ans.push_back(std::move(t)); + } + + return ans; +} + std::vector SplitUtf8(const std::string &text) { const uint8_t *begin = reinterpret_cast(text.c_str()); const uint8_t *end = begin + text.size(); + // Note that English words are split into single characters. + // We need to invoke MergeCharactersIntoWords() to merge them std::vector ans; auto start = begin; @@ -195,7 +248,7 @@ std::vector SplitUtf8(const std::string &text) { } } - return ans; + return MergeCharactersIntoWords(ans); } } // namespace sherpa_onnx