Support removing invalid utf-8 sequences. (#1648)

This commit is contained in:
Fangjun Kuang
2024-12-25 19:32:13 +08:00
committed by GitHub
parent 08d771337b
commit b6f0f5fc2e
6 changed files with 164 additions and 0 deletions

View File

@@ -124,6 +124,9 @@ std::vector<std::string> SplitUtf8(const std::string &text);
// Two overloads of ToLowerCase; only the declarations are visible here.
//
// Overload taking `const std::string &s` and returning a std::string by
// value. Presumably returns a lower-cased copy of `s` and leaves the input
// untouched -- TODO confirm against the definition.
std::string ToLowerCase(const std::string &s);
// Overload taking a mutable `std::string *in_out` and returning void.
// Presumably lower-cases the pointed-to string in place -- TODO confirm
// against the definition (including whether a null pointer is tolerated).
void ToLowerCase(std::string *in_out);
// Declaration only; the definition is not visible in this header chunk.
//
// Takes `text` by const reference and returns a std::string by value, so the
// caller's string is not modified through this interface.
// Going by the name, presumably returns a copy of `text` with invalid UTF-8
// byte sequences dropped -- TODO confirm against the definition.
//
// @param text            Input string (assumed to be UTF-8 encoded bytes).
// @param show_debug_msg  Defaults to false. Presumably enables diagnostic
//                        output about removed bytes -- TODO confirm.
// @return A std::string; presumably the cleaned-up text (see note above).
std::string RemoveInvalidUtf8Sequences(const std::string &text,
bool show_debug_msg = false);
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_TEXT_UTILS_H_