// sherpa-onnx/csrc/utils.cc // // Copyright 2023 Xiaomi Corporation #include "sherpa-onnx/csrc/utils.h" #include #include #include #include #include #include "sherpa-onnx/csrc/log.h" #include "sherpa-onnx/csrc/macros.h" namespace sherpa_onnx { bool EncodeHotwords(std::istream &is, const SymbolTable &symbol_table, std::vector> *hotwords) { hotwords->clear(); std::vector tmp; std::string line; std::string word; while (std::getline(is, line)) { std::istringstream iss(line); std::vector syms; while (iss >> word) { if (word.size() >= 3) { // For BPE-based models, we replace ▁ with a space // Unicode 9601, hex 0x2581, utf8 0xe29681 const uint8_t *p = reinterpret_cast(word.c_str()); if (p[0] == 0xe2 && p[1] == 0x96 && p[2] == 0x81) { word = word.replace(0, 3, " "); } } if (symbol_table.contains(word)) { int32_t number = symbol_table[word]; tmp.push_back(number); } else { SHERPA_ONNX_LOGE( "Cannot find ID for hotword %s at line: %s. (Hint: words on " "the " "same line are separated by spaces)", word.c_str(), line.c_str()); return false; } } hotwords->push_back(std::move(tmp)); } return true; } } // namespace sherpa_onnx