Refactor hotwords, support loading hotwords from file (#296)
This commit is contained in:
54
sherpa-onnx/csrc/utils.cc
Normal file
54
sherpa-onnx/csrc/utils.cc
Normal file
@@ -0,0 +1,54 @@
|
||||
// sherpa-onnx/csrc/utils.cc
|
||||
//
|
||||
// Copyright 2023 Xiaomi Corporation
|
||||
|
||||
#include "sherpa-onnx/csrc/utils.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/log.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
bool EncodeHotwords(std::istream &is, const SymbolTable &symbol_table,
|
||||
std::vector<std::vector<int32_t>> *hotwords) {
|
||||
hotwords->clear();
|
||||
std::vector<int32_t> tmp;
|
||||
std::string line;
|
||||
std::string word;
|
||||
|
||||
while (std::getline(is, line)) {
|
||||
std::istringstream iss(line);
|
||||
std::vector<std::string> syms;
|
||||
while (iss >> word) {
|
||||
if (word.size() >= 3) {
|
||||
// For BPE-based models, we replace ▁ with a space
|
||||
// Unicode 9601, hex 0x2581, utf8 0xe29681
|
||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(word.c_str());
|
||||
if (p[0] == 0xe2 && p[1] == 0x96 && p[2] == 0x81) {
|
||||
word = word.replace(0, 3, " ");
|
||||
}
|
||||
}
|
||||
if (symbol_table.contains(word)) {
|
||||
int32_t number = symbol_table[word];
|
||||
tmp.push_back(number);
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE(
|
||||
"Cannot find ID for hotword %s at line: %s. (Hint: words on "
|
||||
"the "
|
||||
"same line are separated by spaces)",
|
||||
word.c_str(), line.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
hotwords->push_back(std::move(tmp));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
Reference in New Issue
Block a user