Fix token handling for byte-level BPE. (#324)

This commit is contained in:
Fangjun Kuang
2023-09-20 07:49:53 +08:00
committed by GitHub
parent bd173b27cc
commit 6afa9c85f6
2 changed files with 14 additions and 1 deletions

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)
set(SHERPA_ONNX_VERSION "1.7.15")
set(SHERPA_ONNX_VERSION "1.7.16")
# Disable warning about
#

View File

@@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) {
}
}
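// For byte-level BPE: a token spelled <0xNN> is replaced by a one-byte string.
// id 0 is blank, id 1 is sos/eos, id 2 is unk, so ids 3..258 map to byte
// values 0..255 (byte value = id - 3).
// NOTE(review): std::hex alone prints lowercase digits without zero padding,
// so the two-character comparison below cannot match tokens such as <0x0A>
// or <0xFF> -- confirm the hex format used in the tokens file.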
if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
std::ostringstream os;
os << std::hex << (id - 3);
if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) {
uint8_t i = id - 3;
sym = std::string(&i, &i + 1);
}
}
assert(!sym.empty());
assert(sym2id_.count(sym) == 0);
assert(id2sym_.count(id) == 0);