Fix tokens for byte-level BPE. (#324)
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
|
||||
project(sherpa-onnx)
|
||||
|
||||
set(SHERPA_ONNX_VERSION "1.7.15")
|
||||
set(SHERPA_ONNX_VERSION "1.7.16")
|
||||
|
||||
# Disable warning about
|
||||
#
|
||||
|
||||
@@ -46,6 +46,19 @@ void SymbolTable::Init(std::istream &is) {
|
||||
}
|
||||
}
|
||||
|
||||
// for byte-level BPE
|
||||
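// id 0 is blank, id 1 is sos/eos, id 2 is unk, so a byte value b is
// expected at id b + 3, spelled as the 6-character token <0xXX>.
// NOTE(review): std::hex alone prints lowercase digits without zero padding
// (e.g. "a" for 10), which can never equal the 2-character field compared
// below for values < 0x10; confirm whether std::uppercase and
// std::setw(2)/std::setfill('0') are needed to match the tokens file.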
|
||||
if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
|
||||
sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
|
||||
std::ostringstream os;
|
||||
os << std::hex << (id - 3);
|
||||
|
||||
if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) {
|
||||
uint8_t i = id - 3;
|
||||
sym = std::string(&i, &i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
assert(!sym.empty());
|
||||
assert(sym2id_.count(sym) == 0);
|
||||
assert(id2sym_.count(id) == 0);
|
||||
|
||||
Reference in New Issue
Block a user