From 323f532ad2225ef37102f41c83e24f8067a468c3 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Fri, 13 Oct 2023 10:51:59 +0800
Subject: [PATCH] Fix symbol table for byte bpe (#361)

---
 sherpa-onnx/csrc/symbol-table.cc | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/sherpa-onnx/csrc/symbol-table.cc b/sherpa-onnx/csrc/symbol-table.cc
index 0144cea4..6898d898 100644
--- a/sherpa-onnx/csrc/symbol-table.cc
+++ b/sherpa-onnx/csrc/symbol-table.cc
@@ -60,7 +60,16 @@ void SymbolTable::Init(std::istream &is) {
     }
 
     assert(!sym.empty());
-    assert(sym2id_.count(sym) == 0);
+
+    // for byte bpe, after replacing ▁ with a space, whose ascii is also 0x20,
+    // there is a conflict between the real byte 0x20 and ▁, so we disable
+    // the following check.
+    //
+    // Note: Only id2sym_ matters as we use it to convert ID to symbols.
+    if (sym != " ") {
+      assert(sym2id_.count(sym) == 0);
+    }
+
     assert(id2sym_.count(id) == 0);
 
     sym2id_.insert({sym, id});