Fix Byte BPE string results for Python. (#512)
It ignores invalid UTF-8 strings.
This commit is contained in:
@@ -6,7 +6,9 @@
|
||||
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <ios>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
@@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
|
||||
auto sym = sym_table[i];
|
||||
|
||||
r.text.append(sym);
|
||||
|
||||
if (sym.size() == 1 && sym[0] != ' ') {
|
||||
// for byte bpe models
|
||||
std::ostringstream os;
|
||||
os << "<0x" << std::hex << std::uppercase
|
||||
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||
sym = os.str();
|
||||
}
|
||||
|
||||
r.tokens.push_back(std::move(sym));
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user