- Avoid output like: `[' K', '<0x64>', '<0x79>', 'ť', ' a', '<0x75>', 'to', 'bu', '<0x73>', '<0x75>', ... ]` with regular (non-byte) BPE models of 500 units. - Don't rewrite 1-char tokens in the range [0x20 (space) .. 0x7E (tilde)].
This commit is contained in:
@@ -45,8 +45,10 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
|
|||||||
auto sym = sym_table[src.tokens[i]];
|
auto sym = sym_table[src.tokens[i]];
|
||||||
text.append(sym);
|
text.append(sym);
|
||||||
|
|
||||||
if (sym.size() == 1 && sym[0] != ' ') {
|
if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
|
||||||
// for byte bpe models
|
// for byte bpe models
|
||||||
|
// (but don't rewrite printable characters 0x20..0x7e,
|
||||||
|
// which collide with standard BPE units)
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
os << "<0x" << std::hex << std::uppercase
|
os << "<0x" << std::hex << std::uppercase
|
||||||
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
|||||||
@@ -46,8 +46,10 @@ static OfflineRecognitionResult Convert(
|
|||||||
auto sym = sym_table[i];
|
auto sym = sym_table[i];
|
||||||
text.append(sym);
|
text.append(sym);
|
||||||
|
|
||||||
if (sym.size() == 1 && sym[0] != ' ') {
|
if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
|
||||||
// for byte bpe models
|
// for byte bpe models,
|
||||||
|
// (but don't rewrite printable characters 0x20..0x7e,
|
||||||
|
// which collide with standard BPE units)
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
os << "<0x" << std::hex << std::uppercase
|
os << "<0x" << std::hex << std::uppercase
|
||||||
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
|||||||
@@ -38,8 +38,10 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
|
|||||||
|
|
||||||
r.text.append(sym);
|
r.text.append(sym);
|
||||||
|
|
||||||
if (sym.size() == 1 && sym[0] != ' ') {
|
if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
|
||||||
// for byte bpe models
|
// for byte bpe models
|
||||||
|
// (but don't rewrite printable characters 0x20..0x7e,
|
||||||
|
// which collide with standard BPE units)
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
os << "<0x" << std::hex << std::uppercase
|
os << "<0x" << std::hex << std::uppercase
|
||||||
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
|||||||
@@ -50,8 +50,10 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
|
|||||||
|
|
||||||
r.text.append(sym);
|
r.text.append(sym);
|
||||||
|
|
||||||
if (sym.size() == 1 && sym[0] != ' ') {
|
if (sym.size() == 1 && (sym[0] < 0x20 || sym[0] > 0x7e)) {
|
||||||
// for byte bpe models
|
// for byte bpe models
|
||||||
|
// (but don't rewrite printable characters 0x20..0x7e,
|
||||||
|
// which collide with standard BPE units)
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
os << "<0x" << std::hex << std::uppercase
|
os << "<0x" << std::hex << std::uppercase
|
||||||
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
|||||||
Reference in New Issue
Block a user