Fix Byte BPE string results for Python. (#512)

The fix ignores invalid UTF-8 strings.
This commit is contained in:
Fangjun Kuang
2024-01-03 16:03:24 +08:00
committed by GitHub
parent d01142173a
commit e215d0c39a
6 changed files with 54 additions and 3 deletions

View File

@@ -5,7 +5,9 @@
#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
#include <ios>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
}
auto sym = sym_table[src.tokens[i]];
text.append(sym);
if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}
r.tokens.push_back(std::move(sym));
}
r.text = std::move(text);

View File

@@ -6,8 +6,10 @@
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
#include <fstream>
#include <ios>
#include <memory>
#include <regex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert(
auto sym = sym_table[i];
text.append(sym);
if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}
r.tokens.push_back(std::move(sym));
}
r.text = std::move(text);

View File

@@ -6,7 +6,9 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
#include <algorithm>
#include <ios>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
auto sym = sym_table[i];
r.text.append(sym);
if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}
r.tokens.push_back(std::move(sym));
}

View File

@@ -6,8 +6,10 @@
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
#include <algorithm>
#include <ios>
#include <memory>
#include <regex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include <vector>
@@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
auto sym = sym_table[i];
r.text.append(sym);
if (sym.size() == 1 && sym[0] != ' ') {
// for byte bpe models
std::ostringstream os;
os << "<0x" << std::hex << std::uppercase
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
sym = os.str();
}
r.tokens.push_back(std::move(sym));
}