Fix Byte BPE string results for Python. (#512)
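The Python `text` property now decodes the result as UTF-8 with the "ignore" error handler, so invalid UTF-8 byte sequences are dropped.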
This commit is contained in:
@@ -5,7 +5,9 @@
|
|||||||
#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
|
#ifndef SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
|
||||||
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
|
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_CTC_IMPL_H_
|
||||||
|
|
||||||
|
#include <ios>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -42,6 +44,15 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src,
|
|||||||
}
|
}
|
||||||
auto sym = sym_table[src.tokens[i]];
|
auto sym = sym_table[src.tokens[i]];
|
||||||
text.append(sym);
|
text.append(sym);
|
||||||
|
|
||||||
|
if (sym.size() == 1 && sym[0] != ' ') {
|
||||||
|
// for byte bpe models
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "<0x" << std::hex << std::uppercase
|
||||||
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
sym = os.str();
|
||||||
|
}
|
||||||
|
|
||||||
r.tokens.push_back(std::move(sym));
|
r.tokens.push_back(std::move(sym));
|
||||||
}
|
}
|
||||||
r.text = std::move(text);
|
r.text = std::move(text);
|
||||||
|
|||||||
@@ -6,8 +6,10 @@
|
|||||||
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
|
#define SHERPA_ONNX_CSRC_OFFLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <ios>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <regex> // NOLINT
|
#include <regex> // NOLINT
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -44,6 +46,14 @@ static OfflineRecognitionResult Convert(
|
|||||||
auto sym = sym_table[i];
|
auto sym = sym_table[i];
|
||||||
text.append(sym);
|
text.append(sym);
|
||||||
|
|
||||||
|
if (sym.size() == 1 && sym[0] != ' ') {
|
||||||
|
// for byte bpe models
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "<0x" << std::hex << std::uppercase
|
||||||
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
sym = os.str();
|
||||||
|
}
|
||||||
|
|
||||||
r.tokens.push_back(std::move(sym));
|
r.tokens.push_back(std::move(sym));
|
||||||
}
|
}
|
||||||
r.text = std::move(text);
|
r.text = std::move(text);
|
||||||
|
|||||||
@@ -6,7 +6,9 @@
|
|||||||
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
|
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_CTC_IMPL_H_
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <ios>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -35,6 +37,15 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
|
|||||||
auto sym = sym_table[i];
|
auto sym = sym_table[i];
|
||||||
|
|
||||||
r.text.append(sym);
|
r.text.append(sym);
|
||||||
|
|
||||||
|
if (sym.size() == 1 && sym[0] != ' ') {
|
||||||
|
// for byte bpe models
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "<0x" << std::hex << std::uppercase
|
||||||
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
sym = os.str();
|
||||||
|
}
|
||||||
|
|
||||||
r.tokens.push_back(std::move(sym));
|
r.tokens.push_back(std::move(sym));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -6,8 +6,10 @@
|
|||||||
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
|
#define SHERPA_ONNX_CSRC_ONLINE_RECOGNIZER_TRANSDUCER_IMPL_H_
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <ios>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <regex> // NOLINT
|
#include <regex> // NOLINT
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@@ -47,6 +49,15 @@ static OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
|
|||||||
auto sym = sym_table[i];
|
auto sym = sym_table[i];
|
||||||
|
|
||||||
r.text.append(sym);
|
r.text.append(sym);
|
||||||
|
|
||||||
|
if (sym.size() == 1 && sym[0] != ' ') {
|
||||||
|
// for byte bpe models
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "<0x" << std::hex << std::uppercase
|
||||||
|
<< (static_cast<int32_t>(sym[0]) & 0xff) << ">";
|
||||||
|
sym = os.str();
|
||||||
|
}
|
||||||
|
|
||||||
r.tokens.push_back(std::move(sym));
|
r.tokens.push_back(std::move(sym));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,8 +23,12 @@ Args:
|
|||||||
static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
|
static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT
|
||||||
using PyClass = OfflineRecognitionResult;
|
using PyClass = OfflineRecognitionResult;
|
||||||
py::class_<PyClass>(*m, "OfflineRecognitionResult")
|
py::class_<PyClass>(*m, "OfflineRecognitionResult")
|
||||||
.def_property_readonly("text",
|
.def_property_readonly(
|
||||||
[](const PyClass &self) { return self.text; })
|
"text",
|
||||||
|
[](const PyClass &self) -> py::str {
|
||||||
|
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
|
||||||
|
self.text.size(), "ignore"));
|
||||||
|
})
|
||||||
.def_property_readonly("tokens",
|
.def_property_readonly("tokens",
|
||||||
[](const PyClass &self) { return self.tokens; })
|
[](const PyClass &self) { return self.tokens; })
|
||||||
.def_property_readonly(
|
.def_property_readonly(
|
||||||
|
|||||||
@@ -15,7 +15,11 @@ static void PybindOnlineRecognizerResult(py::module *m) {
|
|||||||
using PyClass = OnlineRecognizerResult;
|
using PyClass = OnlineRecognizerResult;
|
||||||
py::class_<PyClass>(*m, "OnlineRecognizerResult")
|
py::class_<PyClass>(*m, "OnlineRecognizerResult")
|
||||||
.def_property_readonly(
|
.def_property_readonly(
|
||||||
"text", [](PyClass &self) -> std::string { return self.text; })
|
"text",
|
||||||
|
[](PyClass &self) -> py::str {
|
||||||
|
return py::str(PyUnicode_DecodeUTF8(self.text.c_str(),
|
||||||
|
self.text.size(), "ignore"));
|
||||||
|
})
|
||||||
.def_property_readonly(
|
.def_property_readonly(
|
||||||
"tokens",
|
"tokens",
|
||||||
[](PyClass &self) -> std::vector<std::string> { return self.tokens; })
|
[](PyClass &self) -> std::vector<std::string> { return self.tokens; })
|
||||||
|
|||||||
Reference in New Issue
Block a user