From 43b2b7760de8e3d103a0081877759c84a0e1472e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 22 Sep 2023 13:28:19 +0800 Subject: [PATCH] Fix tokens processing for byte-level BPE (#333) --- CMakeLists.txt | 2 +- sherpa-onnx/csrc/offline-stream.cc | 53 ++++++++++++++++++++++-------- sherpa-onnx/csrc/symbol-table.cc | 2 +- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86f0a29d..b111d535 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.7.17") +set(SHERPA_ONNX_VERSION "1.7.18") # Disable warning about # diff --git a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index e317d8d5..cf7ca341 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -8,9 +8,9 @@ #include #include +#include #include "kaldi-native-fbank/csrc/online-feature.h" -#include "nlohmann/json.hpp" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/resample.h" @@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const { return impl_->GetResult(); } std::string OfflineRecognitionResult::AsJsonString() const { - nlohmann::json j; - j["text"] = text; - j["tokens"] = tokens; -#if 1 - // This branch chooses number of decimal points to keep in - // the return json string std::ostringstream os; - os << "["; + os << "{"; + os << "\"text\"" + << ": "; + os << "\"" << text << "\"" + << ", "; + + os << "\"" + << "timestamps" + << "\"" + << ": "; + os << "\"["; + std::string sep = ""; for (auto t : timestamps) { os << sep << std::fixed << std::setprecision(2) << t; sep = ", "; } - os << "]"; - j["timestamps"] = os.str(); -#else - j["timestamps"] = timestamps; -#endif + os << "]\", "; - return j.dump(); + os << "\"" + << "tokens" + << "\"" + << ":"; + os << "["; + + sep = ""; + auto oldFlags = os.flags(); + for (const auto &t : tokens) { + if (t.size() == 1 && static_cast(t[0]) > 0x7f) { + const uint8_t *p = reinterpret_cast(t.c_str()); + os << sep << "\"" + << "<0x" << std::hex << std::uppercase << static_cast(p[0]) + << ">" + << "\""; + os.flags(oldFlags); + } else { + os << sep << "\"" << t << "\""; + } + sep = ", "; + } + os << "]"; + os << "}"; + + return os.str(); } } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/symbol-table.cc b/sherpa-onnx/csrc/symbol-table.cc index 6fb69d9b..0144cea4 100644 --- a/sherpa-onnx/csrc/symbol-table.cc +++ b/sherpa-onnx/csrc/symbol-table.cc @@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) { if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' && sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') { std::ostringstream os; - os << std::hex << (id - 3); + os << std::hex << std::uppercase << (id - 3); if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) { uint8_t i = id - 3;