Support printing input text and words after splitting (#376)

2023-10-20 12:06:30 +08:00
parent 2a932aca86
commit 3ba9a4932f
3 changed files with 37 additions and 3 deletions
--- a/sherpa-onnx/csrc/lexicon.cc
+++ b/sherpa-onnx/csrc/lexicon.cc
@@ -76,7 +76,9 @@ static std::vector<int32_t> ConvertTokensToIds(
 }

 Lexicon::Lexicon(const std::string &lexicon, const std::string &tokens,
-                 const std::string &punctuations, const std::string &language) {
+                 const std::string &punctuations, const std::string &language,
+                 bool debug /*= false*/)
+    : debug_(debug) {
  InitLanguage(language);
  InitTokens(tokens);
  InitLexicon(lexicon);
@@ -102,6 +104,20 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsChinese(
    const std::string &text) const {
  std::vector<std::string> words = SplitUtf8(text);

+  if (debug_) {
+    fprintf(stderr, "Input text in string: %s\n", text.c_str());
+    fprintf(stderr, "Input text in bytes:");
+    for (uint8_t c : text) {
+      fprintf(stderr, " %02x", c);
+    }
+    fprintf(stderr, "\n");
+    fprintf(stderr, "After splitting to words:");
+    for (const auto &w : words) {
+      fprintf(stderr, " %s", w.c_str());
+    }
+    fprintf(stderr, "\n");
+  }
+
  std::vector<int64_t> ans;

  auto sil = token2id_.at("sil");
@@ -134,6 +150,21 @@ std::vector<int64_t> Lexicon::ConvertTextToTokenIdsEnglish(
  ToLowerCase(&text);

  std::vector<std::string> words = SplitUtf8(text);
+
+  if (debug_) {
+    fprintf(stderr, "Input text (lowercase) in string: %s\n", text.c_str());
+    fprintf(stderr, "Input text in bytes:");
+    for (uint8_t c : text) {
+      fprintf(stderr, " %02x", c);
+    }
+    fprintf(stderr, "\n");
+    fprintf(stderr, "After splitting to words:");
+    for (const auto &w : words) {
+      fprintf(stderr, " %s", w.c_str());
+    }
+    fprintf(stderr, "\n");
+  }
+
  int32_t blank = token2id_.at(" ");

  std::vector<int64_t> ans;