Fix hotwords OOV log (#1139)

2024-07-16 19:41:31 +08:00
parent 960eb7529e
commit 5b1fa8750f
2 changed files with 15 additions and 11 deletions
--- a/sherpa-onnx/csrc/utils.cc
+++ b/sherpa-onnx/csrc/utils.cc
@@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
            break;
          default:
            SHERPA_ONNX_LOGE(
-                "Cannot find ID for token %s at line: %s. (Hint: words on "
+                "Cannot find ID for token %s at line: %s. (Hint: Check the "
-                "the same line are separated by spaces)",
+                "tokens.txt see if %s in it)",
-                word.c_str(), line.c_str());
+                word.c_str(), line.c_str(), word.c_str());
            has_oov = true;
            break;
        }
--- a/sherpa-onnx/python/sherpa_onnx/utils.py
+++ b/sherpa-onnx/python/sherpa_onnx/utils.py
@@ -4,6 +4,7 @@ import re
 from pathlib import Path
 from typing import List, Optional, Union

+
 def text2token(
    texts: List[str],
    tokens: str,
@@ -35,18 +36,18 @@ def text2token(
    try:
        import sentencepiece as spm
    except ImportError:
-        print('Please run')
+        print("Please run")
-        print('  pip install sentencepiece')
+        print("  pip install sentencepiece")
-        print('before you continue')
+        print("before you continue")
        raise
    try:
        from pypinyin import pinyin
        from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
    except ImportError:
-        print('Please run')
+        print("Please run")
-        print('  pip install pypinyin')
+        print("  pip install pypinyin")
-        print('before you continue')
+        print("before you continue")
        raise
    assert Path(tokens).is_file(), f"File not exists, {tokens}"
@@ -119,7 +120,10 @@ def text2token(
            if txt in tokens_table:
                text_list.append(tokens_table[txt] if output_ids else txt)
            else:
-                print(f"OOV token : {txt}, skipping text : {text}.")
+                print(
+                    f"Can't find token {txt} in token table, check your "
+                    f"tokens.txt see if {txt} in it. skipping text : {text}."
+                )
                contain_oov = True
                break
        if contain_oov: