Fix hotwords OOV log (#1139)

This commit is contained in:
Wei Kang
2024-07-16 19:41:31 +08:00
committed by GitHub
parent 960eb7529e
commit 5b1fa8750f
2 changed files with 15 additions and 11 deletions

View File

@@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
break; break;
default: default:
SHERPA_ONNX_LOGE( SHERPA_ONNX_LOGE(
"Cannot find ID for token %s at line: %s. (Hint: words on " "Cannot find ID for token %s at line: %s. (Hint: Check the "
"the same line are separated by spaces)", "tokens.txt see if %s in it)",
word.c_str(), line.c_str()); word.c_str(), line.c_str(), word.c_str());
has_oov = true; has_oov = true;
break; break;
} }

View File

@@ -4,6 +4,7 @@ import re
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union from typing import List, Optional, Union
def text2token( def text2token(
texts: List[str], texts: List[str],
tokens: str, tokens: str,
@@ -35,18 +36,18 @@ def text2token(
try: try:
import sentencepiece as spm import sentencepiece as spm
except ImportError: except ImportError:
print('Please run') print("Please run")
print(' pip install sentencepiece') print(" pip install sentencepiece")
print('before you continue') print("before you continue")
raise raise
try: try:
from pypinyin import pinyin from pypinyin import pinyin
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
except ImportError: except ImportError:
print('Please run') print("Please run")
print(' pip install pypinyin') print(" pip install pypinyin")
print('before you continue') print("before you continue")
raise raise
assert Path(tokens).is_file(), f"File not exists, {tokens}" assert Path(tokens).is_file(), f"File not exists, {tokens}"
@@ -119,7 +120,10 @@ def text2token(
if txt in tokens_table: if txt in tokens_table:
text_list.append(tokens_table[txt] if output_ids else txt) text_list.append(tokens_table[txt] if output_ids else txt)
else: else:
print(f"OOV token : {txt}, skipping text : {text}.") print(
f"Can't find token {txt} in token table, check your "
f"tokens.txt see if {txt} in it. skipping text : {text}."
)
contain_oov = True contain_oov = True
break break
if contain_oov: if contain_oov: