Fix hotwords OOV log (#1139)
This commit is contained in:
@@ -62,9 +62,9 @@ static bool EncodeBase(const std::vector<std::string> &lines,
|
|||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
SHERPA_ONNX_LOGE(
|
SHERPA_ONNX_LOGE(
|
||||||
"Cannot find ID for token %s at line: %s. (Hint: words on "
|
"Cannot find ID for token %s at line: %s. (Hint: Check the "
|
||||||
"the same line are separated by spaces)",
|
"tokens.txt see if %s in it)",
|
||||||
word.c_str(), line.c_str());
|
word.c_str(), line.c_str(), word.c_str());
|
||||||
has_oov = true;
|
has_oov = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import re
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
|
||||||
def text2token(
|
def text2token(
|
||||||
texts: List[str],
|
texts: List[str],
|
||||||
tokens: str,
|
tokens: str,
|
||||||
@@ -35,18 +36,18 @@ def text2token(
|
|||||||
try:
|
try:
|
||||||
import sentencepiece as spm
|
import sentencepiece as spm
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print('Please run')
|
print("Please run")
|
||||||
print(' pip install sentencepiece')
|
print(" pip install sentencepiece")
|
||||||
print('before you continue')
|
print("before you continue")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from pypinyin import pinyin
|
from pypinyin import pinyin
|
||||||
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
|
from pypinyin.contrib.tone_convert import to_initials, to_finals_tone
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print('Please run')
|
print("Please run")
|
||||||
print(' pip install pypinyin')
|
print(" pip install pypinyin")
|
||||||
print('before you continue')
|
print("before you continue")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
assert Path(tokens).is_file(), f"File not exists, {tokens}"
|
assert Path(tokens).is_file(), f"File not exists, {tokens}"
|
||||||
@@ -119,7 +120,10 @@ def text2token(
|
|||||||
if txt in tokens_table:
|
if txt in tokens_table:
|
||||||
text_list.append(tokens_table[txt] if output_ids else txt)
|
text_list.append(tokens_table[txt] if output_ids else txt)
|
||||||
else:
|
else:
|
||||||
print(f"OOV token : {txt}, skipping text : {text}.")
|
print(
|
||||||
|
f"Can't find token {txt} in token table, check your "
|
||||||
|
f"tokens.txt see if {txt} in it. skipping text : {text}."
|
||||||
|
)
|
||||||
contain_oov = True
|
contain_oov = True
|
||||||
break
|
break
|
||||||
if contain_oov:
|
if contain_oov:
|
||||||
|
|||||||
Reference in New Issue
Block a user