decoder for open vocabulary keyword spotting (#505)

* various fixes to ContextGraph to support open vocabulary keywords decoder

* Add keyword spotter runtime

* Add binary

* First version works

* Minor fixes

* update text2token

* default values

* Add jni for kws

* add kws android project

* Minor fixes

* Remove unused interface

* Minor fixes

* Add workflow

* handle extra info in texts

* Minor fixes

* Add more comments

* Fix ci

* fix cpp style

* Add input box in android demo so that users can specify their keywords

* Fix cpp style

* Fix comments

* Minor fixes

* Minor fixes

* minor fixes

* Minor fixes

* Minor fixes

* Add CI

* Fix code style

* cpplint

* Fix comments

* Fix error
This commit is contained in:
Wei Kang
2024-01-20 22:52:41 +08:00
committed by GitHub
parent bf1dd3daf6
commit b6c020901a
77 changed files with 3316 additions and 68 deletions

View File

@@ -36,13 +36,44 @@ import argparse
from sherpa_onnx import text2token
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--text",
type=str,
required=True,
help="Path to the input texts",
help="""Path to the input texts.
Each line in the texts contains the original phrase, it might also contain some
extra items, for example, the boosting score (starting with :), the triggering
threshold (starting with #, only used in keyword spotting task) and the original
phrase (starting with @). Note: extra items will be kept in the output.
example input 1 (tokens_type = ppinyin):
小爱同学 :2.0 #0.6 @小爱同学
你好问问 :3.5 @你好问问
小艺小艺 #0.6 @小艺小艺
example output 1:
x iǎo ài t óng x ué :2.0 #0.6 @小爱同学
n ǐ h ǎo w èn w èn :3.5 @你好问问
x iǎo y ì x iǎo y ì #0.6 @小艺小艺
example input 2 (tokens_type = bpe):
HELLO WORLD :1.5 #0.4
HI GOOGLE :2.0 #0.8
HEY SIRI #0.35
example output 2:
▁HE LL O ▁WORLD :1.5 #0.4
▁HI ▁GO O G LE :2.0 #0.8
▁HE Y ▁S I RI #0.35
""",
)
parser.add_argument(
@@ -56,7 +87,11 @@ def get_args():
"--tokens-type",
type=str,
required=True,
help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
choices=["cjkchar", "bpe", "cjkchar+bpe", "fpinyin", "ppinyin"],
help="""The type of modeling units, should be cjkchar, bpe, cjkchar+bpe, fpinyin or ppinyin.
fpinyin means full pinyin, each cjkchar has a pinyin(with tone).
ppinyin means partial pinyin, it splits pinyin into initial and final.
""",
)
parser.add_argument(
@@ -79,9 +114,21 @@ def main():
args = get_args()
texts = []
# Extra information such as the boosting score (starts with :), the triggering threshold (starts with #),
# and the original keyword (starts with @).
extra_info = []
with open(args.text, "r", encoding="utf8") as f:
for line in f:
texts.append(line.strip())
extra = []
text = []
toks = line.strip().split()
for tok in toks:
if tok[0] == ":" or tok[0] == "#" or tok[0] == "@":
extra.append(tok)
else:
text.append(tok)
texts.append(" ".join(text))
extra_info.append(extra)
encoded_texts = text2token(
texts,
tokens=args.tokens,
@@ -89,7 +136,8 @@ def main():
bpe_model=args.bpe_model,
)
with open(args.output, "w", encoding="utf8") as f:
for txt in encoded_texts:
for i, txt in enumerate(encoded_texts):
txt += extra_info[i]
f.write(" ".join(txt) + "\n")