decoder for open vocabulary keyword spotting (#505)

* various fixes to ContextGraph to support open vocabulary keywords decoder * Add keyword spotter runtime * Add binary * First version works * Minor fixes * update text2token * default values * Add jni for kws * add kws android project * Minor fixes * Remove unused interface * Minor fixes * Add workflow * handle extra info in texts * Minor fixes * Add more comments * Fix ci * fix cpp style * Add input box in android demo so that users can specify their keywords * Fix cpp style * Fix comments * Minor fixes * Minor fixes * minor fixes * Minor fixes * Minor fixes * Add CI * Fix code style * cpplint * Fix comments * Fix error
2024-01-20 22:52:41 +08:00
parent bf1dd3daf6
commit b6c020901a
77 changed files with 3316 additions and 68 deletions
--- a/scripts/text2token.py
+++ b/scripts/text2token.py
@@ -36,13 +36,44 @@ import argparse

 from sherpa_onnx import text2token

+
 def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        required=True,
-        help="Path to the input texts",
+        help="""Path to the input texts.
+
+        Each line in the texts contains the original phrase; it might also contain some
+        extra items, for example, the boosting score (starting with :), the triggering
+        threshold (starting with #, only used in keyword spotting task) and the original
+        phrase (starting with @). Note: extra items will be kept in the output.
+
+        example input 1 (tokens_type = ppinyin):
+
+        小爱同学 :2.0 #0.6 @小爱同学
+        你好问问 :3.5 @你好问问
+        小艺小艺 #0.6 @小艺小艺
+
+        example output 1:
+
+        x iǎo ài t óng x ué :2.0 #0.6 @小爱同学
+        n ǐ h ǎo w èn w èn :3.5 @你好问问
+        x iǎo y ì x iǎo y ì #0.6 @小艺小艺
+
+        example input 2 (tokens_type = bpe):
+
+        HELLO WORLD :1.5 #0.4
+        HI GOOGLE :2.0 #0.8
+        HEY SIRI #0.35
+
+        example output 2:
+
+        ▁HE LL O ▁WORLD :1.5 #0.4
+        ▁HI ▁GO O G LE :2.0 #0.8
+        ▁HE Y ▁S I RI #0.35
+        """,
    )

    parser.add_argument(
@@ -56,7 +87,11 @@ def get_args():
        "--tokens-type",
        type=str,
        required=True,
-        help="The type of modeling units, should be cjkchar, bpe or cjkchar+bpe",
+        choices=["cjkchar", "bpe", "cjkchar+bpe", "fpinyin", "ppinyin"],
+        help="""The type of modeling units, should be cjkchar, bpe, cjkchar+bpe, fpinyin or ppinyin.
+        fpinyin means full pinyin: each cjkchar has a pinyin (with tone).
+        ppinyin means partial pinyin: it splits pinyin into initial and final.
+        """,
    )

    parser.add_argument(
@@ -79,9 +114,21 @@ def main():
    args = get_args()

    texts = []
+    # Extra information such as the boosting score (starts with :), the triggering
+    # threshold (starts with #) and the original keyword (starts with @).
+    extra_info = []
    with open(args.text, "r", encoding="utf8") as f:
        for line in f:
-            texts.append(line.strip())
+            extra = []
+            text = []
+            toks = line.strip().split()
+            for tok in toks:
+                if tok[0] == ":" or tok[0] == "#" or tok[0] == "@":
+                    extra.append(tok)
+                else:
+                    text.append(tok)
+            texts.append(" ".join(text))
+            extra_info.append(extra)
    encoded_texts = text2token(
        texts,
        tokens=args.tokens,
@@ -89,7 +136,8 @@ def main():
        bpe_model=args.bpe_model,
    )
    with open(args.output, "w", encoding="utf8") as f:
-        for txt in encoded_texts:
+        for i, txt in enumerate(encoded_texts):
+            txt += extra_info[i]
            f.write(" ".join(txt) + "\n")