Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
2024-05-20 19:41:36 +08:00
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions
--- a/python-api-examples/offline-decode-files.py
+++ b/python-api-examples/offline-decode-files.py
@@ -110,11 +110,9 @@ def get_args():
        type=str,
        default="",
        help="""
-        The file containing hotwords, one words/phrases per line, and for each
-        phrase the bpe/cjkchar are separated by a space. For example:
-
-        ▁HE LL O ▁WORLD
-        你 好 世 界
+        The file containing hotwords, one word/phrase per line, e.g.:
+        HELLO WORLD
+        你好世界
        """,
    )

@@ -128,6 +126,28 @@ def get_args():
        """,
    )

+    parser.add_argument(
+        "--modeling-unit",
+        type=str,
+        default="",
+        help="""
+        The modeling unit of the model. Valid values are: cjkchar, bpe,
+        cjkchar+bpe. Used only when hotwords-file is given.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-vocab",
+        type=str,
+        default="",
+        help="""
+        The path to the bpe vocabulary. The vocabulary is generated by
+        sentencepiece; you can also export it from a bpe model with
+        `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
+        and modeling-unit is bpe or cjkchar+bpe.
+        """,
+    )
+
    parser.add_argument(
        "--encoder",
        default="",
@@ -347,6 +367,8 @@ def main():
            decoding_method=args.decoding_method,
            hotwords_file=args.hotwords_file,
            hotwords_score=args.hotwords_score,
+            modeling_unit=args.modeling_unit,
+            bpe_vocab=args.bpe_vocab,
            blank_penalty=args.blank_penalty,
            debug=args.debug,
        )
--- a/python-api-examples/online-decode-files.py
+++ b/python-api-examples/online-decode-files.py
@@ -198,11 +198,9 @@ def get_args():
        type=str,
        default="",
        help="""
-        The file containing hotwords, one words/phrases per line, and for each
-        phrase the bpe/cjkchar are separated by a space. For example:
-
-        ▁HE LL O ▁WORLD
-        你 好 世 界
+        The file containing hotwords, one word/phrase per line, e.g.:
+        HELLO WORLD
+        你好世界
        """,
    )

@@ -216,6 +214,28 @@ def get_args():
        """,
    )

+    parser.add_argument(
+        "--modeling-unit",
+        type=str,
+        default="",
+        help="""
+        The modeling unit of the model. Valid values are: cjkchar, bpe,
+        cjkchar+bpe. Used only when hotwords-file is given.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-vocab",
+        type=str,
+        default="",
+        help="""
+        The path to the bpe vocabulary. The vocabulary is generated by
+        sentencepiece; you can also export it from a bpe model with
+        `scripts/export_bpe_vocab.py`. Used only when hotwords-file is given
+        and modeling-unit is bpe or cjkchar+bpe.
+        """,
+    )
+
    parser.add_argument(
        "--blank-penalty",
        type=float,
@@ -302,6 +322,8 @@ def main():
            lm_scale=args.lm_scale,
            hotwords_file=args.hotwords_file,
            hotwords_score=args.hotwords_score,
+            modeling_unit=args.modeling_unit,
+            bpe_vocab=args.bpe_vocab,
            blank_penalty=args.blank_penalty,
        )
    elif args.zipformer2_ctc: