Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
Wei Kang
2024-05-20 19:41:36 +08:00
committed by GitHub
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions


@@ -49,6 +49,8 @@ class OfflineRecognizer(object):
hotwords_file: str = "",
hotwords_score: float = 1.5,
blank_penalty: float = 0.0,
modeling_unit: str = "cjkchar",
bpe_vocab: str = "",
debug: bool = False,
provider: str = "cpu",
model_type: str = "transducer",
@@ -91,6 +93,16 @@ class OfflineRecognizer(object):
hotwords_file is given with modified_beam_search as decoding method.
blank_penalty:
The penalty applied on blank symbol during decoding.
modeling_unit:
The modeling unit of the model. Commonly used units are bpe,
cjkchar, cjkchar+bpe, etc. Currently, it is needed only when
hotwords are provided; we use it to encode the hotwords into
token sequences.
bpe_vocab:
The vocabulary generated by Google's sentencepiece program.
It is a file with two columns: the token and its log probability.
You can get it from the directory where your bpe model was
generated. Only used when hotwords are provided and the
modeling unit is bpe or cjkchar+bpe.
debug:
True to show debug messages.
provider:
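The `bpe_vocab` file described above has one token and one log probability per line, separated by a tab. A minimal sketch of reading that format in Python (the file contents and the helper name `load_bpe_vocab` are hypothetical, not part of this commit):

```python
import os
import tempfile

def load_bpe_vocab(path):
    # Parse a sentencepiece-style .vocab file: each line is
    # "<token>\t<log probability>".
    vocab = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            token, logprob = line.rstrip("\n").split("\t")
            vocab[token] = float(logprob)
    return vocab

# Write a tiny fake vocab just to demonstrate the format;
# a real file comes from the directory of your bpe model.
with tempfile.NamedTemporaryFile("w", suffix=".vocab",
                                 delete=False, encoding="utf-8") as f:
    f.write("<unk>\t0\n\u2581HE\t-3.2\nLLO\t-4.1\n")
    path = f.name

vocab = load_bpe_vocab(path)
os.unlink(path)
print(len(vocab))    # number of tokens parsed
print(vocab["LLO"])  # its log probability
```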
@@ -107,6 +119,8 @@ class OfflineRecognizer(object):
num_threads=num_threads,
debug=debug,
provider=provider,
modeling_unit=modeling_unit,
bpe_vocab=bpe_vocab,
model_type=model_type,
)
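To illustrate why the modeling unit matters, here is a toy sketch of encoding a hotword into a token sequence by greedy longest-match against a BPE-style vocabulary. This is only an illustration of the idea; the actual encoding added by this commit happens on the C++ side using the model's real vocabulary, and the vocabulary below is made up:

```python
def encode_hotword(word, vocab):
    # sentencepiece marks a word boundary with "\u2581".
    text = "\u2581" + word.upper()
    tokens = []
    while text:
        # Try the longest remaining prefix first.
        for end in range(len(text), 0, -1):
            piece = text[:end]
            if piece in vocab:
                tokens.append(piece)
                text = text[end:]
                break
        else:
            # No piece matched: emit the unknown token and skip one char.
            tokens.append("<unk>")
            text = text[1:]
    return tokens

# Toy vocabulary for demonstration only.
toy_vocab = {"\u2581HE", "LLO", "\u2581", "H", "E", "L", "O", "<unk>"}
print(encode_hotword("hello", toy_vocab))
```

With `modeling_unit="cjkchar"` the same step would instead split the hotword into individual characters, which is why the recognizer needs to know the unit before it can turn a hotword string into token IDs.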