Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
2024-05-20 19:41:36 +08:00
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions
--- a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
+++ b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py
@@ -58,6 +58,8 @@ class OnlineRecognizer(object):
        hotwords_file: str = "",
        provider: str = "cpu",
        model_type: str = "",
+        modeling_unit: str = "cjkchar",
+        bpe_vocab: str = "",
        lm: str = "",
        lm_scale: float = 0.1,
        temperature_scale: float = 2.0,
@@ -136,6 +138,16 @@ class OnlineRecognizer(object):
          model_type:
            Online transducer model type. Valid values are: conformer, lstm,
            zipformer, zipformer2. All other values lead to loading the model twice.
+          modeling_unit:
+            The modeling unit of the model. Commonly used units are bpe, cjkchar,
+            cjkchar+bpe, etc. Currently, it is needed only when hotwords are
+            provided; it is used to encode the hotwords into token sequences.
+          bpe_vocab:
+            The vocabulary generated by Google's sentencepiece program.
+            It is a file with two columns: one is the token, the other is
+            the log probability. You can get it from the directory where
+            your bpe model is generated. Only used when hotwords are provided
+            and the modeling unit is bpe or cjkchar+bpe.
        """
        self = cls.__new__(cls)
        _assert_file_exists(tokens)
@@ -157,6 +169,8 @@ class OnlineRecognizer(object):
            num_threads=num_threads,
            provider=provider,
            model_type=model_type,
+            modeling_unit=modeling_unit,
+            bpe_vocab=bpe_vocab,
            debug=debug,
        )