Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
This commit is contained in:
Wei Kang
2024-05-20 19:41:36 +08:00
committed by GitHub
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions

View File

@@ -58,6 +58,8 @@ class OnlineRecognizer(object):
hotwords_file: str = "",
provider: str = "cpu",
model_type: str = "",
modeling_unit: str = "cjkchar",
bpe_vocab: str = "",
lm: str = "",
lm_scale: float = 0.1,
temperature_scale: float = 2.0,
@@ -136,6 +138,16 @@ class OnlineRecognizer(object):
model_type:
Online transducer model type. Valid values are: conformer, lstm,
zipformer, zipformer2. All other values lead to loading the model twice.
modeling_unit:
The modeling unit of the model. Commonly used units are bpe, cjkchar,
cjkchar+bpe, etc. Currently, it is needed only when hotwords are
provided; it is used to encode the hotwords into token sequences.
bpe_vocab:
The vocabulary generated by Google's sentencepiece program.
It is a file that has two columns: one is the token, the other is
the log probability. You can get it from the directory where
your bpe model is generated. Only used when hotwords are provided
and the modeling unit is bpe or cjkchar+bpe.
"""
self = cls.__new__(cls)
_assert_file_exists(tokens)
@@ -157,6 +169,8 @@ class OnlineRecognizer(object):
num_threads=num_threads,
provider=provider,
model_type=model_type,
modeling_unit=modeling_unit,
bpe_vocab=bpe_vocab,
debug=debug,
)