Encode hotwords in C++ side (#828)

* Encode hotwords in C++ side
This commit is contained in:
Wei Kang
2024-05-20 19:41:36 +08:00
committed by GitHub
parent 8af2af8466
commit b012b78ceb
43 changed files with 714 additions and 102 deletions

View File

@@ -35,6 +35,17 @@ void OfflineModelConfig::Register(ParseOptions *po) {
"Valid values are: transducer, paraformer, nemo_ctc, whisper, "
"tdnn, zipformer2_ctc"
"All other values lead to loading the model twice.");
// Register the "modeling-unit" option and bind it to modeling_unit.
// Per the help text, the value is only needed when hotwords are supplied,
// so that they can be encoded into token sequences.
po->Register("modeling-unit", &modeling_unit,
"The modeling unit of the model, commonly used units are bpe, "
"cjkchar, cjkchar+bpe, etc. Currently, it is needed only when "
"hotwords are provided, we need it to encode the hotwords into "
"token sequence.");
// Register the "bpe-vocab" option and bind it to bpe_vocab.
// Per the help text, this is the path to a sentencepiece vocabulary file and
// is only consulted when hotwords are given and the modeling unit is
// "bpe" or "cjkchar+bpe".
po->Register("bpe-vocab", &bpe_vocab,
"The vocabulary generated by google's sentencepiece program. "
"It is a file has two columns, one is the token, the other is "
"the log probability, you can get it from the directory where "
"your bpe model is generated. Only used when hotwords provided "
"and the modeling unit is bpe or cjkchar+bpe");
}
bool OfflineModelConfig::Validate() const {
@@ -48,6 +59,14 @@ bool OfflineModelConfig::Validate() const {
return false;
}
// When the modeling unit is "bpe" or "cjkchar+bpe", bpe_vocab must pass the
// FileExists() check; otherwise log an error and fail validation.
// NOTE(review): the !modeling_unit.empty() term is redundant, because an
// empty string can never compare equal to either literal below.
if (!modeling_unit.empty() &&
(modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) {
if (!FileExists(bpe_vocab)) {
SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str());
return false;
}
}
if (!paraformer.model.empty()) {
return paraformer.Validate();
}
@@ -90,7 +109,9 @@ std::string OfflineModelConfig::ToString() const {
os << "num_threads=" << num_threads << ", ";
os << "debug=" << (debug ? "True" : "False") << ", ";
os << "provider=\"" << provider << "\", ";
os << "model_type=\"" << model_type << "\")";
os << "model_type=\"" << model_type << "\", ";
os << "modeling_unit=\"" << modeling_unit << "\", ";
os << "bpe_vocab=\"" << bpe_vocab << "\")";
return os.str();
}