Add inverse text normalization for non-streaming ASR (#1017)

This commit is contained in:
Fangjun Kuang
2024-06-17 14:28:53 +08:00
committed by GitHub
parent dd69a1b56b
commit b0f7ed3ee3
13 changed files with 380 additions and 19 deletions

View File

@@ -10,7 +10,7 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-lm-config.h"
#include "sherpa-onnx/csrc/offline-recognizer-impl.h"
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {
void OfflineRecognizerConfig::Register(ParseOptions *po) {
@@ -44,6 +44,16 @@ void OfflineRecognizerConfig::Register(ParseOptions *po) {
po->Register("hotwords-score", &hotwords_score,
"The bonus score for each token in context word/phrase. "
"Used only when decoding_method is modified_beam_search");
po->Register(
"rule-fsts", &rule_fsts,
"If not empty, it specifies fsts for inverse text normalization. "
"If there are multiple fsts, they are separated by a comma.");
po->Register(
"rule-fars", &rule_fars,
"If not empty, it specifies fst archives for inverse text normalization. "
"If there are multiple archives, they are separated by a comma.");
}
bool OfflineRecognizerConfig::Validate() const {
@@ -61,7 +71,7 @@ bool OfflineRecognizerConfig::Validate() const {
if (!hotwords_file.empty() && decoding_method != "modified_beam_search") {
SHERPA_ONNX_LOGE(
"Please use --decoding-method=modified_beam_search if you"
" provide --hotwords-file. Given --decoding-method=%s",
" provide --hotwords-file. Given --decoding-method='%s'",
decoding_method.c_str());
return false;
}
@@ -72,6 +82,34 @@ bool OfflineRecognizerConfig::Validate() const {
return false;
}
if (!hotwords_file.empty() && !FileExists(hotwords_file)) {
SHERPA_ONNX_LOGE("--hotwords-file: '%s' does not exist",
hotwords_file.c_str());
return false;
}
// Check every entry of the comma-separated rule_fsts list. The first entry
// for which FileExists() fails is logged and makes validation return false.
if (!rule_fsts.empty()) {
std::vector<std::string> files;
// NOTE(review): the `false` argument presumably means empty pieces are kept
// rather than dropped -- confirm against the declaration of
// SplitStringToVector. If so, a stray or trailing comma yields an empty
// name that is reported below as a missing file.
SplitStringToVector(rule_fsts, ",", false, &files);
for (const auto &f : files) {
if (!FileExists(f)) {
SHERPA_ONNX_LOGE("Rule fst '%s' does not exist. ", f.c_str());
return false;
}
}
}
// Check every entry of the comma-separated rule_fars list. The first entry
// for which FileExists() fails is logged and makes validation return false.
if (!rule_fars.empty()) {
std::vector<std::string> files;
// NOTE(review): the `false` argument presumably means empty pieces are kept
// rather than dropped -- confirm against the declaration of
// SplitStringToVector. If so, a stray or trailing comma yields an empty
// name that is reported below as a missing file.
SplitStringToVector(rule_fars, ",", false, &files);
for (const auto &f : files) {
if (!FileExists(f)) {
SHERPA_ONNX_LOGE("Rule far '%s' does not exist. ", f.c_str());
return false;
}
}
}
return model_config.Validate();
}
@@ -87,7 +125,9 @@ std::string OfflineRecognizerConfig::ToString() const {
os << "max_active_paths=" << max_active_paths << ", ";
os << "hotwords_file=\"" << hotwords_file << "\", ";
os << "hotwords_score=" << hotwords_score << ", ";
os << "blank_penalty=" << blank_penalty << ")";
os << "blank_penalty=" << blank_penalty << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\")";
return os.str();
}