Add inverse text normalization for online ASR (#1020)

This commit is contained in:
Fangjun Kuang
2024-06-17 18:39:23 +08:00
committed by GitHub
parent 6e09933d99
commit 349d957da2
12 changed files with 390 additions and 32 deletions

View File

@@ -14,7 +14,9 @@
#include <utility>
#include <vector>
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/online-recognizer-impl.h"
#include "sherpa-onnx/csrc/text-utils.h"
namespace sherpa_onnx {
@@ -100,6 +102,15 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) {
"now support greedy_search and modified_beam_search.");
po->Register("temperature-scale", &temperature_scale,
"Temperature scale for confidence computation in decoding.");
po->Register(
"rule-fsts", &rule_fsts,
"If not empty, it specifies fsts for inverse text normalization. "
"If there are multiple fsts, they are separated by a comma.");
po->Register(
"rule-fars", &rule_fars,
"If not empty, it specifies fst archives for inverse text normalization. "
"If there are multiple archives, they are separated by a comma.");
}
bool OnlineRecognizerConfig::Validate() const {
@@ -129,6 +140,34 @@ bool OnlineRecognizerConfig::Validate() const {
return false;
}
if (!hotwords_file.empty() && !FileExists(hotwords_file)) {
SHERPA_ONNX_LOGE("--hotwords-file: '%s' does not exist",
hotwords_file.c_str());
return false;
}
// Check every file listed in --rule-fsts (a comma-separated list of rule FSTs
// used for inverse text normalization). Skipped entirely when the option is
// empty.
if (!rule_fsts.empty()) {
std::vector<std::string> files;
// NOTE(review): the `false` argument presumably means "do not omit empty
// fields" -- confirm against SplitStringToVector in text-utils.h. If so, a
// stray or trailing comma yields an empty path that fails the check below.
SplitStringToVector(rule_fsts, ",", false, &files);
for (const auto &f : files) {
// Reject the config on the first missing file and log which one it was.
if (!FileExists(f)) {
SHERPA_ONNX_LOGE("Rule fst '%s' does not exist. ", f.c_str());
return false;
}
}
}
// Check every file listed in --rule-fars (a comma-separated list of FST
// archives used for inverse text normalization). Skipped entirely when the
// option is empty.
if (!rule_fars.empty()) {
std::vector<std::string> files;
// NOTE(review): the `false` argument presumably means "do not omit empty
// fields" -- confirm against SplitStringToVector in text-utils.h. If so, a
// stray or trailing comma yields an empty path that fails the check below.
SplitStringToVector(rule_fars, ",", false, &files);
for (const auto &f : files) {
// Reject the config on the first missing archive and log which one it was.
if (!FileExists(f)) {
SHERPA_ONNX_LOGE("Rule far '%s' does not exist. ", f.c_str());
return false;
}
}
}
return model_config.Validate();
}
@@ -147,7 +186,9 @@ std::string OnlineRecognizerConfig::ToString() const {
os << "hotwords_file=\"" << hotwords_file << "\", ";
os << "decoding_method=\"" << decoding_method << "\", ";
os << "blank_penalty=" << blank_penalty << ", ";
os << "temperature_scale=" << temperature_scale << ")";
os << "temperature_scale=" << temperature_scale << ", ";
os << "rule_fsts=\"" << rule_fsts << "\", ";
os << "rule_fars=\"" << rule_fars << "\")";
return os.str();
}