Add inverse text normalization for online ASR (#1020)
This commit is contained in:
@@ -68,7 +68,8 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src,
|
||||
class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
|
||||
public:
|
||||
explicit OnlineRecognizerCtcImpl(const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(config),
|
||||
config_(config),
|
||||
model_(OnlineCtcModel::Create(config.model_config)),
|
||||
sym_(config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -84,7 +85,8 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
|
||||
#if __ANDROID_API__ >= 9
|
||||
explicit OnlineRecognizerCtcImpl(AAssetManager *mgr,
|
||||
const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(mgr, config),
|
||||
config_(config),
|
||||
model_(OnlineCtcModel::Create(mgr, config.model_config)),
|
||||
sym_(mgr, config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -182,8 +184,10 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
|
||||
// TODO(fangjun): Remember to change these constants if needed
|
||||
int32_t frame_shift_ms = 10;
|
||||
int32_t subsampling_factor = 4;
|
||||
return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
|
||||
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
|
||||
auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
|
||||
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
|
||||
r.text = ApplyInverseTextNormalization(r.text);
|
||||
return r;
|
||||
}
|
||||
|
||||
bool IsEndpoint(OnlineStream *s) const override {
|
||||
|
||||
@@ -4,11 +4,22 @@
|
||||
|
||||
#include "sherpa-onnx/csrc/online-recognizer-impl.h"
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include <strstream>
|
||||
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "fst/extensions/far/far.h"
|
||||
#include "kaldifst/csrc/kaldi-fst-io.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer-ctc-impl.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer-paraformer-impl.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer-transducer-impl.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h"
|
||||
#include "sherpa-onnx/csrc/onnx-utils.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
@@ -78,4 +89,110 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
|
||||
}
|
||||
#endif
|
||||
|
||||
// Builds the inverse-text-normalization (ITN) pipeline from the config.
//
// config.rule_fsts and config.rule_fars are comma-separated lists of paths.
// One TextNormalizer is appended to itn_list_ for every file listed in
// rule_fsts, followed by one for every FST stored in every archive listed in
// rule_fars. ApplyInverseTextNormalization() applies them in that order.
//
// An archive that cannot be opened is reported via SHERPA_ONNX_LOGE and
// skipped; the remaining rules are still loaded.
//
// @param config  Recognizer configuration; a copy is stored in config_.
OnlineRecognizerImpl::OnlineRecognizerImpl(const OnlineRecognizerConfig &config)
    : config_(config) {
  const bool debug = config.model_config.debug;

  if (!config.rule_fsts.empty()) {
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fsts, ",", false, &files);
    itn_list_.reserve(files.size());

    for (const auto &f : files) {
      if (debug) {
        SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
      }
      itn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(f));
    }
  }

  if (!config.rule_fars.empty()) {
    if (debug) {
      SHERPA_ONNX_LOGE("Loading FST archives");
    }

    std::vector<std::string> files;
    SplitStringToVector(config.rule_fars, ",", false, &files);

    // Lower bound only: an archive may contain more than one FST.
    itn_list_.reserve(files.size() + itn_list_.size());

    for (const auto &f : files) {
      if (debug) {
        SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
      }

      std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
          fst::FarReader<fst::StdArc>::Open(f));
      if (!reader) {
        // Open() hands back a null pointer when the archive cannot be read.
        // Skip it rather than dereference null in the loop below.
        // NOTE(review): switch to aborting here if a hard failure is
        // preferred over running without these rules.
        SHERPA_ONNX_LOGE("Failed to open rule far '%s'. Skipping it.",
                         f.c_str());
        continue;
      }

      for (; !reader->Done(); reader->Next()) {
        // Copy the FST so it does not depend on the reader, which is
        // destroyed at the end of this iteration.
        std::unique_ptr<fst::StdConstFst> r(
            fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

        itn_list_.push_back(
            std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
      }
    }

    if (debug) {
      SHERPA_ONNX_LOGE("FST archives loaded!");
    }
  }
}
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
// Android variant: builds the inverse-text-normalization (ITN) pipeline from
// files read through the asset manager instead of the file system.
//
// config.rule_fsts and config.rule_fars are comma-separated lists of asset
// paths. One TextNormalizer is appended to itn_list_ for every file listed in
// rule_fsts, followed by one for every FST stored in every archive listed in
// rule_fars. ApplyInverseTextNormalization() applies them in that order.
//
// An archive that cannot be opened is reported via SHERPA_ONNX_LOGE and
// skipped; the remaining rules are still loaded.
//
// @param mgr     Asset manager passed to ReadFile() to load each rule file.
// @param config  Recognizer configuration; a copy is stored in config_.
OnlineRecognizerImpl::OnlineRecognizerImpl(AAssetManager *mgr,
                                           const OnlineRecognizerConfig &config)
    : config_(config) {
  const bool debug = config.model_config.debug;

  if (!config.rule_fsts.empty()) {
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fsts, ",", false, &files);
    itn_list_.reserve(files.size());

    for (const auto &f : files) {
      if (debug) {
        SHERPA_ONNX_LOGE("rule fst: %s", f.c_str());
      }

      // `buf` must stay alive while `is` is in use: istrstream reads from the
      // caller's buffer without copying it.
      auto buf = ReadFile(mgr, f);
      std::istrstream is(buf.data(), buf.size());
      itn_list_.push_back(std::make_unique<kaldifst::TextNormalizer>(is));
    }
  }

  if (!config.rule_fars.empty()) {
    std::vector<std::string> files;
    SplitStringToVector(config.rule_fars, ",", false, &files);

    // Lower bound only: an archive may contain more than one FST.
    itn_list_.reserve(files.size() + itn_list_.size());

    for (const auto &f : files) {
      if (debug) {
        SHERPA_ONNX_LOGE("rule far: %s", f.c_str());
      }

      // Declared before `reader` so the buffer outlives the stream that the
      // reader takes ownership of.
      auto buf = ReadFile(mgr, f);

      std::unique_ptr<std::istream> s =
          std::make_unique<std::istrstream>(buf.data(), buf.size());

      std::unique_ptr<fst::FarReader<fst::StdArc>> reader(
          fst::FarReader<fst::StdArc>::Open(std::move(s)));
      if (!reader) {
        // Open() hands back a null pointer when the archive cannot be read.
        // Skip it rather than dereference null in the loop below.
        // NOTE(review): switch to aborting here if a hard failure is
        // preferred over running without these rules.
        SHERPA_ONNX_LOGE("Failed to open rule far '%s'. Skipping it.",
                         f.c_str());
        continue;
      }

      for (; !reader->Done(); reader->Next()) {
        std::unique_ptr<fst::StdConstFst> r(
            fst::CastOrConvertToConstFst(reader->GetFst()->Copy()));

        itn_list_.push_back(
            std::make_unique<kaldifst::TextNormalizer>(std::move(r)));
      }  // for (; !reader->Done(); reader->Next())
    }    // for (const auto &f : files)
  }      // if (!config.rule_fars.empty())
}
|
||||
#endif
|
||||
|
||||
std::string OnlineRecognizerImpl::ApplyInverseTextNormalization(
|
||||
std::string text) const {
|
||||
if (!itn_list_.empty()) {
|
||||
for (const auto &tn : itn_list_) {
|
||||
text = tn->Normalize(text);
|
||||
if (config_.model_config.debug) {
|
||||
SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -9,6 +9,12 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
#include "android/asset_manager.h"
|
||||
#include "android/asset_manager_jni.h"
|
||||
#endif
|
||||
|
||||
#include "kaldifst/csrc/text-normalizer.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer.h"
|
||||
#include "sherpa-onnx/csrc/online-stream.h"
|
||||
@@ -17,10 +23,15 @@ namespace sherpa_onnx {
|
||||
|
||||
class OnlineRecognizerImpl {
|
||||
public:
|
||||
explicit OnlineRecognizerImpl(const OnlineRecognizerConfig &config);
|
||||
|
||||
static std::unique_ptr<OnlineRecognizerImpl> Create(
|
||||
const OnlineRecognizerConfig &config);
|
||||
|
||||
#if __ANDROID_API__ >= 9
|
||||
OnlineRecognizerImpl(AAssetManager *mgr,
|
||||
const OnlineRecognizerConfig &config);
|
||||
|
||||
static std::unique_ptr<OnlineRecognizerImpl> Create(
|
||||
AAssetManager *mgr, const OnlineRecognizerConfig &config);
|
||||
#endif
|
||||
@@ -50,6 +61,15 @@ class OnlineRecognizerImpl {
|
||||
virtual bool IsEndpoint(OnlineStream *s) const = 0;
|
||||
|
||||
virtual void Reset(OnlineStream *s) const = 0;
|
||||
|
||||
std::string ApplyInverseTextNormalization(std::string text) const;
|
||||
|
||||
private:
|
||||
OnlineRecognizerConfig config_;
|
||||
// for inverse text normalization. Used only if
|
||||
// config.rule_fsts is not empty or
|
||||
// config.rule_fars is not empty
|
||||
std::vector<std::unique_ptr<kaldifst::TextNormalizer>> itn_list_;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
@@ -96,7 +96,8 @@ static void Scale(const float *x, int32_t n, float scale, float *y) {
|
||||
class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
|
||||
public:
|
||||
explicit OnlineRecognizerParaformerImpl(const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(config),
|
||||
config_(config),
|
||||
model_(config.model_config),
|
||||
sym_(config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -116,7 +117,8 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
|
||||
#if __ANDROID_API__ >= 9
|
||||
explicit OnlineRecognizerParaformerImpl(AAssetManager *mgr,
|
||||
const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(mgr, config),
|
||||
config_(config),
|
||||
model_(mgr, config.model_config),
|
||||
sym_(mgr, config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -160,7 +162,9 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl {
|
||||
OnlineRecognizerResult GetResult(OnlineStream *s) const override {
|
||||
auto decoder_result = s->GetParaformerResult();
|
||||
|
||||
return Convert(decoder_result, sym_);
|
||||
auto r = Convert(decoder_result, sym_);
|
||||
r.text = ApplyInverseTextNormalization(r.text);
|
||||
return r;
|
||||
}
|
||||
|
||||
bool IsEndpoint(OnlineStream *s) const override {
|
||||
|
||||
@@ -80,7 +80,8 @@ OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src,
|
||||
class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
|
||||
public:
|
||||
explicit OnlineRecognizerTransducerImpl(const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(config),
|
||||
config_(config),
|
||||
model_(OnlineTransducerModel::Create(config.model_config)),
|
||||
sym_(config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -124,7 +125,8 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
|
||||
#if __ANDROID_API__ >= 9
|
||||
explicit OnlineRecognizerTransducerImpl(AAssetManager *mgr,
|
||||
const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(mgr, config),
|
||||
config_(config),
|
||||
model_(OnlineTransducerModel::Create(mgr, config.model_config)),
|
||||
sym_(mgr, config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config) {
|
||||
@@ -332,8 +334,10 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl {
|
||||
// TODO(fangjun): Remember to change these constants if needed
|
||||
int32_t frame_shift_ms = 10;
|
||||
int32_t subsampling_factor = 4;
|
||||
return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
|
||||
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
|
||||
auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor,
|
||||
s->GetCurrentSegment(), s->GetNumFramesSinceStart());
|
||||
r.text = ApplyInverseTextNormalization(std::move(r.text));
|
||||
return r;
|
||||
}
|
||||
|
||||
bool IsEndpoint(OnlineStream *s) const override {
|
||||
|
||||
@@ -42,7 +42,8 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl {
|
||||
public:
|
||||
explicit OnlineRecognizerTransducerNeMoImpl(
|
||||
const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(config),
|
||||
config_(config),
|
||||
symbol_table_(config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config),
|
||||
model_(
|
||||
@@ -61,7 +62,8 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl {
|
||||
#if __ANDROID_API__ >= 9
|
||||
explicit OnlineRecognizerTransducerNeMoImpl(
|
||||
AAssetManager *mgr, const OnlineRecognizerConfig &config)
|
||||
: config_(config),
|
||||
: OnlineRecognizerImpl(mgr, config),
|
||||
config_(config),
|
||||
symbol_table_(mgr, config.model_config.tokens),
|
||||
endpoint_(config_.endpoint_config),
|
||||
model_(std::make_unique<OnlineTransducerNeMoModel>(
|
||||
@@ -94,9 +96,11 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl {
|
||||
// TODO(fangjun): Remember to change these constants if needed
|
||||
int32_t frame_shift_ms = 10;
|
||||
int32_t subsampling_factor = model_->SubsamplingFactor();
|
||||
return Convert(s->GetResult(), symbol_table_, frame_shift_ms,
|
||||
subsampling_factor, s->GetCurrentSegment(),
|
||||
s->GetNumFramesSinceStart());
|
||||
auto r = Convert(s->GetResult(), symbol_table_, frame_shift_ms,
|
||||
subsampling_factor, s->GetCurrentSegment(),
|
||||
s->GetNumFramesSinceStart());
|
||||
r.text = ApplyInverseTextNormalization(std::move(r.text));
|
||||
return r;
|
||||
}
|
||||
|
||||
bool IsEndpoint(OnlineStream *s) const override {
|
||||
|
||||
@@ -14,7 +14,9 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/online-recognizer-impl.h"
|
||||
#include "sherpa-onnx/csrc/text-utils.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
@@ -100,6 +102,15 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) {
|
||||
"now support greedy_search and modified_beam_search.");
|
||||
po->Register("temperature-scale", &temperature_scale,
|
||||
"Temperature scale for confidence computation in decoding.");
|
||||
po->Register(
|
||||
"rule-fsts", &rule_fsts,
|
||||
"If not empty, it specifies fsts for inverse text normalization. "
|
||||
"If there are multiple fsts, they are separated by a comma.");
|
||||
|
||||
po->Register(
|
||||
"rule-fars", &rule_fars,
|
||||
"If not empty, it specifies fst archives for inverse text normalization. "
|
||||
"If there are multiple archives, they are separated by a comma.");
|
||||
}
|
||||
|
||||
bool OnlineRecognizerConfig::Validate() const {
|
||||
@@ -129,6 +140,34 @@ bool OnlineRecognizerConfig::Validate() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!hotwords_file.empty() && !FileExists(hotwords_file)) {
|
||||
SHERPA_ONNX_LOGE("--hotwords-file: '%s' does not exist",
|
||||
hotwords_file.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!rule_fsts.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(rule_fsts, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
if (!FileExists(f)) {
|
||||
SHERPA_ONNX_LOGE("Rule fst '%s' does not exist. ", f.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!rule_fars.empty()) {
|
||||
std::vector<std::string> files;
|
||||
SplitStringToVector(rule_fars, ",", false, &files);
|
||||
for (const auto &f : files) {
|
||||
if (!FileExists(f)) {
|
||||
SHERPA_ONNX_LOGE("Rule far '%s' does not exist. ", f.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return model_config.Validate();
|
||||
}
|
||||
|
||||
@@ -147,7 +186,9 @@ std::string OnlineRecognizerConfig::ToString() const {
|
||||
os << "hotwords_file=\"" << hotwords_file << "\", ";
|
||||
os << "decoding_method=\"" << decoding_method << "\", ";
|
||||
os << "blank_penalty=" << blank_penalty << ", ";
|
||||
os << "temperature_scale=" << temperature_scale << ")";
|
||||
os << "temperature_scale=" << temperature_scale << ", ";
|
||||
os << "rule_fsts=\"" << rule_fsts << "\", ";
|
||||
os << "rule_fars=\"" << rule_fars << "\")";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
@@ -100,6 +100,12 @@ struct OnlineRecognizerConfig {
|
||||
|
||||
float temperature_scale = 2.0;
|
||||
|
||||
// If there are multiple rules, they are applied from left to right.
|
||||
std::string rule_fsts;
|
||||
|
||||
// If there are multiple FST archives, they are applied from left to right.
|
||||
std::string rule_fars;
|
||||
|
||||
OnlineRecognizerConfig() = default;
|
||||
|
||||
OnlineRecognizerConfig(
|
||||
@@ -109,7 +115,8 @@ struct OnlineRecognizerConfig {
|
||||
const OnlineCtcFstDecoderConfig &ctc_fst_decoder_config,
|
||||
bool enable_endpoint, const std::string &decoding_method,
|
||||
int32_t max_active_paths, const std::string &hotwords_file,
|
||||
float hotwords_score, float blank_penalty, float temperature_scale)
|
||||
float hotwords_score, float blank_penalty, float temperature_scale,
|
||||
const std::string &rule_fsts, const std::string &rule_fars)
|
||||
: feat_config(feat_config),
|
||||
model_config(model_config),
|
||||
lm_config(lm_config),
|
||||
@@ -121,7 +128,9 @@ struct OnlineRecognizerConfig {
|
||||
hotwords_file(hotwords_file),
|
||||
hotwords_score(hotwords_score),
|
||||
blank_penalty(blank_penalty),
|
||||
temperature_scale(temperature_scale) {}
|
||||
temperature_scale(temperature_scale),
|
||||
rule_fsts(rule_fsts),
|
||||
rule_fars(rule_fars) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
Reference in New Issue
Block a user