Add online transducer decoder (#27)

2023-02-19 10:39:07 +08:00
parent 710edaa6f9
commit 0f6f58d1d3
13 changed files with 229 additions and 124 deletions
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -1,9 +1,9 @@
 include_directories(${CMAKE_SOURCE_DIR})

 add_executable(sherpa-onnx
-  decode.cc
  features.cc
  online-lstm-transducer-model.cc
+  online-transducer-greedy-search-decoder.cc
  online-transducer-model-config.cc
  online-transducer-model.cc
  onnx-utils.cc
--- a/sherpa-onnx/csrc/decode.cc
+++ b/sherpa-onnx/csrc/decode.cc
@@ -1,79 +0,0 @@
-// sherpa/csrc/decode.cc
-//
-// Copyright (c)  2023  Xiaomi Corporation
-
-#include "sherpa-onnx/csrc/decode.h"
-
-#include <assert.h>
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-namespace sherpa_onnx {
-
-static Ort::Value Clone(Ort::Value *v) {
-  auto type_and_shape = v->GetTensorTypeAndShapeInfo();
-  std::vector<int64_t> shape = type_and_shape.GetShape();
-
-  auto memory_info =
-      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-
-  return Ort::Value::CreateTensor(memory_info, v->GetTensorMutableData<float>(),
-                                  type_and_shape.GetElementCount(),
-                                  shape.data(), shape.size());
-}
-
-static Ort::Value GetFrame(Ort::Value *encoder_out, int32_t t) {
-  std::vector<int64_t> encoder_out_shape =
-      encoder_out->GetTensorTypeAndShapeInfo().GetShape();
-  assert(encoder_out_shape[0] == 1);
-
-  int32_t encoder_out_dim = encoder_out_shape[2];
-
-  auto memory_info =
-      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-
-  std::array<int64_t, 2> shape{1, encoder_out_dim};
-
-  return Ort::Value::CreateTensor(
-      memory_info,
-      encoder_out->GetTensorMutableData<float>() + t * encoder_out_dim,
-      encoder_out_dim, shape.data(), shape.size());
-}
-
-void GreedySearch(OnlineTransducerModel *model, Ort::Value encoder_out,
-                  std::vector<int64_t> *hyp) {
-  std::vector<int64_t> encoder_out_shape =
-      encoder_out.GetTensorTypeAndShapeInfo().GetShape();
-
-  if (encoder_out_shape[0] > 1) {
-    fprintf(stderr, "Only batch_size=1 is implemented. Given: %d\n",
-            static_cast<int32_t>(encoder_out_shape[0]));
-  }
-
-  int32_t num_frames = encoder_out_shape[1];
-  int32_t vocab_size = model->VocabSize();
-
-  Ort::Value decoder_input = model->BuildDecoderInput(*hyp);
-  Ort::Value decoder_out = model->RunDecoder(std::move(decoder_input));
-
-  for (int32_t t = 0; t != num_frames; ++t) {
-    Ort::Value cur_encoder_out = GetFrame(&encoder_out, t);
-    Ort::Value logit =
-        model->RunJoiner(std::move(cur_encoder_out), Clone(&decoder_out));
-    const float *p_logit = logit.GetTensorData<float>();
-
-    auto y = static_cast<int32_t>(std::distance(
-        static_cast<const float *>(p_logit),
-        std::max_element(static_cast<const float *>(p_logit),
-                         static_cast<const float *>(p_logit) + vocab_size)));
-    if (y != 0) {
-      hyp->push_back(y);
-      decoder_input = model->BuildDecoderInput(*hyp);
-      decoder_out = model->RunDecoder(std::move(decoder_input));
-    }
-  }
-}
-
-}  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/decode.h
+++ b/sherpa-onnx/csrc/decode.h
@@ -1,26 +0,0 @@
-// sherpa/csrc/decode.h
-//
-// Copyright (c)  2023  Xiaomi Corporation
-
-#ifndef SHERPA_ONNX_CSRC_DECODE_H_
-#define SHERPA_ONNX_CSRC_DECODE_H_
-
-#include <vector>
-
-#include "sherpa-onnx/csrc/online-transducer-model.h"
-
-namespace sherpa_onnx {
-
-/** Greedy search for non-streaming ASR.
- *
- * @TODO(fangjun) Support batch size > 1
- *
- * @param model  The RnntModel
- * @param encoder_out  Its shape is (1, num_frames, encoder_out_dim).
- */
-void GreedySearch(OnlineTransducerModel *model, Ort::Value encoder_out,
-                  std::vector<int64_t> *hyp);
-
-}  // namespace sherpa_onnx
-
-#endif  // SHERPA_ONNX_CSRC_DECODE_H_
--- a/sherpa-onnx/csrc/features.cc
+++ b/sherpa-onnx/csrc/features.cc
@@ -15,16 +15,16 @@ namespace sherpa_onnx {

 class FeatureExtractor::Impl {
 public:
-  Impl(int32_t sampling_rate, int32_t feature_dim) {
+  explicit Impl(const FeatureExtractorConfig &config) {
    opts_.frame_opts.dither = 0;
    opts_.frame_opts.snip_edges = false;
-    opts_.frame_opts.samp_freq = sampling_rate;
+    opts_.frame_opts.samp_freq = config.sampling_rate;

    // cache 100 seconds of feature frames, which is more than enough
    // for real needs
    opts_.frame_opts.max_feature_vectors = 100 * 100;

-    opts_.mel_opts.num_bins = feature_dim;
+    opts_.mel_opts.num_bins = config.feature_dim;

    fbank_ = std::make_unique<knf::OnlineFbank>(opts_);
  }
@@ -80,9 +80,8 @@ class FeatureExtractor::Impl {
  mutable std::mutex mutex_;
 };

-FeatureExtractor::FeatureExtractor(int32_t sampling_rate /*=16000*/,
-                                   int32_t feature_dim /*=80*/)
-    : impl_(std::make_unique<Impl>(sampling_rate, feature_dim)) {}
+FeatureExtractor::FeatureExtractor(const FeatureExtractorConfig &config /*={}*/)
+    : impl_(std::make_unique<Impl>(config)) {}  // Impl sets up the fbank

 FeatureExtractor::~FeatureExtractor() = default;

--- a/sherpa-onnx/csrc/features.h
+++ b/sherpa-onnx/csrc/features.h
@@ -10,14 +10,18 @@

 namespace sherpa_onnx {

+struct FeatureExtractorConfig {
+  int32_t sampling_rate = 16000;  // sampling rate of the model's training data
+  int32_t feature_dim = 80;       // feature dimension (number of mel bins)
+};
+
 class FeatureExtractor {
 public:
  /**
   * @param sampling_rate  Sampling rate of the data used to train the model.
   * @param feature_dim    Dimension of the features used to train the model.
   */
-  explicit FeatureExtractor(int32_t sampling_rate = 16000,
-                            int32_t feature_dim = 80);
+  explicit FeatureExtractor(const FeatureExtractorConfig &config = {});
  ~FeatureExtractor();

  /**
--- a/sherpa-onnx/csrc/online-transducer-decoder.h
+++ b/sherpa-onnx/csrc/online-transducer-decoder.h
@@ -0,0 +1,52 @@
+// sherpa-onnx/csrc/online-transducer-decoder.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_
+#define SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_
+
+#include <vector>
+
+#include "onnxruntime_cxx_api.h"  // NOLINT
+
+namespace sherpa_onnx {
+
+struct OnlineTransducerDecoderResult {
+  /// Decoded token IDs so far (incl. leading blanks until they are stripped)
+  std::vector<int64_t> tokens;
+};
+
+class OnlineTransducerDecoder {
+ public:
+  virtual ~OnlineTransducerDecoder() = default;
+
+  /** Return an empty result.
+   *
+   * To simplify the decoding code, we add `context_size` blanks
+   * to the beginning of the decoding result, which will be
+   * stripped by calling `StripLeadingBlanks()`.
+   */
+  virtual OnlineTransducerDecoderResult GetEmptyResult() = 0;
+
+  /** Strip blanks added by `GetEmptyResult()`. The default is a no-op.
+   *
+   * @param r It is changed in-place.
+   */
+  virtual void StripLeadingBlanks(OnlineTransducerDecoderResult * /*r*/) {}
+
+  /** Run transducer decoding given the output from the encoder model.
+   *
+   * @param encoder_out A 3-D tensor of shape (N, T, joiner_dim)
+   * @param result  It is modified in-place; one entry per utterance.
+   *
+   * @note There is no need to pass encoder_out_length here since for the
+   * online decoding case, each utterance has the same number of frames
+   * and there are no paddings.
+   */
+  virtual void Decode(Ort::Value encoder_out,
+                      std::vector<OnlineTransducerDecoderResult> *result) = 0;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_DECODER_H_
--- a/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.cc
+++ b/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.cc
@@ -0,0 +1,101 @@
+// sherpa-onnx/csrc/online-transducer-greedy-search-decoder.cc
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
+
+#include <assert.h>
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/onnx-utils.h"
+
+namespace sherpa_onnx {
+
+// Return a (1, encoder_out_dim) tensor wrapping frame `t` of `encoder_out`.
+// The frame data is not copied; the result points into encoder_out's buffer.
+static Ort::Value GetFrame(Ort::Value *encoder_out, int32_t t) {
+  const std::vector<int64_t> dims =
+      encoder_out->GetTensorTypeAndShapeInfo().GetShape();
+  assert(dims[0] == 1);  // only batch size 1 is handled
+
+  const int32_t frame_dim = dims[2];
+  float *frame_data =
+      encoder_out->GetTensorMutableData<float>() + t * frame_dim;
+  std::array<int64_t, 2> frame_shape{1, frame_dim};
+
+  auto cpu_info =
+      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+  return Ort::Value::CreateTensor(cpu_info, frame_data, frame_dim,
+                                  frame_shape.data(), frame_shape.size());
+}
+
+// Return the initial result: ContextSize() blank tokens, which simplifies the
+// decoding code. StripLeadingBlanks() removes them again.
+OnlineTransducerDecoderResult
+OnlineTransducerGreedySearchDecoder::GetEmptyResult() {
+  constexpr int64_t kBlankId = 0;  // always 0
+  OnlineTransducerDecoderResult empty;
+  empty.tokens.assign(model_->ContextSize(), kBlankId);
+  return empty;
+}
+
+// Remove the blanks that GetEmptyResult() put in front of r->tokens.
+// At most r->tokens.size() tokens are erased, so calling this on a result
+// shorter than ContextSize() empties it instead of running past end().
+void OnlineTransducerGreedySearchDecoder::StripLeadingBlanks(
+    OnlineTransducerDecoderResult *r) {
+  auto &tokens = r->tokens;
+  auto n = std::min<size_t>(model_->ContextSize(), tokens.size());
+  tokens.erase(tokens.begin(), tokens.begin() + n);
+}
+
+// Greedy search: for each frame, append the best non-blank token (if any) to
+// (*result)[0].tokens. Exits if encoder_out.size(0) != result->size() or if
+// the batch size is not 1.
+void OnlineTransducerGreedySearchDecoder::Decode(
+    Ort::Value encoder_out,
+    std::vector<OnlineTransducerDecoderResult> *result) {
+  std::vector<int64_t> encoder_out_shape =
+      encoder_out.GetTensorTypeAndShapeInfo().GetShape();
+
+  if (encoder_out_shape[0] != static_cast<int64_t>(result->size())) {
+    fprintf(stderr,
+            "Size mismatch! encoder_out.size(0) %d, result.size(0): %d\n",
+            static_cast<int32_t>(encoder_out_shape[0]),
+            static_cast<int32_t>(result->size()));
+    exit(-1);
+  }
+
+  if (result->size() != 1) {
+    fprintf(stderr, "only batch size == 1 is implemented. Given: %d\n",
+            static_cast<int32_t>(result->size()));
+    exit(-1);
+  }
+
+  auto &hyp = (*result)[0].tokens;
+  int32_t num_frames = encoder_out_shape[1];
+  int32_t vocab_size = model_->VocabSize();
+
+  Ort::Value decoder_input = model_->BuildDecoderInput(hyp);
+  Ort::Value decoder_out = model_->RunDecoder(std::move(decoder_input));
+
+  for (int32_t t = 0; t != num_frames; ++t) {
+    Ort::Value cur_encoder_out = GetFrame(&encoder_out, t);
+    Ort::Value logit =
+        model_->RunJoiner(std::move(cur_encoder_out), Clone(&decoder_out));
+    const float *p_logit = logit.GetTensorData<float>();
+    // Index of the largest of the first vocab_size logits for frame t.
+    auto y = static_cast<int32_t>(
+        std::max_element(p_logit, p_logit + vocab_size) - p_logit);
+    if (y != 0) {  // 0 is the blank ID; the decoder is rerun only on a token
+      hyp.push_back(y);
+      decoder_input = model_->BuildDecoderInput(hyp);
+      decoder_out = model_->RunDecoder(std::move(decoder_input));
+    }
+  }
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h
+++ b/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h
@@ -0,0 +1,33 @@
+// sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h
+//
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_
+#define SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/online-transducer-decoder.h"
+#include "sherpa-onnx/csrc/online-transducer-model.h"
+
+namespace sherpa_onnx {
+
+class OnlineTransducerGreedySearchDecoder : public OnlineTransducerDecoder {
+ public:
+  explicit OnlineTransducerGreedySearchDecoder(OnlineTransducerModel *model)
+      : model_(model) {}
+  // Returns a result holding model_->ContextSize() blank (0) tokens.
+  OnlineTransducerDecoderResult GetEmptyResult() override;
+  // Removes the leading model_->ContextSize() tokens from `r` in-place.
+  void StripLeadingBlanks(OnlineTransducerDecoderResult *r) override;
+  // Greedy search over `encoder_out`; only batch size 1 is implemented.
+  void Decode(Ort::Value encoder_out,
+              std::vector<OnlineTransducerDecoderResult> *result) override;
+
+ private:
+  OnlineTransducerModel *model_;  // Not owned
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_GREEDY_SEARCH_DECODER_H_
--- a/sherpa-onnx/csrc/onnx-utils.cc
+++ b/sherpa-onnx/csrc/onnx-utils.cc
@@ -46,4 +46,16 @@ void PrintModelMetadata(std::ostream &os, const Ort::ModelMetadata &meta_data) {
  }
 }

+// Return a tensor that shares v's buffer and has v's shape (no data copy).
+// The elements are read as float; the result must not outlive *v.
+Ort::Value Clone(Ort::Value *v) {
+  auto info = v->GetTensorTypeAndShapeInfo();
+  const std::vector<int64_t> dims = info.GetShape();
+  const size_t num_elements = info.GetElementCount();
+  auto cpu = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+
+  return Ort::Value::CreateTensor(cpu, v->GetTensorMutableData<float>(),
+                                  num_elements, dims.data(), dims.size());
+}
+
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/onnx-utils.h
+++ b/sherpa-onnx/csrc/onnx-utils.h
@@ -55,6 +55,9 @@ void GetOutputNames(Ort::Session *sess, std::vector<std::string> *output_names,
 void PrintModelMetadata(std::ostream &os,
                        const Ort::ModelMetadata &meta_data);  // NOLINT

+// Return a shallow copy of v
+Ort::Value Clone(Ort::Value *v);
+
 }  // namespace sherpa_onnx

 #endif  // SHERPA_ONNX_CSRC_ONNX_UTILS_H_
--- a/sherpa-onnx/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx.cc
@@ -9,8 +9,8 @@
 #include <vector>

 #include "kaldi-native-fbank/csrc/online-feature.h"
-#include "sherpa-onnx/csrc/decode.h"
 #include "sherpa-onnx/csrc/features.h"
+#include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
 #include "sherpa-onnx/csrc/online-transducer-model-config.h"
 #include "sherpa-onnx/csrc/online-transducer-model.h"
 #include "sherpa-onnx/csrc/symbol-table.h"
@@ -64,8 +64,6 @@ for a list of pre-trained models to download.

  std::vector<Ort::Value> states = model->GetEncoderInitStates();

-  std::vector<int64_t> hyp(model->ContextSize(), 0);
-
  int32_t expected_sampling_rate = 16000;

  bool is_ok = false;
@@ -100,6 +98,10 @@ for a list of pre-trained models to download.

  std::array<int64_t, 3> x_shape{1, chunk_size, feature_dim};

+  sherpa_onnx::OnlineTransducerGreedySearchDecoder decoder(model.get());
+  std::vector<sherpa_onnx::OnlineTransducerDecoderResult> result = {
+      decoder.GetEmptyResult()};
+
  for (int32_t start = 0; start + chunk_size < num_frames;
       start += chunk_shift) {
    std::vector<float> features = feat_extractor.GetFrames(start, chunk_size);
@@ -109,8 +111,10 @@ for a list of pre-trained models to download.
                                 x_shape.data(), x_shape.size());
    auto pair = model->RunEncoder(std::move(x), states);
    states = std::move(pair.second);
-    sherpa_onnx::GreedySearch(model.get(), std::move(pair.first), &hyp);
+    decoder.Decode(std::move(pair.first), &result);
  }
+  decoder.StripLeadingBlanks(&result[0]);
+  const auto &hyp = result[0].tokens;
  std::string text;
  for (size_t i = model->ContextSize(); i != hyp.size(); ++i) {
    text += sym[hyp[i]];