add modified beam search (#69)

2023-03-01 15:32:54 +08:00
parent e0b76655c8
commit 5326d0f81f
19 changed files with 614 additions and 87 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@ decode-file
 tokens.txt
 *.onnx
 log.txt
+tags
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -5,11 +5,13 @@ set(sources
  endpoint.cc
  features.cc
  file-utils.cc
+  hypothesis.cc
  online-lstm-transducer-model.cc
  online-recognizer.cc
  online-stream.cc
  online-transducer-greedy-search-decoder.cc
  online-transducer-model-config.cc
+  online-transducer-modified-beam-search-decoder.cc
  online-transducer-model.cc
  online-zipformer-transducer-model.cc
  onnx-utils.cc
--- a/sherpa-onnx/csrc/hypothesis.cc
+++ b/sherpa-onnx/csrc/hypothesis.cc
@@ -0,0 +1,65 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation
+ *
+ */
+
+#include "sherpa-onnx/csrc/hypothesis.h"
+
+#include <algorithm>
+#include <utility>
+
+namespace sherpa_onnx {
+
+void Hypotheses::Add(Hypothesis hyp) {
+  const std::string key = hyp.Key();
+  const auto it = hyps_dict_.find(key);
+  if (it != hyps_dict_.end()) {  // seen before: merge scores by log-sum-exp
+    it->second.log_prob = LogAdd<double>()(it->second.log_prob, hyp.log_prob);
+  } else {  // first occurrence of this token sequence
+    hyps_dict_[key] = std::move(hyp);
+  }
+}
+
+Hypothesis Hypotheses::GetMostProbable(bool length_norm) const {
+  // Dereferencing max_element() on an empty range is undefined behaviour,
+  // so hand back a default-constructed hypothesis when nothing is stored.
+  if (hyps_dict_.empty()) return {};
+
+  // Score used for ranking: the raw log_prob, or log_prob divided by the
+  // number of tokens in ys (skipped for an empty ys to avoid dividing by 0).
+  auto score = [length_norm](const Hypothesis &h) -> double {
+    if (!length_norm || h.ys.empty()) return h.log_prob;
+    return h.log_prob / h.ys.size();
+  };
+
+  auto best = std::max_element(
+      hyps_dict_.begin(), hyps_dict_.end(),
+      [&score](const auto &left, const auto &right) -> bool {
+        return score(left.second) < score(right.second);
+      });
+  return best->second;
+}
+
+std::vector<Hypothesis> Hypotheses::GetTopK(int32_t k, bool length_norm) const {
+  // Clamp k to [1, Size()]; with nothing stored it ends up as 0.
+  k = std::min(std::max(k, 1), Size());
+
+  std::vector<Hypothesis> candidates = Vec();
+  const auto first = candidates.begin();
+  const auto middle = first + k;
+  // Only the leading k entries need to be ordered, best first.
+  if (length_norm) {
+    std::partial_sort(first, middle, candidates.end(),
+                      [](const auto &a, const auto &b) {
+                        return a.log_prob / a.ys.size() >
+                               b.log_prob / b.ys.size();
+                      });
+  } else {
+    std::partial_sort(
+        first, middle, candidates.end(),
+        [](const auto &a, const auto &b) { return a.log_prob > b.log_prob; });
+  }
+  return {first, middle};
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/hypothesis.h
+++ b/sherpa-onnx/csrc/hypothesis.h
@@ -0,0 +1,117 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation
+ *
+ */
+
+#ifndef SHERPA_ONNX_CSRC_HYPOTHESIS_H_
+#define SHERPA_ONNX_CSRC_HYPOTHESIS_H_
+
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/math.h"
+
+namespace sherpa_onnx {
+
+struct Hypothesis {
+  // Token ids decoded so far; each newly predicted token is appended.
+  std::vector<int32_t> ys;
+
+  // timestamps[i] is the frame index (after subsampling) at which
+  // ys[i] was decoded.
+  std::vector<int32_t> timestamps;
+
+  // Accumulated score of ys, in log space.
+  double log_prob = 0;
+
+  // Number of blanks predicted in a row since the last non-blank token.
+  int32_t num_trailing_blanks = 0;
+
+  Hypothesis() = default;
+  // Starts a hypothesis from an existing token sequence and its score.
+  Hypothesis(const std::vector<int32_t> &ys, double log_prob)
+      : ys(ys), log_prob(log_prob) {}
+
+  // Serializes ys as every id followed by '-' (e.g. "1-2-3-"). Equal keys
+  // therefore mean equal token sequences.
+  std::string Key() const {
+    // TODO(fangjun): Use a hash function?
+    std::ostringstream buf;
+    for (const int32_t token : ys) {
+      buf << token << "-";
+    }
+    return buf.str();
+  }
+
+  // For debugging: "(<key>, <log_prob>)".
+  std::string ToString() const {
+    std::ostringstream buf;
+    buf << "(" << Key() << ", " << log_prob << ")";
+    return buf.str();
+  }
+};
+
+class Hypotheses {
+ public:
+  Hypotheses() = default;
+
+  // Stores each hypothesis under its Key(); a later entry with the same
+  // key replaces an earlier one (scores are not merged here).
+  explicit Hypotheses(std::vector<Hypothesis> hyps) {
+    for (auto &hyp : hyps) hyps_dict_[hyp.Key()] = std::move(hyp);
+  }
+
+  explicit Hypotheses(std::unordered_map<std::string, Hypothesis> hyps_dict)
+      : hyps_dict_(std::move(hyps_dict)) {}
+
+  // Inserts hyp. When an entry with the same key is already present, only
+  // its log_prob is updated, by log-sum-exp with hyp.log_prob.
+  void Add(Hypothesis hyp);
+
+  // Returns the hypothesis with the largest log_prob. With length_norm
+  // set, log_prob is divided by len(hyp.ys) before comparing.
+  Hypothesis GetMostProbable(bool length_norm) const;
+
+  // Returns the k hypotheses with the largest log_prob. With length_norm
+  // set, log_prob is divided by len(hyp.ys) before comparing.
+  std::vector<Hypothesis> GetTopK(int32_t k, bool length_norm) const;
+
+  // Number of distinct hypotheses held.
+  int32_t Size() const { return hyps_dict_.size(); }
+
+  // One line per hypothesis, formatted by Hypothesis::ToString().
+  std::string ToString() const {
+    std::ostringstream buf;
+    for (const auto &entry : hyps_dict_) {
+      buf << entry.second.ToString() << "\n";
+    }
+    return buf.str();
+  }
+
+  // Read-only iteration over the (key, hypothesis) pairs.
+  const auto begin() const { return hyps_dict_.begin(); }
+  const auto end() const { return hyps_dict_.end(); }
+
+  void Clear() { hyps_dict_.clear(); }
+
+ private:
+  // Copies every stored hypothesis into a vector (unspecified order).
+  std::vector<Hypothesis> Vec() const {
+    std::vector<Hypothesis> copies;
+    copies.reserve(hyps_dict_.size());
+    for (const auto &entry : hyps_dict_) {
+      copies.push_back(entry.second);
+    }
+    return copies;
+  }
+
+  using Map = std::unordered_map<std::string, Hypothesis>;
+  Map hyps_dict_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_HYPOTHESIS_H_
--- a/sherpa-onnx/csrc/math.h
+++ b/sherpa-onnx/csrc/math.h
@@ -0,0 +1,107 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Daniel Povey)
+ * Copyright (c)  2023                     (Pingfeng Luo)
+ *
+ */
+// This file is copied from k2/csrc/utils.h
+#ifndef SHERPA_ONNX_CSRC_MATH_H_
+#define SHERPA_ONNX_CSRC_MATH_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <numeric>
+#include <vector>
+
+namespace sherpa_onnx {
+
+// logf(FLT_EPSILON)
+#define SHERPA_ONNX_MIN_LOG_DIFF_FLOAT -15.9423847198486328125f
+
+// log(DBL_EPSILON)
+#define SHERPA_ONNX_MIN_LOG_DIFF_DOUBLE \
+  -36.0436533891171535515240975655615329742431640625
+
+// Functor computing log(exp(x) + exp(y)) without leaving log space;
+// only the specializations below are defined.
+template <typename T>
+struct LogAdd;
+
+template <>
+struct LogAdd<double> {
+  // Returns log(exp(x) + exp(y)).
+  double operator()(double x, double y) const {
+    // Split the operands into the larger value and the difference
+    // (smaller - larger), which is never positive.
+    const bool y_is_larger = x < y;
+    const double larger = y_is_larger ? y : x;
+    const double diff = y_is_larger ? x - y : y - x;
+
+    // Once diff drops below log(DBL_EPSILON), exp(diff) is smaller than
+    // DBL_EPSILON and the smaller operand is ignored.
+    if (diff >= SHERPA_ONNX_MIN_LOG_DIFF_DOUBLE) {
+      return larger + log1p(exp(diff));
+    }
+
+    // The smaller operand does not contribute.
+    return larger;
+  }
+};
+
+template <>
+struct LogAdd<float> {
+  // Returns log(exp(x) + exp(y)), evaluated in single precision.
+  float operator()(float x, float y) const {
+    float diff;
+    if (x < y) {
+      diff = x - y;
+      x = y;
+    } else {
+      diff = y - x;
+    }
+    // diff <= 0 and x is now the larger one.
+
+    // Use the single-precision cutoff logf(FLT_EPSILON): below it,
+    // expf(diff) is smaller than FLT_EPSILON and the smaller term is dropped.
+    if (diff >= SHERPA_ONNX_MIN_LOG_DIFF_FLOAT) {
+      return x + log1pf(expf(diff));
+    }
+
+    return x;  // return the larger one.
+  }
+};
+
+// In-place log-softmax over input[0..input_len); no-op if input_len <= 0.
+template <class T>
+void LogSoftmax(T *input, int32_t input_len) {
+  assert(input);
+  if (input_len <= 0) return;  // max_element on an empty range is invalid
+
+  // Subtract the maximum before exponentiating to avoid overflow.
+  const T max_val = *std::max_element(input, input + input_len);
+  T sum = 0.0;
+  for (int32_t i = 0; i < input_len; i++) {
+    sum += exp(input[i] - max_val);
+  }
+
+  const T offset = max_val + log(sum);
+  std::for_each(input, input + input_len, [offset](T &v) { v -= offset; });
+}
+
+// Returns the indices of the (at most) topk largest entries of vec[0..size),
+// ordered from largest to smallest value.
+template <class T>
+std::vector<int32_t> TopkIndex(const T *vec, int32_t size, int32_t topk) {
+  std::vector<int32_t> index(std::max<int32_t>(size, 0));
+  std::iota(index.begin(), index.end(), 0);
+
+  // Only the leading k positions need ordering, so avoid sorting everything.
+  const int32_t k = std::max<int32_t>(std::min<int32_t>(size, topk), 0);
+  std::partial_sort(index.begin(), index.begin() + k, index.end(),
+                    [vec](int32_t a, int32_t b) { return vec[a] > vec[b]; });
+  index.resize(k);
+  return index;
+}
+
+}  // namespace sherpa_onnx
+#endif  // SHERPA_ONNX_CSRC_MATH_H_
--- a/sherpa-onnx/csrc/online-lstm-transducer-model.cc
+++ b/sherpa-onnx/csrc/online-lstm-transducer-model.cc
@@ -247,24 +247,6 @@ OnlineLstmTransducerModel::RunEncoder(Ort::Value features,
  return {std::move(encoder_out[0]), std::move(next_states)};
 }

-Ort::Value OnlineLstmTransducerModel::BuildDecoderInput(
-    const std::vector<OnlineTransducerDecoderResult> &results) {
-  int32_t batch_size = static_cast<int32_t>(results.size());
-  std::array<int64_t, 2> shape{batch_size, context_size_};
-  Ort::Value decoder_input =
-      Ort::Value::CreateTensor<int64_t>(allocator_, shape.data(), shape.size());
-  int64_t *p = decoder_input.GetTensorMutableData<int64_t>();
-
-  for (const auto &r : results) {
-    const int64_t *begin = r.tokens.data() + r.tokens.size() - context_size_;
-    const int64_t *end = r.tokens.data() + r.tokens.size();
-    std::copy(begin, end, p);
-    p += context_size_;
-  }
-
-  return decoder_input;
-}
-
 Ort::Value OnlineLstmTransducerModel::RunDecoder(Ort::Value decoder_input) {
  auto decoder_out = decoder_sess_->Run(
      {}, decoder_input_names_ptr_.data(), &decoder_input, 1,
--- a/sherpa-onnx/csrc/online-lstm-transducer-model.h
+++ b/sherpa-onnx/csrc/online-lstm-transducer-model.h
@@ -40,9 +40,6 @@ class OnlineLstmTransducerModel : public OnlineTransducerModel {
  std::pair<Ort::Value, std::vector<Ort::Value>> RunEncoder(
      Ort::Value features, std::vector<Ort::Value> states) override;

-  Ort::Value BuildDecoderInput(
-      const std::vector<OnlineTransducerDecoderResult> &results) override;
-
  Ort::Value RunDecoder(Ort::Value decoder_input) override;

  Ort::Value RunJoiner(Ort::Value encoder_out, Ort::Value decoder_out) override;
--- a/sherpa-onnx/csrc/online-recognizer.cc
+++ b/sherpa-onnx/csrc/online-recognizer.cc
@@ -1,6 +1,7 @@
 // sherpa-onnx/csrc/online-recognizer.cc
 //
 // Copyright (c)  2023  Xiaomi Corporation
+// Copyright (c)  2023  Pingfeng Luo

 #include "sherpa-onnx/csrc/online-recognizer.h"

@@ -16,6 +17,7 @@
 #include "sherpa-onnx/csrc/online-transducer-decoder.h"
 #include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
 #include "sherpa-onnx/csrc/online-transducer-model.h"
+#include "sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h"
 #include "sherpa-onnx/csrc/symbol-table.h"

 namespace sherpa_onnx {
@@ -39,6 +41,11 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) {

  po->Register("enable-endpoint", &enable_endpoint,
               "True to enable endpoint detection. False to disable it.");
+  po->Register("max-active-paths", &max_active_paths,
+               "beam size used in modified beam search.");
+  po->Register("decoding-mothod", &decoding_method,
+               "decoding method,"
+               "now support greedy_search and modified_beam_search.");
 }

 bool OnlineRecognizerConfig::Validate() const {
@@ -52,7 +59,9 @@ std::string OnlineRecognizerConfig::ToString() const {
  os << "feat_config=" << feat_config.ToString() << ", ";
  os << "model_config=" << model_config.ToString() << ", ";
  os << "endpoint_config=" << endpoint_config.ToString() << ", ";
-  os << "enable_endpoint=" << (enable_endpoint ? "True" : "False") << ")";
+  os << "enable_endpoint=" << (enable_endpoint ? "True" : "False") << ",";
+  os << "max_active_paths=" << max_active_paths << ",";
+  os << "decoding_method=\"" << decoding_method << "\")";

  return os.str();
 }
@@ -64,8 +73,17 @@ class OnlineRecognizer::Impl {
        model_(OnlineTransducerModel::Create(config.model_config)),
        sym_(config.model_config.tokens),
        endpoint_(config_.endpoint_config) {
-    decoder_ =
-        std::make_unique<OnlineTransducerGreedySearchDecoder>(model_.get());
+    if (config.decoding_method == "modified_beam_search") {
+      decoder_ = std::make_unique<OnlineTransducerModifiedBeamSearchDecoder>(
+          model_.get(), config_.max_active_paths);
+    } else if (config.decoding_method == "greedy_search") {
+      decoder_ =
+          std::make_unique<OnlineTransducerGreedySearchDecoder>(model_.get());
+    } else {
+      fprintf(stderr, "Unsupported decoding method: %s\n",
+              config.decoding_method.c_str());
+      exit(-1);
+    }
  }

 #if __ANDROID_API__ >= 9
@@ -74,8 +92,17 @@ class OnlineRecognizer::Impl {
        model_(OnlineTransducerModel::Create(mgr, config.model_config)),
        sym_(mgr, config.model_config.tokens),
        endpoint_(config_.endpoint_config) {
-    decoder_ =
-        std::make_unique<OnlineTransducerGreedySearchDecoder>(model_.get());
+    if (config.decoding_method == "modified_beam_search") {
+      decoder_ = std::make_unique<OnlineTransducerModifiedBeamSearchDecoder>(
+          model_.get(), config_.max_active_paths);
+    } else if (config.decoding_method == "greedy_search") {
+      decoder_ =
+          std::make_unique<OnlineTransducerGreedySearchDecoder>(model_.get());
+    } else {
+      fprintf(stderr, "Unsupported decoding method: %s\n",
+              config.decoding_method.c_str());
+      exit(-1);
+    }
  }
 #endif

--- a/sherpa-onnx/csrc/online-recognizer.h
+++ b/sherpa-onnx/csrc/online-recognizer.h
@@ -32,7 +32,11 @@ struct OnlineRecognizerConfig {
  FeatureExtractorConfig feat_config;
  OnlineTransducerModelConfig model_config;
  EndpointConfig endpoint_config;
-  bool enable_endpoint;
+  bool enable_endpoint = true;
+  int32_t max_active_paths = 4;
+
+  std::string decoding_method = "modified_beam_search";
+  // Supported values: "modified_beam_search" and "greedy_search".

  OnlineRecognizerConfig() = default;

--- a/sherpa-onnx/csrc/online-transducer-decoder.h
+++ b/sherpa-onnx/csrc/online-transducer-decoder.h
@@ -8,6 +8,7 @@
 #include <vector>

 #include "onnxruntime_cxx_api.h"  // NOLINT
+#include "sherpa-onnx/csrc/hypothesis.h"

 namespace sherpa_onnx {

@@ -17,6 +18,9 @@ struct OnlineTransducerDecoderResult {

  /// number of trailing blank frames decoded so far
  int32_t num_trailing_blanks = 0;
+
+  // Used only by modified_beam_search.
+  Hypotheses hyps;
 };

 class OnlineTransducerDecoder {
--- a/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.cc
+++ b/sherpa-onnx/csrc/online-transducer-greedy-search-decoder.cc
@@ -4,8 +4,6 @@

 #include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"

-#include <assert.h>
-
 #include <algorithm>
 #include <utility>
 #include <vector>
@@ -15,39 +13,6 @@

 namespace sherpa_onnx {

-static Ort::Value GetFrame(OrtAllocator *allocator, Ort::Value *encoder_out,
-                           int32_t t) {
-  std::vector<int64_t> encoder_out_shape =
-      encoder_out->GetTensorTypeAndShapeInfo().GetShape();
-
-  auto batch_size = encoder_out_shape[0];
-  auto num_frames = encoder_out_shape[1];
-  assert(t < num_frames);
-
-  auto encoder_out_dim = encoder_out_shape[2];
-
-  auto offset = num_frames * encoder_out_dim;
-
-  auto memory_info =
-      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
-
-  std::array<int64_t, 2> shape{batch_size, encoder_out_dim};
-
-  Ort::Value ans =
-      Ort::Value::CreateTensor<float>(allocator, shape.data(), shape.size());
-
-  float *dst = ans.GetTensorMutableData<float>();
-  const float *src = encoder_out->GetTensorData<float>();
-
-  for (int32_t i = 0; i != batch_size; ++i) {
-    std::copy(src + t * encoder_out_dim, src + (t + 1) * encoder_out_dim, dst);
-    src += offset;
-    dst += encoder_out_dim;
-  }
-
-  return ans;
-}
-
 OnlineTransducerDecoderResult
 OnlineTransducerGreedySearchDecoder::GetEmptyResult() const {
  int32_t context_size = model_->ContextSize();
@@ -90,7 +55,8 @@ void OnlineTransducerGreedySearchDecoder::Decode(
  Ort::Value decoder_out = model_->RunDecoder(std::move(decoder_input));

  for (int32_t t = 0; t != num_frames; ++t) {
-    Ort::Value cur_encoder_out = GetFrame(model_->Allocator(), &encoder_out, t);
+    Ort::Value cur_encoder_out =
+        GetEncoderOutFrame(model_->Allocator(), &encoder_out, t);
    Ort::Value logit = model_->RunJoiner(
        std::move(cur_encoder_out), Clone(model_->Allocator(), &decoder_out));

--- a/sherpa-onnx/csrc/online-transducer-model.cc
+++ b/sherpa-onnx/csrc/online-transducer-model.cc
@@ -1,6 +1,7 @@
 // sherpa-onnx/csrc/online-transducer-model.cc
 //
 // Copyright (c)  2023  Xiaomi Corporation
+// Copyright (c)  2023  Pingfeng Luo
 #include "sherpa-onnx/csrc/online-transducer-model.h"

 #if __ANDROID_API__ >= 9
@@ -8,6 +9,7 @@
 #include "android/asset_manager_jni.h"
 #endif

+#include <algorithm>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -75,6 +77,40 @@ std::unique_ptr<OnlineTransducerModel> OnlineTransducerModel::Create(
  return nullptr;
 }

+Ort::Value OnlineTransducerModel::BuildDecoderInput(
+    const std::vector<OnlineTransducerDecoderResult> &results) {
+  // One row per result, holding its last ContextSize() tokens.
+  const int32_t num_rows = static_cast<int32_t>(results.size());
+  const int32_t context_size = ContextSize();
+  std::array<int64_t, 2> shape{num_rows, context_size};
+  Ort::Value decoder_input = Ort::Value::CreateTensor<int64_t>(
+      Allocator(), shape.data(), shape.size());
+
+  int64_t *dst = decoder_input.GetTensorMutableData<int64_t>();
+  for (const auto &r : results) {
+    // Requires r.tokens.size() >= context_size; this is not checked here.
+    const int64_t *last = r.tokens.data() + r.tokens.size();
+    dst = std::copy(last - context_size, last, dst);
+  }
+  return decoder_input;
+}
+
+Ort::Value OnlineTransducerModel::BuildDecoderInput(
+    const std::vector<Hypothesis> &hyps) {
+  // One row per hypothesis, holding the last ContextSize() entries of ys.
+  const int32_t num_rows = static_cast<int32_t>(hyps.size());
+  const int32_t context_size = ContextSize();
+  std::array<int64_t, 2> shape{num_rows, context_size};
+  Ort::Value decoder_input = Ort::Value::CreateTensor<int64_t>(
+      Allocator(), shape.data(), shape.size());
+
+  int64_t *dst = decoder_input.GetTensorMutableData<int64_t>();
+  for (const auto &hyp : hyps) {
+    dst = std::copy(hyp.ys.end() - context_size, hyp.ys.end(), dst);
+  }
+  return decoder_input;
+}
+
 #if __ANDROID_API__ >= 9
 std::unique_ptr<OnlineTransducerModel> OnlineTransducerModel::Create(
    AAssetManager *mgr, const OnlineTransducerModelConfig &config) {
--- a/sherpa-onnx/csrc/online-transducer-model.h
+++ b/sherpa-onnx/csrc/online-transducer-model.h
@@ -14,6 +14,8 @@
 #endif

 #include "onnxruntime_cxx_api.h"  // NOLINT
+#include "sherpa-onnx/csrc/hypothesis.h"
+#include "sherpa-onnx/csrc/online-transducer-decoder.h"
 #include "sherpa-onnx/csrc/online-transducer-model-config.h"

 namespace sherpa_onnx {
@@ -71,9 +73,6 @@ class OnlineTransducerModel {
      Ort::Value features,
      std::vector<Ort::Value> states) = 0;  // NOLINT

-  virtual Ort::Value BuildDecoderInput(
-      const std::vector<OnlineTransducerDecoderResult> &results) = 0;
-
  /** Run the decoder network.
   *
   * Caution: We assume there are no recurrent connections in the decoder and
@@ -125,7 +124,13 @@ class OnlineTransducerModel {
  virtual int32_t VocabSize() const = 0;

  virtual int32_t SubsamplingFactor() const { return 4; }
+
  virtual OrtAllocator *Allocator() = 0;
+
+  Ort::Value BuildDecoderInput(
+      const std::vector<OnlineTransducerDecoderResult> &results);
+
+  Ort::Value BuildDecoderInput(const std::vector<Hypothesis> &hyps);
 };

 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.cc
+++ b/sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.cc
@@ -0,0 +1,154 @@
+// sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.cc
+//
+// Copyright (c)  2023  Pingfeng Luo
+// Copyright (c)  2023  Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h"
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/onnx-utils.h"
+
+namespace sherpa_onnx {
+
+// Expands cur_encoder_out, which has one row per stream, so that stream b's
+// row appears (hyps_num_split[b + 1] - hyps_num_split[b]) times in a row.
+static Ort::Value Repeat(OrtAllocator *allocator, Ort::Value *cur_encoder_out,
+                         const std::vector<int32_t> &hyps_num_split) {
+  const int64_t dim =
+      cur_encoder_out->GetTensorTypeAndShapeInfo().GetShape()[1];
+
+  // hyps_num_split.back() is the total number of rows to produce.
+  std::array<int64_t, 2> ans_shape{hyps_num_split.back(), dim};
+  Ort::Value ans = Ort::Value::CreateTensor<float>(allocator, ans_shape.data(),
+                                                   ans_shape.size());
+
+  const float *src = cur_encoder_out->GetTensorData<float>();
+  float *dst = ans.GetTensorMutableData<float>();
+  const int32_t num_streams = static_cast<int32_t>(hyps_num_split.size()) - 1;
+  for (int32_t b = 0; b != num_streams; ++b, src += dim) {
+    // src stays on stream b's row while that row is written `copies` times.
+    const int32_t copies = hyps_num_split[b + 1] - hyps_num_split[b];
+    for (int32_t i = 0; i != copies; ++i) {
+      dst = std::copy(src, src + dim, dst);
+    }
+  }
+  return ans;
+}
+
+// Row-wise log-softmax: `in` holds h consecutive rows of w values each.
+static void LogSoftmax(float *in, int32_t w, int32_t h) {
+  for (int32_t row = 0; row != h; ++row) {
+    LogSoftmax(in + row * w, w);
+  }
+}
+
+OnlineTransducerDecoderResult
+OnlineTransducerModifiedBeamSearchDecoder::GetEmptyResult() const {
+  // The search starts from one hypothesis: ContextSize() blanks, score 0.
+  const int32_t blank_id = 0;  // always 0
+  std::vector<Hypothesis> seed;
+  seed.emplace_back(std::vector<int32_t>(model_->ContextSize(), blank_id), 0);
+  OnlineTransducerDecoderResult r;
+  r.hyps = Hypotheses(std::move(seed));
+  return r;
+}
+
+void OnlineTransducerModifiedBeamSearchDecoder::StripLeadingBlanks(
+    OnlineTransducerDecoderResult *r) const {
+  // Publish the best length-normalized hypothesis, dropping the
+  // ContextSize() entries at the front of its token sequence.
+  const Hypothesis best = r->hyps.GetMostProbable(true);
+  const auto first_real = best.ys.begin() + model_->ContextSize();
+  r->tokens.assign(first_real, best.ys.end());
+  r->num_trailing_blanks = best.num_trailing_blanks;
+}
+
+void OnlineTransducerModifiedBeamSearchDecoder::Decode(
+    Ort::Value encoder_out,
+    std::vector<OnlineTransducerDecoderResult> *result) {
+  // Extends the beam in (*result)[b].hyps over every frame of stream b.
+  std::vector<int64_t> encoder_out_shape =
+      encoder_out.GetTensorTypeAndShapeInfo().GetShape();
+
+  if (encoder_out_shape[0] != result->size()) {
+    fprintf(stderr,
+            "Size mismatch! encoder_out.size(0) %d, result.size(0): %d\n",
+            static_cast<int32_t>(encoder_out_shape[0]),
+            static_cast<int32_t>(result->size()));
+    exit(-1);
+  }
+
+  int32_t batch_size = static_cast<int32_t>(encoder_out_shape[0]);
+  int32_t num_frames = static_cast<int32_t>(encoder_out_shape[1]);
+  int32_t vocab_size = model_->VocabSize();
+
+  std::vector<Hypotheses> cur;
+  for (auto &r : *result) {
+    cur.push_back(std::move(r.hyps));
+  }
+  std::vector<Hypothesis> prev;
+
+  for (int32_t t = 0; t != num_frames; ++t) {
+    // Flatten all beams into prev. Merged paths can leave a stream with fewer
+    // than max_active_paths_ entries, so hyps_num_split[b] marks its start.
+    std::vector<int32_t> hyps_num_split(1, 0);
+    prev.clear();
+    for (auto &hyps : cur) {
+      for (auto &h : hyps) {
+        prev.push_back(std::move(h.second));
+      }
+      hyps_num_split.push_back(static_cast<int32_t>(prev.size()));
+    }
+    cur.clear();
+    cur.reserve(batch_size);
+
+    Ort::Value decoder_input = model_->BuildDecoderInput(prev);
+    Ort::Value decoder_out = model_->RunDecoder(std::move(decoder_input));
+    Ort::Value cur_encoder_out =
+        GetEncoderOutFrame(model_->Allocator(), &encoder_out, t);
+    cur_encoder_out =
+        Repeat(model_->Allocator(), &cur_encoder_out, hyps_num_split);
+    Ort::Value logit = model_->RunJoiner(
+        std::move(cur_encoder_out), Clone(model_->Allocator(), &decoder_out));
+    float *p_logit = logit.GetTensorMutableData<float>();
+
+    for (int32_t b = 0; b < batch_size; ++b) {
+      int32_t start = hyps_num_split[b];
+      int32_t end = hyps_num_split[b + 1];
+      int32_t num_scores = vocab_size * (end - start);
+      LogSoftmax(p_logit, vocab_size, (end - start));
+      // Rank candidates by the score of the whole path (parent log_prob +
+      // log-prob of the new token), not by the new token's score alone.
+      std::vector<double> total(p_logit, p_logit + num_scores);
+      for (int32_t i = 0; i != num_scores; ++i) {
+        total[i] += prev[start + i / vocab_size].log_prob;
+      }
+      auto topk = TopkIndex(total.data(), num_scores, max_active_paths_);
+
+      Hypotheses hyps;
+      for (auto i : topk) {
+        int32_t new_token = i % vocab_size;
+        Hypothesis new_hyp = prev[i / vocab_size + start];
+        if (new_token != 0) {  // 0 is the blank id; blank extends nothing
+          new_hyp.ys.push_back(new_token);
+          new_hyp.num_trailing_blanks = 0;
+        } else {
+          ++new_hyp.num_trailing_blanks;
+        }
+        new_hyp.log_prob = total[i];
+        hyps.Add(std::move(new_hyp));
+      }
+      cur.push_back(std::move(hyps));
+      p_logit += num_scores;
+    }
+  }
+
+  for (int32_t b = 0; b != batch_size; ++b) {
+    (*result)[b].hyps = std::move(cur[b]);
+  }
+}
+
+}  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h
+++ b/sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h
@@ -0,0 +1,37 @@
+// sherpa-onnx/csrc/online-transducer-modified-beam-search-decoder.h
+//
+// Copyright (c)  2023  Pingfeng Luo
+// Copyright (c)  2023  Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_
+#define SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_
+
+#include <vector>
+
+#include "sherpa-onnx/csrc/online-transducer-decoder.h"
+#include "sherpa-onnx/csrc/online-transducer-model.h"
+
+namespace sherpa_onnx {
+
+class OnlineTransducerModifiedBeamSearchDecoder
+    : public OnlineTransducerDecoder {
+ public:
+  OnlineTransducerModifiedBeamSearchDecoder(OnlineTransducerModel *model,
+                                            int32_t max_active_paths)
+      : model_(model), max_active_paths_(max_active_paths) {}
+
+  OnlineTransducerDecoderResult GetEmptyResult() const override;
+
+  void StripLeadingBlanks(OnlineTransducerDecoderResult *r) const override;
+
+  void Decode(Ort::Value encoder_out,
+              std::vector<OnlineTransducerDecoderResult> *result) override;
+
+ private:
+  OnlineTransducerModel *model_;  // Not owned; the caller keeps it alive
+  int32_t max_active_paths_;  // top-k candidates kept per stream per frame
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_ONLINE_TRANSDUCER_MODIFIED_BEAM_SEARCH_DECODER_H_
--- a/sherpa-onnx/csrc/online-zipformer-transducer-model.cc
+++ b/sherpa-onnx/csrc/online-zipformer-transducer-model.cc
@@ -461,24 +461,6 @@ OnlineZipformerTransducerModel::RunEncoder(Ort::Value features,
  return {std::move(encoder_out[0]), std::move(next_states)};
 }

-Ort::Value OnlineZipformerTransducerModel::BuildDecoderInput(
-    const std::vector<OnlineTransducerDecoderResult> &results) {
-  int32_t batch_size = static_cast<int32_t>(results.size());
-  std::array<int64_t, 2> shape{batch_size, context_size_};
-  Ort::Value decoder_input =
-      Ort::Value::CreateTensor<int64_t>(allocator_, shape.data(), shape.size());
-  int64_t *p = decoder_input.GetTensorMutableData<int64_t>();
-
-  for (const auto &r : results) {
-    const int64_t *begin = r.tokens.data() + r.tokens.size() - context_size_;
-    const int64_t *end = r.tokens.data() + r.tokens.size();
-    std::copy(begin, end, p);
-    p += context_size_;
-  }
-
-  return decoder_input;
-}
-
 Ort::Value OnlineZipformerTransducerModel::RunDecoder(
    Ort::Value decoder_input) {
  auto decoder_out = decoder_sess_->Run(
--- a/sherpa-onnx/csrc/online-zipformer-transducer-model.h
+++ b/sherpa-onnx/csrc/online-zipformer-transducer-model.h
@@ -41,9 +41,6 @@ class OnlineZipformerTransducerModel : public OnlineTransducerModel {
  std::pair<Ort::Value, std::vector<Ort::Value>> RunEncoder(
      Ort::Value features, std::vector<Ort::Value> states) override;

-  Ort::Value BuildDecoderInput(
-      const std::vector<OnlineTransducerDecoderResult> &results) override;
-
  Ort::Value RunDecoder(Ort::Value decoder_input) override;

  Ort::Value RunJoiner(Ort::Value encoder_out, Ort::Value decoder_out) override;
--- a/sherpa-onnx/csrc/onnx-utils.cc
+++ b/sherpa-onnx/csrc/onnx-utils.cc
@@ -44,6 +44,38 @@ void GetOutputNames(Ort::Session *sess, std::vector<std::string> *output_names,
  }
 }

+// Copies frame t of every stream out of encoder_out, whose shape is read as
+// (batch_size, num_frames, encoder_out_dim), into a new tensor of shape
+// (batch_size, encoder_out_dim) created with allocator.
+Ort::Value GetEncoderOutFrame(OrtAllocator *allocator, Ort::Value *encoder_out,
+                              int32_t t) {
+  std::vector<int64_t> encoder_out_shape =
+      encoder_out->GetTensorTypeAndShapeInfo().GetShape();
+
+  auto batch_size = encoder_out_shape[0];
+  auto num_frames = encoder_out_shape[1];
+  assert(t >= 0 && t < num_frames);
+
+  auto encoder_out_dim = encoder_out_shape[2];
+  // Distance between the same frame of two consecutive streams.
+  auto offset = num_frames * encoder_out_dim;
+
+  std::array<int64_t, 2> shape{batch_size, encoder_out_dim};
+
+  Ort::Value ans =
+      Ort::Value::CreateTensor<float>(allocator, shape.data(), shape.size());
+
+  float *dst = ans.GetTensorMutableData<float>();
+  const float *src = encoder_out->GetTensorData<float>();
+
+  for (int32_t i = 0; i != batch_size; ++i) {
+    std::copy(src + t * encoder_out_dim, src + (t + 1) * encoder_out_dim, dst);
+    src += offset;
+    dst += encoder_out_dim;
+  }
+  return ans;
+}
+
 void PrintModelMetadata(std::ostream &os, const Ort::ModelMetadata &meta_data) {
  Ort::AllocatorWithDefaultOptions allocator;
  std::vector<Ort::AllocatedStringPtr> v =
--- a/sherpa-onnx/csrc/onnx-utils.h
+++ b/sherpa-onnx/csrc/onnx-utils.h
@@ -10,6 +10,7 @@
 #include <locale>
 #endif

+#include <cassert>
 #include <ostream>
 #include <string>
 #include <vector>
@@ -57,6 +58,17 @@ void GetInputNames(Ort::Session *sess, std::vector<std::string> *input_names,
 void GetOutputNames(Ort::Session *sess, std::vector<std::string> *output_names,
                    std::vector<const char *> *output_names_ptr);

+/**
+ * Copy frame `t` of every stream in the encoder output into a new tensor.
+ *
+ * @param allocator onnxruntime allocator used to create the returned tensor
+ * @param encoder_out encoder output, read as (batch, num_frames, dim)
+ * @param t index of the frame to extract; must be less than num_frames
+ * @return a tensor of shape (batch, dim)
+ */
+Ort::Value GetEncoderOutFrame(OrtAllocator *allocator, Ort::Value *encoder_out,
+                              int32_t t);
+
 void PrintModelMetadata(std::ostream &os,
                        const Ort::ModelMetadata &meta_data);  // NOLINT