Add online stream. (#28)

2023-02-19 11:42:15 +08:00
parent 0f6f58d1d3
commit d4b0c0590a
5 changed files with 191 additions and 29 deletions
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR})
 add_executable(sherpa-onnx
  features.cc
  online-lstm-transducer-model.cc
  online-stream.cc
  online-transducer-greedy-search-decoder.cc
  online-transducer-model-config.cc
  online-transducer-model.cc
--- a/sherpa-onnx/csrc/features.h
+++ b/sherpa-onnx/csrc/features.h
@@ -11,16 +11,12 @@
 namespace sherpa_onnx {
 struct FeatureExtractorConfig {
-  int32_t sampling_rate = 16000;
+  float sampling_rate = 16000;
  int32_t feature_dim = 80;
 };
 class FeatureExtractor {
 public:
  /**
   * @param sampling_rate  Sampling rate of the data used to train the model.
   * @param feature_dim    Dimension of the features used to train the model.
   */
  explicit FeatureExtractor(const FeatureExtractorConfig &config = {});
  ~FeatureExtractor();
@@ -32,16 +28,19 @@ class FeatureExtractor {
   */
  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
-  // InputFinished() tells the class you won't be providing any
+  /**
-  // more waveform.  This will help flush out the last frame or two
+   * InputFinished() tells the class you won't be providing any
-  // of features, in the case where snip-edges == false; it also
+   * more waveform.  This will help flush out the last frame or two
-  // affects the return value of IsLastFrame().
+   * of features, in the case where snip-edges == false; it also
   * affects the return value of IsLastFrame().
   */
  void InputFinished();
  int32_t NumFramesReady() const;
-  // Note: IsLastFrame() will only ever return true if you have called
+  /** Note: IsLastFrame() will only ever return true if you have called
-  // InputFinished() (and this frame is the last frame).
+   * InputFinished() (and this frame is the last frame).
   */
  bool IsLastFrame(int32_t frame) const;
  /** Get n frames starting from the given frame index.
--- a/sherpa-onnx/csrc/online-stream.cc
+++ b/sherpa-onnx/csrc/online-stream.cc
@@ -0,0 +1,89 @@
 // sherpa-onnx/csrc/online-stream.cc
 //
 // Copyright (c)  2023  Xiaomi Corporation
 #include "sherpa-onnx/csrc/online-stream.h"
 #include <memory>
 #include <vector>
 #include "sherpa-onnx/csrc/features.h"
 namespace sherpa_onnx {
 // Private implementation of OnlineStream (pimpl idiom).
 //
 // It owns the feature extractor and the per-stream decoding state: the
 // number of feature frames already consumed by the caller and the most
 // recently stored decoding result.
 class OnlineStream::Impl {
 public:
  // @param config  Configuration forwarded to the feature extractor.
  explicit Impl(const FeatureExtractorConfig &config)
      : feat_extractor_(config) {}

  // Forward n samples starting at waveform to the feature extractor.
  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n) {
    feat_extractor_.AcceptWaveform(sampling_rate, waveform, n);
  }

  // Tell the feature extractor that no more samples will be provided.
  void InputFinished() { feat_extractor_.InputFinished(); }

  // Number of feature frames reported as ready by the feature extractor.
  int32_t NumFramesReady() const { return feat_extractor_.NumFramesReady(); }

  // Whether the feature extractor considers `frame` to be the last frame.
  bool IsLastFrame(int32_t frame) const {
    return feat_extractor_.IsLastFrame(frame);
  }

  // Return n frames starting at frame_index, as produced by the feature
  // extractor.
  std::vector<float> GetFrames(int32_t frame_index, int32_t n) const {
    return feat_extractor_.GetFrames(frame_index, n);
  }

  // Reset the feature extractor and the processed-frame counter.
  //
  // The counter is an index into the frames held by the feature extractor.
  // Leaving it at its old value after the extractor has been reset would
  // let it exceed NumFramesReady(), which violates the contract documented
  // for OnlineStream::GetNumProcessedFrames(), so it is set back to 0 here.
  //
  // The stored decoding result is left untouched; callers that want to
  // clear it can do so via SetResult().
  void Reset() {
    feat_extractor_.Reset();
    num_processed_frames_ = 0;
  }

  // Mutable access to the processed-frame counter; callers advance it
  // themselves after consuming frames.
  int32_t &GetNumProcessedFrames() { return num_processed_frames_; }

  // Store a copy of r as the current decoding result.
  void SetResult(const OnlineTransducerDecoderResult &r) { result_ = r; }

  // Return the stored decoding result (default-constructed until
  // SetResult() has been called).
  const OnlineTransducerDecoderResult &GetResult() const { return result_; }

  // Feature dimension reported by the feature extractor.
  int32_t FeatureDim() const { return feat_extractor_.FeatureDim(); }

 private:
  FeatureExtractor feat_extractor_;
  int32_t num_processed_frames_ = 0;  // before subsampling
  OnlineTransducerDecoderResult result_;
 };
 OnlineStream::OnlineStream(const FeatureExtractorConfig &config /*= {}*/)
    : impl_(std::make_unique<Impl>(config)) {}
 // Defined here rather than in the header because Impl is only
 // forward-declared there; std::unique_ptr<Impl> needs the complete type
 // at the point where its deleter is instantiated.
 OnlineStream::~OnlineStream() = default;
 void OnlineStream::AcceptWaveform(float sampling_rate, const float *waveform,
                                  int32_t n) {
  impl_->AcceptWaveform(sampling_rate, waveform, n);
 }
 void OnlineStream::InputFinished() { impl_->InputFinished(); }
 // Number of feature frames the implementation reports as ready.
 int32_t OnlineStream::NumFramesReady() const {
  const int32_t num_ready = impl_->NumFramesReady();
  return num_ready;
 }
 // Ask the implementation whether `frame` is the last frame.
 bool OnlineStream::IsLastFrame(int32_t frame) const {
  const bool is_last = impl_->IsLastFrame(frame);
  return is_last;
 }
 std::vector<float> OnlineStream::GetFrames(int32_t frame_index,
                                           int32_t n) const {
  return impl_->GetFrames(frame_index, n);
 }
 void OnlineStream::Reset() { impl_->Reset(); }
 // Feature dimension as reported by the implementation.
 int32_t OnlineStream::FeatureDim() const {
  const int32_t dim = impl_->FeatureDim();
  return dim;
 }
 // Expose the implementation's processed-frame counter by reference so the
 // caller can read and update it in place.
 int32_t &OnlineStream::GetNumProcessedFrames() {
  int32_t &num_processed = impl_->GetNumProcessedFrames();
  return num_processed;
 }
 void OnlineStream::SetResult(const OnlineTransducerDecoderResult &r) {
  impl_->SetResult(r);
 }
 // Return a reference to the result held by the implementation.
 const OnlineTransducerDecoderResult &OnlineStream::GetResult() const {
  const auto &impl = *impl_;
  return impl.GetResult();
 }
 }  // namespace sherpa_onnx
--- a/sherpa-onnx/csrc/online-stream.h
+++ b/sherpa-onnx/csrc/online-stream.h
@@ -0,0 +1,73 @@
 // sherpa-onnx/csrc/online-stream.h
 //
 // Copyright (c)  2023  Xiaomi Corporation
 #ifndef SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
 #define SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
 #include <memory>
 #include <vector>
 #include "sherpa-onnx/csrc/features.h"
 #include "sherpa-onnx/csrc/online-transducer-decoder.h"
 namespace sherpa_onnx {
 // A single streaming input: it accepts waveform samples, exposes the
 // feature frames computed from them, and carries per-stream decoding state
 // (the number of frames already processed and the current decoding result).
 //
 // All work is delegated to a private implementation (see online-stream.cc).
 class OnlineStream {
 public:
  // @param config  Configuration for the underlying feature extractor.
  explicit OnlineStream(const FeatureExtractorConfig &config = {});
  // Declared here and defined in the .cc file, where Impl is a complete type.
  ~OnlineStream();
  /**
     Feed waveform samples to the underlying feature extractor.

     @param sampling_rate The sampling_rate of the input waveform. Should match
                          the one expected by the feature extractor.
     @param waveform Pointer to a 1-D array of size n
     @param n Number of entries in waveform
   */
  void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
  /**
   * InputFinished() tells the class you won't be providing any
   * more waveform.  This will help flush out the last frame or two
   * of features, in the case where snip-edges == false; it also
   * affects the return value of IsLastFrame().
   */
  void InputFinished();
  // Return the number of feature frames reported as ready by the underlying
  // feature extractor.
  int32_t NumFramesReady() const;
  /** Note: IsLastFrame() will only ever return true if you have called
   * InputFinished() (and this frame is the last frame).
   */
  bool IsLastFrame(int32_t frame) const;
  /** Get n frames starting from the given frame index.
   *
   * @param frame_index  The starting frame index
   * @param n  Number of frames to get.
   * @return Return a 2-D tensor of shape (n, feature_dim), which is
   *         flattened into a 1-D vector in row-major order.
   */
  std::vector<float> GetFrames(int32_t frame_index, int32_t n) const;
  // Reset the underlying feature extractor.
  void Reset();
  // Return the feature dimension reported by the underlying feature
  // extractor.
  int32_t FeatureDim() const;
  // Return a reference to the number of processed frames so far.
  // Initially, it is 0. The caller updates it through the returned
  // reference and is expected to keep it no greater than NumFramesReady().
  //
  // The returned reference is valid as long as this object is alive.
  int32_t &GetNumProcessedFrames();
  // Store a copy of r as the decoding result of this stream.
  void SetResult(const OnlineTransducerDecoderResult &r);
  // Return the decoding result most recently stored with SetResult().
  const OnlineTransducerDecoderResult &GetResult() const;
 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
 };
 }  // namespace sherpa_onnx
 #endif  // SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
--- a/sherpa-onnx/csrc/sherpa-onnx.cc
+++ b/sherpa-onnx/csrc/sherpa-onnx.cc
@@ -8,8 +8,7 @@
 #include <string>
 #include <vector>
-#include "kaldi-native-fbank/csrc/online-feature.h"
+#include "sherpa-onnx/csrc/online-stream.h"
 #include "sherpa-onnx/csrc/features.h"
 #include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
 #include "sherpa-onnx/csrc/online-transducer-model-config.h"
 #include "sherpa-onnx/csrc/online-transducer-model.h"
@@ -64,7 +63,7 @@ for a list of pre-trained models to download.
  std::vector<Ort::Value> states = model->GetEncoderInitStates();
-  int32_t expected_sampling_rate = 16000;
+  float expected_sampling_rate = 16000;
  bool is_ok = false;
  std::vector<float> samples =
@@ -75,7 +74,7 @@ for a list of pre-trained models to download.
    return -1;
  }
-  float duration = samples.size() / static_cast<float>(expected_sampling_rate);
+  float duration = samples.size() / expected_sampling_rate;
  fprintf(stderr, "wav filename: %s\n", wav_filename.c_str());
  fprintf(stderr, "wav duration (s): %.3f\n", duration);
@@ -83,32 +82,33 @@ for a list of pre-trained models to download.
  auto begin = std::chrono::steady_clock::now();
  fprintf(stderr, "Started\n");
-  sherpa_onnx::FeatureExtractor feat_extractor;
+  sherpa_onnx::OnlineStream stream;
-  feat_extractor.AcceptWaveform(expected_sampling_rate, samples.data(),
+  stream.AcceptWaveform(expected_sampling_rate, samples.data(), samples.size());
                                samples.size());
  std::vector<float> tail_paddings(
      static_cast<int>(0.2 * expected_sampling_rate));
-  feat_extractor.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
+  stream.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
                        tail_paddings.size());
-  feat_extractor.InputFinished();
+  stream.InputFinished();
-  int32_t num_frames = feat_extractor.NumFramesReady();
+  int32_t num_frames = stream.NumFramesReady();
-  int32_t feature_dim = feat_extractor.FeatureDim();
+  int32_t feature_dim = stream.FeatureDim();
  std::array<int64_t, 3> x_shape{1, chunk_size, feature_dim};
  sherpa_onnx::OnlineTransducerGreedySearchDecoder decoder(model.get());
  std::vector<sherpa_onnx::OnlineTransducerDecoderResult> result = {
      decoder.GetEmptyResult()};
-
+  while (stream.NumFramesReady() - stream.GetNumProcessedFrames() >
-  for (int32_t start = 0; start + chunk_size < num_frames;
+         chunk_size) {
-       start += chunk_shift) {
+    std::vector<float> features =
-    std::vector<float> features = feat_extractor.GetFrames(start, chunk_size);
+        stream.GetFrames(stream.GetNumProcessedFrames(), chunk_size);
    stream.GetNumProcessedFrames() += chunk_shift;
    Ort::Value x =
        Ort::Value::CreateTensor(memory_info, features.data(), features.size(),
                                 x_shape.data(), x_shape.size());
    auto pair = model->RunEncoder(std::move(x), states);
    states = std::move(pair.second);
    decoder.Decode(std::move(pair.first), &result);
@@ -116,8 +116,8 @@ for a list of pre-trained models to download.
  decoder.StripLeadingBlanks(&result[0]);
  const auto &hyp = result[0].tokens;
  std::string text;
-  for (size_t i = model->ContextSize(); i != hyp.size(); ++i) {
+  for (auto t : hyp) {
-    text += sym[hyp[i]];
+    text += sym[t];
  }
  fprintf(stderr, "Done!\n");