Add online stream. (#28)
This commit is contained in:
@@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR})
|
|||||||
add_executable(sherpa-onnx
|
add_executable(sherpa-onnx
|
||||||
features.cc
|
features.cc
|
||||||
online-lstm-transducer-model.cc
|
online-lstm-transducer-model.cc
|
||||||
|
online-stream.cc
|
||||||
online-transducer-greedy-search-decoder.cc
|
online-transducer-greedy-search-decoder.cc
|
||||||
online-transducer-model-config.cc
|
online-transducer-model-config.cc
|
||||||
online-transducer-model.cc
|
online-transducer-model.cc
|
||||||
|
|||||||
@@ -11,16 +11,12 @@
|
|||||||
namespace sherpa_onnx {
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
struct FeatureExtractorConfig {
|
struct FeatureExtractorConfig {
|
||||||
int32_t sampling_rate = 16000;
|
float sampling_rate = 16000;
|
||||||
int32_t feature_dim = 80;
|
int32_t feature_dim = 80;
|
||||||
};
|
};
|
||||||
|
|
||||||
class FeatureExtractor {
|
class FeatureExtractor {
|
||||||
public:
|
public:
|
||||||
/**
|
|
||||||
* @param sampling_rate Sampling rate of the data used to train the model.
|
|
||||||
* @param feature_dim Dimension of the features used to train the model.
|
|
||||||
*/
|
|
||||||
explicit FeatureExtractor(const FeatureExtractorConfig &config = {});
|
explicit FeatureExtractor(const FeatureExtractorConfig &config = {});
|
||||||
~FeatureExtractor();
|
~FeatureExtractor();
|
||||||
|
|
||||||
@@ -32,16 +28,19 @@ class FeatureExtractor {
|
|||||||
*/
|
*/
|
||||||
void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
|
void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
|
||||||
|
|
||||||
// InputFinished() tells the class you won't be providing any
|
/**
|
||||||
// more waveform. This will help flush out the last frame or two
|
* InputFinished() tells the class you won't be providing any
|
||||||
// of features, in the case where snip-edges == false; it also
|
* more waveform. This will help flush out the last frame or two
|
||||||
// affects the return value of IsLastFrame().
|
* of features, in the case where snip-edges == false; it also
|
||||||
|
* affects the return value of IsLastFrame().
|
||||||
|
*/
|
||||||
void InputFinished();
|
void InputFinished();
|
||||||
|
|
||||||
int32_t NumFramesReady() const;
|
int32_t NumFramesReady() const;
|
||||||
|
|
||||||
// Note: IsLastFrame() will only ever return true if you have called
|
/** Note: IsLastFrame() will only ever return true if you have called
|
||||||
// InputFinished() (and this frame is the last frame).
|
* InputFinished() (and this frame is the last frame).
|
||||||
|
*/
|
||||||
bool IsLastFrame(int32_t frame) const;
|
bool IsLastFrame(int32_t frame) const;
|
||||||
|
|
||||||
/** Get n frames starting from the given frame index.
|
/** Get n frames starting from the given frame index.
|
||||||
|
|||||||
89
sherpa-onnx/csrc/online-stream.cc
Normal file
89
sherpa-onnx/csrc/online-stream.cc
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
// sherpa-onnx/csrc/online-stream.cc
|
||||||
|
//
|
||||||
|
// Copyright (c) 2023 Xiaomi Corporation
|
||||||
|
#include "sherpa-onnx/csrc/online-stream.h"
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/features.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
class OnlineStream::Impl {
|
||||||
|
public:
|
||||||
|
explicit Impl(const FeatureExtractorConfig &config)
|
||||||
|
: feat_extractor_(config) {}
|
||||||
|
|
||||||
|
void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n) {
|
||||||
|
feat_extractor_.AcceptWaveform(sampling_rate, waveform, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
void InputFinished() { feat_extractor_.InputFinished(); }
|
||||||
|
|
||||||
|
int32_t NumFramesReady() const { return feat_extractor_.NumFramesReady(); }
|
||||||
|
|
||||||
|
bool IsLastFrame(int32_t frame) const {
|
||||||
|
return feat_extractor_.IsLastFrame(frame);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> GetFrames(int32_t frame_index, int32_t n) const {
|
||||||
|
return feat_extractor_.GetFrames(frame_index, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Reset() { feat_extractor_.Reset(); }
|
||||||
|
|
||||||
|
int32_t &GetNumProcessedFrames() { return num_processed_frames_; }
|
||||||
|
|
||||||
|
void SetResult(const OnlineTransducerDecoderResult &r) { result_ = r; }
|
||||||
|
|
||||||
|
const OnlineTransducerDecoderResult &GetResult() const { return result_; }
|
||||||
|
|
||||||
|
int32_t FeatureDim() const { return feat_extractor_.FeatureDim(); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
FeatureExtractor feat_extractor_;
|
||||||
|
int32_t num_processed_frames_ = 0; // before subsampling
|
||||||
|
OnlineTransducerDecoderResult result_;
|
||||||
|
};
|
||||||
|
|
||||||
|
OnlineStream::OnlineStream(const FeatureExtractorConfig &config /*= {}*/)
|
||||||
|
: impl_(std::make_unique<Impl>(config)) {}
|
||||||
|
|
||||||
|
OnlineStream::~OnlineStream() = default;
|
||||||
|
|
||||||
|
void OnlineStream::AcceptWaveform(float sampling_rate, const float *waveform,
|
||||||
|
int32_t n) {
|
||||||
|
impl_->AcceptWaveform(sampling_rate, waveform, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
void OnlineStream::InputFinished() { impl_->InputFinished(); }
|
||||||
|
|
||||||
|
int32_t OnlineStream::NumFramesReady() const { return impl_->NumFramesReady(); }
|
||||||
|
|
||||||
|
bool OnlineStream::IsLastFrame(int32_t frame) const {
|
||||||
|
return impl_->IsLastFrame(frame);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> OnlineStream::GetFrames(int32_t frame_index,
|
||||||
|
int32_t n) const {
|
||||||
|
return impl_->GetFrames(frame_index, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
void OnlineStream::Reset() { impl_->Reset(); }
|
||||||
|
|
||||||
|
int32_t OnlineStream::FeatureDim() const { return impl_->FeatureDim(); }
|
||||||
|
|
||||||
|
int32_t &OnlineStream::GetNumProcessedFrames() {
|
||||||
|
return impl_->GetNumProcessedFrames();
|
||||||
|
}
|
||||||
|
|
||||||
|
void OnlineStream::SetResult(const OnlineTransducerDecoderResult &r) {
|
||||||
|
impl_->SetResult(r);
|
||||||
|
}
|
||||||
|
|
||||||
|
const OnlineTransducerDecoderResult &OnlineStream::GetResult() const {
|
||||||
|
return impl_->GetResult();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
73
sherpa-onnx/csrc/online-stream.h
Normal file
73
sherpa-onnx/csrc/online-stream.h
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
// sherpa-onnx/csrc/online-stream.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2023 Xiaomi Corporation
|
||||||
|
|
||||||
|
#ifndef SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
|
||||||
|
#define SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "sherpa-onnx/csrc/features.h"
|
||||||
|
#include "sherpa-onnx/csrc/online-transducer-decoder.h"
|
||||||
|
|
||||||
|
namespace sherpa_onnx {
|
||||||
|
|
||||||
|
class OnlineStream {
|
||||||
|
public:
|
||||||
|
explicit OnlineStream(const FeatureExtractorConfig &config = {});
|
||||||
|
~OnlineStream();
|
||||||
|
|
||||||
|
/**
|
||||||
|
@param sampling_rate The sampling_rate of the input waveform. Should match
|
||||||
|
the one expected by the feature extractor.
|
||||||
|
@param waveform Pointer to a 1-D array of size n
|
||||||
|
@param n Number of entries in waveform
|
||||||
|
*/
|
||||||
|
void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* InputFinished() tells the class you won't be providing any
|
||||||
|
* more waveform. This will help flush out the last frame or two
|
||||||
|
* of features, in the case where snip-edges == false; it also
|
||||||
|
* affects the return value of IsLastFrame().
|
||||||
|
*/
|
||||||
|
void InputFinished();
|
||||||
|
|
||||||
|
int32_t NumFramesReady() const;
|
||||||
|
|
||||||
|
/** Note: IsLastFrame() will only ever return true if you have called
|
||||||
|
* InputFinished() (and this frame is the last frame).
|
||||||
|
*/
|
||||||
|
bool IsLastFrame(int32_t frame) const;
|
||||||
|
|
||||||
|
/** Get n frames starting from the given frame index.
|
||||||
|
*
|
||||||
|
* @param frame_index The starting frame index
|
||||||
|
* @param n Number of frames to get.
|
||||||
|
* @return Return a 2-D tensor of shape (n, feature_dim).
|
||||||
|
* which is flattened into a 1-D vector (flattened in in row major)
|
||||||
|
*/
|
||||||
|
std::vector<float> GetFrames(int32_t frame_index, int32_t n) const;
|
||||||
|
|
||||||
|
void Reset();
|
||||||
|
|
||||||
|
int32_t FeatureDim() const;
|
||||||
|
|
||||||
|
// Return a reference to the number of processed frames so far.
|
||||||
|
// Initially, it is 0. It is always less than NumFramesReady().
|
||||||
|
//
|
||||||
|
// The returned reference is valid as long as this object is alive.
|
||||||
|
int32_t &GetNumProcessedFrames();
|
||||||
|
|
||||||
|
void SetResult(const OnlineTransducerDecoderResult &r);
|
||||||
|
const OnlineTransducerDecoderResult &GetResult() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
class Impl;
|
||||||
|
std::unique_ptr<Impl> impl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace sherpa_onnx
|
||||||
|
|
||||||
|
#endif // SHERPA_ONNX_CSRC_ONLINE_STREAM_H_
|
||||||
@@ -8,8 +8,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "kaldi-native-fbank/csrc/online-feature.h"
|
#include "sherpa-onnx/csrc/online-stream.h"
|
||||||
#include "sherpa-onnx/csrc/features.h"
|
|
||||||
#include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
|
#include "sherpa-onnx/csrc/online-transducer-greedy-search-decoder.h"
|
||||||
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
|
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
|
||||||
#include "sherpa-onnx/csrc/online-transducer-model.h"
|
#include "sherpa-onnx/csrc/online-transducer-model.h"
|
||||||
@@ -64,7 +63,7 @@ for a list of pre-trained models to download.
|
|||||||
|
|
||||||
std::vector<Ort::Value> states = model->GetEncoderInitStates();
|
std::vector<Ort::Value> states = model->GetEncoderInitStates();
|
||||||
|
|
||||||
int32_t expected_sampling_rate = 16000;
|
float expected_sampling_rate = 16000;
|
||||||
|
|
||||||
bool is_ok = false;
|
bool is_ok = false;
|
||||||
std::vector<float> samples =
|
std::vector<float> samples =
|
||||||
@@ -75,7 +74,7 @@ for a list of pre-trained models to download.
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
float duration = samples.size() / static_cast<float>(expected_sampling_rate);
|
float duration = samples.size() / expected_sampling_rate;
|
||||||
|
|
||||||
fprintf(stderr, "wav filename: %s\n", wav_filename.c_str());
|
fprintf(stderr, "wav filename: %s\n", wav_filename.c_str());
|
||||||
fprintf(stderr, "wav duration (s): %.3f\n", duration);
|
fprintf(stderr, "wav duration (s): %.3f\n", duration);
|
||||||
@@ -83,32 +82,33 @@ for a list of pre-trained models to download.
|
|||||||
auto begin = std::chrono::steady_clock::now();
|
auto begin = std::chrono::steady_clock::now();
|
||||||
fprintf(stderr, "Started\n");
|
fprintf(stderr, "Started\n");
|
||||||
|
|
||||||
sherpa_onnx::FeatureExtractor feat_extractor;
|
sherpa_onnx::OnlineStream stream;
|
||||||
feat_extractor.AcceptWaveform(expected_sampling_rate, samples.data(),
|
stream.AcceptWaveform(expected_sampling_rate, samples.data(), samples.size());
|
||||||
samples.size());
|
|
||||||
|
|
||||||
std::vector<float> tail_paddings(
|
std::vector<float> tail_paddings(
|
||||||
static_cast<int>(0.2 * expected_sampling_rate));
|
static_cast<int>(0.2 * expected_sampling_rate));
|
||||||
feat_extractor.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
|
stream.AcceptWaveform(expected_sampling_rate, tail_paddings.data(),
|
||||||
tail_paddings.size());
|
tail_paddings.size());
|
||||||
feat_extractor.InputFinished();
|
stream.InputFinished();
|
||||||
|
|
||||||
int32_t num_frames = feat_extractor.NumFramesReady();
|
int32_t num_frames = stream.NumFramesReady();
|
||||||
int32_t feature_dim = feat_extractor.FeatureDim();
|
int32_t feature_dim = stream.FeatureDim();
|
||||||
|
|
||||||
std::array<int64_t, 3> x_shape{1, chunk_size, feature_dim};
|
std::array<int64_t, 3> x_shape{1, chunk_size, feature_dim};
|
||||||
|
|
||||||
sherpa_onnx::OnlineTransducerGreedySearchDecoder decoder(model.get());
|
sherpa_onnx::OnlineTransducerGreedySearchDecoder decoder(model.get());
|
||||||
std::vector<sherpa_onnx::OnlineTransducerDecoderResult> result = {
|
std::vector<sherpa_onnx::OnlineTransducerDecoderResult> result = {
|
||||||
decoder.GetEmptyResult()};
|
decoder.GetEmptyResult()};
|
||||||
|
while (stream.NumFramesReady() - stream.GetNumProcessedFrames() >
|
||||||
for (int32_t start = 0; start + chunk_size < num_frames;
|
chunk_size) {
|
||||||
start += chunk_shift) {
|
std::vector<float> features =
|
||||||
std::vector<float> features = feat_extractor.GetFrames(start, chunk_size);
|
stream.GetFrames(stream.GetNumProcessedFrames(), chunk_size);
|
||||||
|
stream.GetNumProcessedFrames() += chunk_shift;
|
||||||
|
|
||||||
Ort::Value x =
|
Ort::Value x =
|
||||||
Ort::Value::CreateTensor(memory_info, features.data(), features.size(),
|
Ort::Value::CreateTensor(memory_info, features.data(), features.size(),
|
||||||
x_shape.data(), x_shape.size());
|
x_shape.data(), x_shape.size());
|
||||||
|
|
||||||
auto pair = model->RunEncoder(std::move(x), states);
|
auto pair = model->RunEncoder(std::move(x), states);
|
||||||
states = std::move(pair.second);
|
states = std::move(pair.second);
|
||||||
decoder.Decode(std::move(pair.first), &result);
|
decoder.Decode(std::move(pair.first), &result);
|
||||||
@@ -116,8 +116,8 @@ for a list of pre-trained models to download.
|
|||||||
decoder.StripLeadingBlanks(&result[0]);
|
decoder.StripLeadingBlanks(&result[0]);
|
||||||
const auto &hyp = result[0].tokens;
|
const auto &hyp = result[0].tokens;
|
||||||
std::string text;
|
std::string text;
|
||||||
for (size_t i = model->ContextSize(); i != hyp.size(); ++i) {
|
for (auto t : hyp) {
|
||||||
text += sym[hyp[i]];
|
text += sym[t];
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "Done!\n");
|
fprintf(stderr, "Done!\n");
|
||||||
|
|||||||
Reference in New Issue
Block a user