Support streaming zipformer CTC (#496)

* Support streaming zipformer CTC

* test online zipformer2 CTC

* Update doc of sherpa-onnx.cc

* Add Python APIs for streaming zipformer2 ctc

* Add Python API examples for streaming zipformer2 ctc

* Swift API for streaming zipformer2 CTC

* NodeJS API for streaming zipformer2 CTC

* Kotlin API for streaming zipformer2 CTC

* Golang API for streaming zipformer2 CTC

* C# API for streaming zipformer2 CTC

* Release v1.9.6
This commit is contained in:
Fangjun Kuang
2023-12-22 13:46:33 +08:00
committed by GitHub
parent 7634f5f034
commit e475e750ac
70 changed files with 1517 additions and 211 deletions

View File

@@ -54,6 +54,9 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
recognizer_config.model_config.paraformer.decoder =
SHERPA_ONNX_OR(config->model_config.paraformer.decoder, "");
recognizer_config.model_config.zipformer2_ctc.model =
SHERPA_ONNX_OR(config->model_config.zipformer2_ctc.model, "");
recognizer_config.model_config.tokens =
SHERPA_ONNX_OR(config->model_config.tokens, "");
recognizer_config.model_config.num_threads =

View File

@@ -66,9 +66,17 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineParaformerModelConfig {
const char *decoder;
} SherpaOnnxOnlineParaformerModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxModelConfig {
// Please visit
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/zipformer-ctc-models.html#
// to download pre-trained streaming zipformer2 ctc models
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineZipformer2CtcModelConfig {
const char *model;
} SherpaOnnxOnlineZipformer2CtcModelConfig;
SHERPA_ONNX_API typedef struct SherpaOnnxOnlineModelConfig {
SherpaOnnxOnlineTransducerModelConfig transducer;
SherpaOnnxOnlineParaformerModelConfig paraformer;
SherpaOnnxOnlineZipformer2CtcModelConfig zipformer2_ctc;
const char *tokens;
int32_t num_threads;
const char *provider;

View File

@@ -70,6 +70,8 @@ set(sources
online-wenet-ctc-model-config.cc
online-wenet-ctc-model.cc
online-zipformer-transducer-model.cc
online-zipformer2-ctc-model-config.cc
online-zipformer2-ctc-model.cc
online-zipformer2-transducer-model.cc
onnx-utils.cc
packed-sequence.cc

View File

@@ -12,6 +12,9 @@
namespace sherpa_onnx {
struct OnlineCtcDecoderResult {
/// Number of frames after subsampling we have decoded so far
int32_t frame_offset = 0;
/// The decoded token IDs
std::vector<int64_t> tokens;

View File

@@ -49,12 +49,17 @@ void OnlineCtcGreedySearchDecoder::Decode(
if (y != blank_id_ && y != prev_id) {
r.tokens.push_back(y);
r.timestamps.push_back(t);
r.timestamps.push_back(t + r.frame_offset);
}
prev_id = y;
} // for (int32_t t = 0; t != num_frames; ++t) {
} // for (int32_t b = 0; b != batch_size; ++b)
// Update frame_offset
for (auto &r : *results) {
r.frame_offset += num_frames;
}
}
} // namespace sherpa_onnx

View File

@@ -11,127 +11,35 @@
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/online-wenet-ctc-model.h"
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
namespace {
enum class ModelType {
kZipformerCtc,
kWenetCtc,
kUnkown,
};
} // namespace
namespace sherpa_onnx {
static ModelType GetModelType(char *model_data, size_t model_data_length,
bool debug) {
Ort::Env env(ORT_LOGGING_LEVEL_WARNING);
Ort::SessionOptions sess_opts;
auto sess = std::make_unique<Ort::Session>(env, model_data, model_data_length,
sess_opts);
Ort::ModelMetadata meta_data = sess->GetModelMetadata();
if (debug) {
std::ostringstream os;
PrintModelMetadata(os, meta_data);
SHERPA_ONNX_LOGE("%s", os.str().c_str());
}
Ort::AllocatorWithDefaultOptions allocator;
auto model_type =
meta_data.LookupCustomMetadataMapAllocated("model_type", allocator);
if (!model_type) {
SHERPA_ONNX_LOGE(
"No model_type in the metadata!\n"
"If you are using models from WeNet, please refer to\n"
"https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/"
"run.sh\n"
"\n"
"for how to add metadta to model.onnx\n");
return ModelType::kUnkown;
}
if (model_type.get() == std::string("zipformer2")) {
return ModelType::kZipformerCtc;
} else if (model_type.get() == std::string("wenet_ctc")) {
return ModelType::kWenetCtc;
} else {
SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get());
return ModelType::kUnkown;
}
}
std::unique_ptr<OnlineCtcModel> OnlineCtcModel::Create(
const OnlineModelConfig &config) {
ModelType model_type = ModelType::kUnkown;
std::string filename;
if (!config.wenet_ctc.model.empty()) {
filename = config.wenet_ctc.model;
return std::make_unique<OnlineWenetCtcModel>(config);
} else if (!config.zipformer2_ctc.model.empty()) {
return std::make_unique<OnlineZipformer2CtcModel>(config);
} else {
SHERPA_ONNX_LOGE("Please specify a CTC model");
exit(-1);
}
{
auto buffer = ReadFile(filename);
model_type = GetModelType(buffer.data(), buffer.size(), config.debug);
}
switch (model_type) {
case ModelType::kZipformerCtc:
return nullptr;
// return std::make_unique<OnlineZipformerCtcModel>(config);
break;
case ModelType::kWenetCtc:
return std::make_unique<OnlineWenetCtcModel>(config);
break;
case ModelType::kUnkown:
SHERPA_ONNX_LOGE("Unknown model type in online CTC!");
return nullptr;
}
return nullptr;
}
#if __ANDROID_API__ >= 9
std::unique_ptr<OnlineCtcModel> OnlineCtcModel::Create(
AAssetManager *mgr, const OnlineModelConfig &config) {
ModelType model_type = ModelType::kUnkown;
std::string filename;
if (!config.wenet_ctc.model.empty()) {
filename = config.wenet_ctc.model;
return std::make_unique<OnlineWenetCtcModel>(mgr, config);
} else if (!config.zipformer2_ctc.model.empty()) {
return std::make_unique<OnlineZipformer2CtcModel>(mgr, config);
} else {
SHERPA_ONNX_LOGE("Please specify a CTC model");
exit(-1);
}
{
auto buffer = ReadFile(mgr, filename);
model_type = GetModelType(buffer.data(), buffer.size(), config.debug);
}
switch (model_type) {
case ModelType::kZipformerCtc:
return nullptr;
// return std::make_unique<OnlineZipformerCtcModel>(mgr, config);
break;
case ModelType::kWenetCtc:
return std::make_unique<OnlineWenetCtcModel>(mgr, config);
break;
case ModelType::kUnkown:
SHERPA_ONNX_LOGE("Unknown model type in online CTC!");
return nullptr;
}
return nullptr;
}
#endif

View File

@@ -33,6 +33,26 @@ class OnlineCtcModel {
// Return a list of tensors containing the initial states
virtual std::vector<Ort::Value> GetInitStates() const = 0;
/** Stack a list of individual states into a batch.
*
* It is the inverse operation of `UnStackStates`.
*
* @param states states[i] contains the state for the i-th utterance.
* @return Return a single value representing the batched state.
*/
virtual std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) const = 0;
/** Unstack a batch state into a list of individual states.
*
* It is the inverse operation of `StackStates`.
*
* @param states A batched state.
* @return ans[i] contains the state for the i-th utterance.
*/
virtual std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const = 0;
/**
*
* @param x A 3-D tensor of shape (N, T, C). N has to be 1.
@@ -60,6 +80,9 @@ class OnlineCtcModel {
// ChunkLength() frames, we advance by ChunkShift() frames
// before we process the next chunk.
virtual int32_t ChunkShift() const = 0;
// Return true if the model supports batch size > 1
virtual bool SupportBatchProcessing() const { return true; }
};
} // namespace sherpa_onnx

View File

@@ -14,6 +14,7 @@ void OnlineModelConfig::Register(ParseOptions *po) {
transducer.Register(po);
paraformer.Register(po);
wenet_ctc.Register(po);
zipformer2_ctc.Register(po);
po->Register("tokens", &tokens, "Path to tokens.txt");
@@ -26,10 +27,11 @@ void OnlineModelConfig::Register(ParseOptions *po) {
po->Register("provider", &provider,
"Specify a provider to use: cpu, cuda, coreml");
po->Register("model-type", &model_type,
"Specify it to reduce model initialization time. "
"Valid values are: conformer, lstm, zipformer, zipformer2."
"All other values lead to loading the model twice.");
po->Register(
"model-type", &model_type,
"Specify it to reduce model initialization time. "
"Valid values are: conformer, lstm, zipformer, zipformer2, wenet_ctc"
"All other values lead to loading the model twice.");
}
bool OnlineModelConfig::Validate() const {
@@ -51,6 +53,10 @@ bool OnlineModelConfig::Validate() const {
return wenet_ctc.Validate();
}
if (!zipformer2_ctc.model.empty()) {
return zipformer2_ctc.Validate();
}
return transducer.Validate();
}
@@ -61,6 +67,7 @@ std::string OnlineModelConfig::ToString() const {
os << "transducer=" << transducer.ToString() << ", ";
os << "paraformer=" << paraformer.ToString() << ", ";
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
os << "zipformer2_ctc=" << zipformer2_ctc.ToString() << ", ";
os << "tokens=\"" << tokens << "\", ";
os << "num_threads=" << num_threads << ", ";
os << "debug=" << (debug ? "True" : "False") << ", ";

View File

@@ -9,6 +9,7 @@
#include "sherpa-onnx/csrc/online-paraformer-model-config.h"
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/csrc/online-wenet-ctc-model-config.h"
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
namespace sherpa_onnx {
@@ -16,6 +17,7 @@ struct OnlineModelConfig {
OnlineTransducerModelConfig transducer;
OnlineParaformerModelConfig paraformer;
OnlineWenetCtcModelConfig wenet_ctc;
OnlineZipformer2CtcModelConfig zipformer2_ctc;
std::string tokens;
int32_t num_threads = 1;
bool debug = false;
@@ -25,7 +27,8 @@ struct OnlineModelConfig {
// - conformer, conformer transducer from icefall
// - lstm, lstm transducer from icefall
// - zipformer, zipformer transducer from icefall
// - zipformer2, zipformer2 transducer from icefall
// - zipformer2, zipformer2 transducer or CTC from icefall
// - wenet_ctc, wenet CTC model
//
// All other values are invalid and lead to loading the model twice.
std::string model_type;
@@ -34,11 +37,13 @@ struct OnlineModelConfig {
OnlineModelConfig(const OnlineTransducerModelConfig &transducer,
const OnlineParaformerModelConfig &paraformer,
const OnlineWenetCtcModelConfig &wenet_ctc,
const OnlineZipformer2CtcModelConfig &zipformer2_ctc,
const std::string &tokens, int32_t num_threads, bool debug,
const std::string &provider, const std::string &model_type)
: transducer(transducer),
paraformer(paraformer),
wenet_ctc(wenet_ctc),
zipformer2_ctc(zipformer2_ctc),
tokens(tokens),
num_threads(num_threads),
debug(debug),

View File

@@ -96,8 +96,67 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl {
}
void DecodeStreams(OnlineStream **ss, int32_t n) const override {
if (n == 1 || !model_->SupportBatchProcessing()) {
for (int32_t i = 0; i != n; ++i) {
DecodeStream(ss[i]);
}
return;
}
// batch processing
int32_t chunk_length = model_->ChunkLength();
int32_t chunk_shift = model_->ChunkShift();
int32_t feat_dim = ss[0]->FeatureDim();
std::vector<OnlineCtcDecoderResult> results(n);
std::vector<float> features_vec(n * chunk_length * feat_dim);
std::vector<std::vector<Ort::Value>> states_vec(n);
std::vector<int64_t> all_processed_frames(n);
for (int32_t i = 0; i != n; ++i) {
DecodeStream(ss[i]);
const auto num_processed_frames = ss[i]->GetNumProcessedFrames();
std::vector<float> features =
ss[i]->GetFrames(num_processed_frames, chunk_length);
// Question: should num_processed_frames include chunk_shift?
ss[i]->GetNumProcessedFrames() += chunk_shift;
std::copy(features.begin(), features.end(),
features_vec.data() + i * chunk_length * feat_dim);
results[i] = std::move(ss[i]->GetCtcResult());
states_vec[i] = std::move(ss[i]->GetStates());
all_processed_frames[i] = num_processed_frames;
}
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
std::array<int64_t, 3> x_shape{n, chunk_length, feat_dim};
Ort::Value x = Ort::Value::CreateTensor(memory_info, features_vec.data(),
features_vec.size(), x_shape.data(),
x_shape.size());
auto states = model_->StackStates(std::move(states_vec));
int32_t num_states = states.size();
auto out = model_->Forward(std::move(x), std::move(states));
std::vector<Ort::Value> out_states;
out_states.reserve(num_states);
for (int32_t k = 1; k != num_states + 1; ++k) {
out_states.push_back(std::move(out[k]));
}
std::vector<std::vector<Ort::Value>> next_states =
model_->UnStackStates(std::move(out_states));
decoder_->Decode(std::move(out[0]), &results);
for (int32_t k = 0; k != n; ++k) {
ss[k]->SetCtcResult(results[k]);
ss[k]->SetStates(std::move(next_states[k]));
}
}

View File

@@ -20,7 +20,8 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
return std::make_unique<OnlineRecognizerParaformerImpl>(config);
}
if (!config.model_config.wenet_ctc.model.empty()) {
if (!config.model_config.wenet_ctc.model.empty() ||
!config.model_config.zipformer2_ctc.model.empty()) {
return std::make_unique<OnlineRecognizerCtcImpl>(config);
}
@@ -39,7 +40,8 @@ std::unique_ptr<OnlineRecognizerImpl> OnlineRecognizerImpl::Create(
return std::make_unique<OnlineRecognizerParaformerImpl>(mgr, config);
}
if (!config.model_config.wenet_ctc.model.empty()) {
if (!config.model_config.wenet_ctc.model.empty() ||
!config.model_config.zipformer2_ctc.model.empty()) {
return std::make_unique<OnlineRecognizerCtcImpl>(mgr, config);
}

View File

@@ -1,4 +1,4 @@
// sherpa-onnx/csrc/online-paraformer-model.cc
// sherpa-onnx/csrc/online-wenet-ctc-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
@@ -239,4 +239,21 @@ std::vector<Ort::Value> OnlineWenetCtcModel::GetInitStates() const {
return impl_->GetInitStates();
}
std::vector<Ort::Value> OnlineWenetCtcModel::StackStates(
std::vector<std::vector<Ort::Value>> states) const {
if (states.size() != 1) {
SHERPA_ONNX_LOGE("wenet CTC model supports only batch_size==1. Given: %d",
static_cast<int32_t>(states.size()));
}
return std::move(states[0]);
}
std::vector<std::vector<Ort::Value>> OnlineWenetCtcModel::UnStackStates(
std::vector<Ort::Value> states) const {
std::vector<std::vector<Ort::Value>> ans(1);
ans[0] = std::move(states);
return ans;
}
} // namespace sherpa_onnx

View File

@@ -35,6 +35,12 @@ class OnlineWenetCtcModel : public OnlineCtcModel {
// - offset
std::vector<Ort::Value> GetInitStates() const override;
std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) const override;
std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const override;
/**
*
* @param x A 3-D tensor of shape (N, T, C). N has to be 1.
@@ -63,6 +69,8 @@ class OnlineWenetCtcModel : public OnlineCtcModel {
// before we process the next chunk.
int32_t ChunkShift() const override;
bool SupportBatchProcessing() const override { return false; }
private:
class Impl;
std::unique_ptr<Impl> impl_;

View File

@@ -0,0 +1,41 @@
// sherpa-onnx/csrc/online-zipformer2-ctc-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"
namespace sherpa_onnx {
void OnlineZipformer2CtcModelConfig::Register(ParseOptions *po) {
po->Register("zipformer2-ctc-model", &model,
"Path to CTC model.onnx. See also "
"https://github.com/k2-fsa/icefall/pull/1413");
}
bool OnlineZipformer2CtcModelConfig::Validate() const {
if (model.empty()) {
SHERPA_ONNX_LOGE("--zipformer2-ctc-model is empty!");
return false;
}
if (!FileExists(model)) {
SHERPA_ONNX_LOGE("--zipformer2-ctc-model %s does not exist", model.c_str());
return false;
}
return true;
}
std::string OnlineZipformer2CtcModelConfig::ToString() const {
std::ostringstream os;
os << "OnlineZipformer2CtcModelConfig(";
os << "model=\"" << model << "\")";
return os.str();
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,29 @@
// sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_
#define SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_
#include <string>
#include "sherpa-onnx/csrc/parse-options.h"
namespace sherpa_onnx {
struct OnlineZipformer2CtcModelConfig {
std::string model;
OnlineZipformer2CtcModelConfig() = default;
explicit OnlineZipformer2CtcModelConfig(const std::string &model)
: model(model) {}
void Register(ParseOptions *po);
bool Validate() const;
std::string ToString() const;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_

View File

@@ -0,0 +1,464 @@
// sherpa-onnx/csrc/online-zipformer2-ctc-model.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model.h"
#include <assert.h>
#include <math.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <string>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/cat.h"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#include "sherpa-onnx/csrc/session.h"
#include "sherpa-onnx/csrc/text-utils.h"
#include "sherpa-onnx/csrc/unbind.h"
namespace sherpa_onnx {
class OnlineZipformer2CtcModel::Impl {
public:
explicit Impl(const OnlineModelConfig &config)
: config_(config),
env_(ORT_LOGGING_LEVEL_ERROR),
sess_opts_(GetSessionOptions(config)),
allocator_{} {
{
auto buf = ReadFile(config.zipformer2_ctc.model);
Init(buf.data(), buf.size());
}
}
#if __ANDROID_API__ >= 9
Impl(AAssetManager *mgr, const OnlineModelConfig &config)
: config_(config),
env_(ORT_LOGGING_LEVEL_WARNING),
sess_opts_(GetSessionOptions(config)),
allocator_{} {
{
auto buf = ReadFile(mgr, config.zipformer2_ctc.model);
Init(buf.data(), buf.size());
}
}
#endif
std::vector<Ort::Value> Forward(Ort::Value features,
std::vector<Ort::Value> states) {
std::vector<Ort::Value> inputs;
inputs.reserve(1 + states.size());
inputs.push_back(std::move(features));
for (auto &v : states) {
inputs.push_back(std::move(v));
}
return sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(),
output_names_ptr_.data(), output_names_ptr_.size());
}
int32_t VocabSize() const { return vocab_size_; }
int32_t ChunkLength() const { return T_; }
int32_t ChunkShift() const { return decode_chunk_len_; }
OrtAllocator *Allocator() const { return allocator_; }
// Return a vector containing 3 tensors
// - attn_cache
// - conv_cache
// - offset
std::vector<Ort::Value> GetInitStates() {
std::vector<Ort::Value> ans;
ans.reserve(initial_states_.size());
for (auto &s : initial_states_) {
ans.push_back(View(&s));
}
return ans;
}
std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) const {
int32_t batch_size = static_cast<int32_t>(states.size());
int32_t num_encoders = static_cast<int32_t>(num_encoder_layers_.size());
std::vector<const Ort::Value *> buf(batch_size);
std::vector<Ort::Value> ans;
int32_t num_states = static_cast<int32_t>(states[0].size());
ans.reserve(num_states);
for (int32_t i = 0; i != (num_states - 2) / 6; ++i) {
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i];
}
auto v = Cat(allocator_, buf, 1);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i + 1];
}
auto v = Cat(allocator_, buf, 1);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i + 2];
}
auto v = Cat(allocator_, buf, 1);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i + 3];
}
auto v = Cat(allocator_, buf, 1);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i + 4];
}
auto v = Cat(allocator_, buf, 0);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][6 * i + 5];
}
auto v = Cat(allocator_, buf, 0);
ans.push_back(std::move(v));
}
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][num_states - 2];
}
auto v = Cat(allocator_, buf, 0);
ans.push_back(std::move(v));
}
{
for (int32_t n = 0; n != batch_size; ++n) {
buf[n] = &states[n][num_states - 1];
}
auto v = Cat<int64_t>(allocator_, buf, 0);
ans.push_back(std::move(v));
}
return ans;
}
std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const {
int32_t m = std::accumulate(num_encoder_layers_.begin(),
num_encoder_layers_.end(), 0);
assert(states.size() == m * 6 + 2);
int32_t batch_size = states[0].GetTensorTypeAndShapeInfo().GetShape()[1];
int32_t num_encoders = num_encoder_layers_.size();
std::vector<std::vector<Ort::Value>> ans;
ans.resize(batch_size);
for (int32_t i = 0; i != m; ++i) {
{
auto v = Unbind(allocator_, &states[i * 6], 1);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind(allocator_, &states[i * 6 + 1], 1);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind(allocator_, &states[i * 6 + 2], 1);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind(allocator_, &states[i * 6 + 3], 1);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind(allocator_, &states[i * 6 + 4], 0);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind(allocator_, &states[i * 6 + 5], 0);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
}
{
auto v = Unbind(allocator_, &states[m * 6], 0);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
{
auto v = Unbind<int64_t>(allocator_, &states[m * 6 + 1], 0);
assert(v.size() == batch_size);
for (int32_t n = 0; n != batch_size; ++n) {
ans[n].push_back(std::move(v[n]));
}
}
return ans;
}
private:
void Init(void *model_data, size_t model_data_length) {
sess_ = std::make_unique<Ort::Session>(env_, model_data, model_data_length,
sess_opts_);
GetInputNames(sess_.get(), &input_names_, &input_names_ptr_);
GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_);
// get meta data
Ort::ModelMetadata meta_data = sess_->GetModelMetadata();
if (config_.debug) {
std::ostringstream os;
os << "---zipformer2_ctc---\n";
PrintModelMetadata(os, meta_data);
SHERPA_ONNX_LOGE("%s", os.str().c_str());
}
Ort::AllocatorWithDefaultOptions allocator; // used in the macro below
SHERPA_ONNX_READ_META_DATA_VEC(encoder_dims_, "encoder_dims");
SHERPA_ONNX_READ_META_DATA_VEC(query_head_dims_, "query_head_dims");
SHERPA_ONNX_READ_META_DATA_VEC(value_head_dims_, "value_head_dims");
SHERPA_ONNX_READ_META_DATA_VEC(num_heads_, "num_heads");
SHERPA_ONNX_READ_META_DATA_VEC(num_encoder_layers_, "num_encoder_layers");
SHERPA_ONNX_READ_META_DATA_VEC(cnn_module_kernels_, "cnn_module_kernels");
SHERPA_ONNX_READ_META_DATA_VEC(left_context_len_, "left_context_len");
SHERPA_ONNX_READ_META_DATA(T_, "T");
SHERPA_ONNX_READ_META_DATA(decode_chunk_len_, "decode_chunk_len");
{
auto shape =
sess_->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
vocab_size_ = shape[2];
}
if (config_.debug) {
auto print = [](const std::vector<int32_t> &v, const char *name) {
fprintf(stderr, "%s: ", name);
for (auto i : v) {
fprintf(stderr, "%d ", i);
}
fprintf(stderr, "\n");
};
print(encoder_dims_, "encoder_dims");
print(query_head_dims_, "query_head_dims");
print(value_head_dims_, "value_head_dims");
print(num_heads_, "num_heads");
print(num_encoder_layers_, "num_encoder_layers");
print(cnn_module_kernels_, "cnn_module_kernels");
print(left_context_len_, "left_context_len");
SHERPA_ONNX_LOGE("T: %d", T_);
SHERPA_ONNX_LOGE("decode_chunk_len_: %d", decode_chunk_len_);
SHERPA_ONNX_LOGE("vocab_size_: %d", vocab_size_);
}
InitStates();
}
void InitStates() {
int32_t n = static_cast<int32_t>(encoder_dims_.size());
int32_t m = std::accumulate(num_encoder_layers_.begin(),
num_encoder_layers_.end(), 0);
initial_states_.reserve(m * 6 + 2);
for (int32_t i = 0; i != n; ++i) {
int32_t num_layers = num_encoder_layers_[i];
int32_t key_dim = query_head_dims_[i] * num_heads_[i];
int32_t value_dim = value_head_dims_[i] * num_heads_[i];
int32_t nonlin_attn_head_dim = 3 * encoder_dims_[i] / 4;
for (int32_t j = 0; j != num_layers; ++j) {
{
std::array<int64_t, 3> s{left_context_len_[i], 1, key_dim};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 4> s{1, 1, left_context_len_[i],
nonlin_attn_head_dim};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 3> s{left_context_len_[i], 1, value_dim};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 3> s{left_context_len_[i], 1, value_dim};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 3> s{1, encoder_dims_[i],
cnn_module_kernels_[i] / 2};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 3> s{1, encoder_dims_[i],
cnn_module_kernels_[i] / 2};
auto v =
Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
}
}
{
std::array<int64_t, 4> s{1, 128, 3, 19};
auto v = Ort::Value::CreateTensor<float>(allocator_, s.data(), s.size());
Fill(&v, 0);
initial_states_.push_back(std::move(v));
}
{
std::array<int64_t, 1> s{1};
auto v =
Ort::Value::CreateTensor<int64_t>(allocator_, s.data(), s.size());
Fill<int64_t>(&v, 0);
initial_states_.push_back(std::move(v));
}
}
private:
OnlineModelConfig config_;
Ort::Env env_;
Ort::SessionOptions sess_opts_;
Ort::AllocatorWithDefaultOptions allocator_;
std::unique_ptr<Ort::Session> sess_;
std::vector<std::string> input_names_;
std::vector<const char *> input_names_ptr_;
std::vector<std::string> output_names_;
std::vector<const char *> output_names_ptr_;
std::vector<Ort::Value> initial_states_;
std::vector<int32_t> encoder_dims_;
std::vector<int32_t> query_head_dims_;
std::vector<int32_t> value_head_dims_;
std::vector<int32_t> num_heads_;
std::vector<int32_t> num_encoder_layers_;
std::vector<int32_t> cnn_module_kernels_;
std::vector<int32_t> left_context_len_;
int32_t T_ = 0;
int32_t decode_chunk_len_ = 0;
int32_t vocab_size_ = 0;
};
OnlineZipformer2CtcModel::OnlineZipformer2CtcModel(
const OnlineModelConfig &config)
: impl_(std::make_unique<Impl>(config)) {}
#if __ANDROID_API__ >= 9
OnlineZipformer2CtcModel::OnlineZipformer2CtcModel(
AAssetManager *mgr, const OnlineModelConfig &config)
: impl_(std::make_unique<Impl>(mgr, config)) {}
#endif
OnlineZipformer2CtcModel::~OnlineZipformer2CtcModel() = default;
std::vector<Ort::Value> OnlineZipformer2CtcModel::Forward(
Ort::Value x, std::vector<Ort::Value> states) const {
return impl_->Forward(std::move(x), std::move(states));
}
int32_t OnlineZipformer2CtcModel::VocabSize() const {
return impl_->VocabSize();
}
int32_t OnlineZipformer2CtcModel::ChunkLength() const {
return impl_->ChunkLength();
}
int32_t OnlineZipformer2CtcModel::ChunkShift() const {
return impl_->ChunkShift();
}
OrtAllocator *OnlineZipformer2CtcModel::Allocator() const {
return impl_->Allocator();
}
std::vector<Ort::Value> OnlineZipformer2CtcModel::GetInitStates() const {
return impl_->GetInitStates();
}
std::vector<Ort::Value> OnlineZipformer2CtcModel::StackStates(
std::vector<std::vector<Ort::Value>> states) const {
return impl_->StackStates(std::move(states));
}
std::vector<std::vector<Ort::Value>> OnlineZipformer2CtcModel::UnStackStates(
std::vector<Ort::Value> states) const {
return impl_->UnStackStates(std::move(states));
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,80 @@
// sherpa-onnx/csrc/online-zipformer2-ctc-model.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_H_
#define SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_H_
#include <memory>
#include <utility>
#include <vector>
#if __ANDROID_API__ >= 9
#include "android/asset_manager.h"
#include "android/asset_manager_jni.h"
#endif
#include "onnxruntime_cxx_api.h" // NOLINT
#include "sherpa-onnx/csrc/online-ctc-model.h"
#include "sherpa-onnx/csrc/online-model-config.h"
namespace sherpa_onnx {
class OnlineZipformer2CtcModel : public OnlineCtcModel {
public:
explicit OnlineZipformer2CtcModel(const OnlineModelConfig &config);
#if __ANDROID_API__ >= 9
OnlineZipformer2CtcModel(AAssetManager *mgr, const OnlineModelConfig &config);
#endif
~OnlineZipformer2CtcModel() override;
// A list of tensors.
// See also
// https://github.com/k2-fsa/icefall/pull/1413
// and
// https://github.com/k2-fsa/icefall/pull/1415
std::vector<Ort::Value> GetInitStates() const override;
std::vector<Ort::Value> StackStates(
std::vector<std::vector<Ort::Value>> states) const override;
std::vector<std::vector<Ort::Value>> UnStackStates(
std::vector<Ort::Value> states) const override;
/**
*
* @param x A 3-D tensor of shape (N, T, C). N has to be 1.
* @param states It is from GetInitStates() or returned from this method.
*
* @return Return a list of tensors
* - ans[0] contains log_probs, of shape (N, T, C)
* - ans[1:] contains next_states
*/
std::vector<Ort::Value> Forward(
Ort::Value x, std::vector<Ort::Value> states) const override;
/** Return the vocabulary size of the model
*/
int32_t VocabSize() const override;
/** Return an allocator for allocating memory
*/
OrtAllocator *Allocator() const override;
// The model accepts this number of frames before subsampling as input
int32_t ChunkLength() const override;
// Similar to frame_shift in feature extractor, after processing
// ChunkLength() frames, we advance by ChunkShift() frames
// before we process the next chunk.
int32_t ChunkShift() const override;
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace sherpa_onnx
#endif // SHERPA_ONNX_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_H_

View File

@@ -26,6 +26,8 @@ int main(int32_t argc, char *argv[]) {
const char *kUsageMessage = R"usage(
Usage:
(1) Streaming transducer
./bin/sherpa-onnx \
--tokens=/path/to/tokens.txt \
--encoder=/path/to/encoder.onnx \
@@ -36,6 +38,30 @@ Usage:
--decoding-method=greedy_search \
/path/to/foo.wav [bar.wav foobar.wav ...]
(2) Streaming zipformer2 CTC
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2
./bin/sherpa-onnx \
--debug=1 \
--zipformer2-ctc-model=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
--tokens=./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/tokens.txt \
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000000.wav \
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000001.wav \
./sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13/test_wavs/DEV_T0000000002.wav
(3) Streaming paraformer
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
./bin/sherpa-onnx \
--tokens=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \
--paraformer-encoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx \
--paraformer-decoder=./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx \
./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav
Note: It supports decoding multiple files in batches
Default value for num_threads is 2.

View File

@@ -8,9 +8,6 @@
#include <fstream>
#include <sstream>
#include "sherpa-onnx/csrc/base64-decode.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
#if __ANDROID_API__ >= 9
#include <strstream>
@@ -18,6 +15,9 @@
#include "android/asset_manager_jni.h"
#endif
#include "sherpa-onnx/csrc/base64-decode.h"
#include "sherpa-onnx/csrc/onnx-utils.h"
namespace sherpa_onnx {
SymbolTable::SymbolTable(const std::string &filename) {

View File

@@ -262,22 +262,34 @@ static OnlineRecognizerConfig GetConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(model_config_cls, "paraformer",
"Lcom/k2fsa/sherpa/onnx/OnlineParaformerModelConfig;");
jobject paraformer_config = env->GetObjectField(model_config, fid);
jclass paraformer_config_config_cls = env->GetObjectClass(paraformer_config);
jclass paraformer_config_cls = env->GetObjectClass(paraformer_config);
fid = env->GetFieldID(paraformer_config_config_cls, "encoder",
"Ljava/lang/String;");
fid = env->GetFieldID(paraformer_config_cls, "encoder", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(paraformer_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model_config.paraformer.encoder = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(paraformer_config_config_cls, "decoder",
"Ljava/lang/String;");
fid = env->GetFieldID(paraformer_config_cls, "decoder", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(paraformer_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model_config.paraformer.decoder = p;
env->ReleaseStringUTFChars(s, p);
// streaming zipformer2 CTC
fid =
env->GetFieldID(model_config_cls, "zipformer2Ctc",
"Lcom/k2fsa/sherpa/onnx/OnlineZipformer2CtcModelConfig;");
jobject zipformer2_ctc_config = env->GetObjectField(model_config, fid);
jclass zipformer2_ctc_config_cls = env->GetObjectClass(zipformer2_ctc_config);
fid =
env->GetFieldID(zipformer2_ctc_config_cls, "model", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(zipformer2_ctc_config, fid);
p = env->GetStringUTFChars(s, nullptr);
ans.model_config.zipformer2_ctc.model = p;
env->ReleaseStringUTFChars(s, p);
fid = env->GetFieldID(model_config_cls, "tokens", "Ljava/lang/String;");
s = (jstring)env->GetObjectField(model_config, fid);
p = env->GetStringUTFChars(s, nullptr);

View File

@@ -27,6 +27,7 @@ pybind11_add_module(_sherpa_onnx
online-stream.cc
online-transducer-model-config.cc
online-wenet-ctc-model-config.cc
online-zipformer2-ctc-model-config.cc
sherpa-onnx.cc
silero-vad-model-config.cc
vad-model-config.cc

View File

@@ -58,6 +58,7 @@ void PybindOfflineModelConfig(py::module *m) {
.def_readwrite("debug", &PyClass::debug)
.def_readwrite("provider", &PyClass::provider)
.def_readwrite("model_type", &PyClass::model_type)
.def("validate", &PyClass::Validate)
.def("__str__", &PyClass::ToString);
}

View File

@@ -12,6 +12,7 @@
#include "sherpa-onnx/python/csrc/online-paraformer-model-config.h"
#include "sherpa-onnx/python/csrc/online-transducer-model-config.h"
#include "sherpa-onnx/python/csrc/online-wenet-ctc-model-config.h"
#include "sherpa-onnx/python/csrc/online-zipformer2-ctc-model-config.h"
namespace sherpa_onnx {
@@ -19,26 +20,31 @@ void PybindOnlineModelConfig(py::module *m) {
PybindOnlineTransducerModelConfig(m);
PybindOnlineParaformerModelConfig(m);
PybindOnlineWenetCtcModelConfig(m);
PybindOnlineZipformer2CtcModelConfig(m);
using PyClass = OnlineModelConfig;
py::class_<PyClass>(*m, "OnlineModelConfig")
.def(py::init<const OnlineTransducerModelConfig &,
const OnlineParaformerModelConfig &,
const OnlineWenetCtcModelConfig &, const std::string &,
const OnlineWenetCtcModelConfig &,
const OnlineZipformer2CtcModelConfig &, const std::string &,
int32_t, bool, const std::string &, const std::string &>(),
py::arg("transducer") = OnlineTransducerModelConfig(),
py::arg("paraformer") = OnlineParaformerModelConfig(),
py::arg("wenet_ctc") = OnlineWenetCtcModelConfig(),
py::arg("zipformer2_ctc") = OnlineZipformer2CtcModelConfig(),
py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false,
py::arg("provider") = "cpu", py::arg("model_type") = "")
.def_readwrite("transducer", &PyClass::transducer)
.def_readwrite("paraformer", &PyClass::paraformer)
.def_readwrite("wenet_ctc", &PyClass::wenet_ctc)
.def_readwrite("zipformer2_ctc", &PyClass::zipformer2_ctc)
.def_readwrite("tokens", &PyClass::tokens)
.def_readwrite("num_threads", &PyClass::num_threads)
.def_readwrite("debug", &PyClass::debug)
.def_readwrite("provider", &PyClass::provider)
.def_readwrite("model_type", &PyClass::model_type)
.def("validate", &PyClass::Validate)
.def("__str__", &PyClass::ToString);
}

View File

@@ -0,0 +1,22 @@
// sherpa-onnx/python/csrc/online-zipformer2-ctc-model-config.cc
//
// Copyright (c) 2023 Xiaomi Corporation
#include "sherpa-onnx/python/csrc/online-zipformer2-ctc-model-config.h"
#include <string>
#include <vector>
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
namespace sherpa_onnx {
void PybindOnlineZipformer2CtcModelConfig(py::module *m) {
using PyClass = OnlineZipformer2CtcModelConfig;
py::class_<PyClass>(*m, "OnlineZipformer2CtcModelConfig")
.def(py::init<const std::string &>(), py::arg("model"))
.def_readwrite("model", &PyClass::model)
.def("__str__", &PyClass::ToString);
}
} // namespace sherpa_onnx

View File

@@ -0,0 +1,16 @@
// sherpa-onnx/python/csrc/online-zipformer2-ctc-model-config.h
//
// Copyright (c) 2023 Xiaomi Corporation
#ifndef SHERPA_ONNX_PYTHON_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_
#define SHERPA_ONNX_PYTHON_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_
#include "sherpa-onnx/python/csrc/sherpa-onnx.h"
namespace sherpa_onnx {
void PybindOnlineZipformer2CtcModelConfig(py::module *m);
}
#endif // SHERPA_ONNX_PYTHON_CSRC_ONLINE_ZIPFORMER2_CTC_MODEL_CONFIG_H_

View File

@@ -8,11 +8,14 @@ from _sherpa_onnx import (
OnlineLMConfig,
OnlineModelConfig,
OnlineParaformerModelConfig,
OnlineRecognizer as _Recognizer,
)
from _sherpa_onnx import OnlineRecognizer as _Recognizer
from _sherpa_onnx import (
OnlineRecognizerConfig,
OnlineStream,
OnlineTransducerModelConfig,
OnlineWenetCtcModelConfig,
OnlineZipformer2CtcModelConfig,
)
@@ -272,6 +275,101 @@ class OnlineRecognizer(object):
self.config = recognizer_config
return self
@classmethod
def from_zipformer2_ctc(
cls,
tokens: str,
model: str,
num_threads: int = 2,
sample_rate: float = 16000,
feature_dim: int = 80,
enable_endpoint_detection: bool = False,
rule1_min_trailing_silence: float = 2.4,
rule2_min_trailing_silence: float = 1.2,
rule3_min_utterance_length: float = 20.0,
decoding_method: str = "greedy_search",
provider: str = "cpu",
):
"""
Please refer to
`<https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-ctc/index.html>`_
to download pre-trained models for different languages, e.g., Chinese,
English, etc.
Args:
tokens:
Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two
columns::
symbol integer_id
model:
Path to ``model.onnx``.
num_threads:
Number of threads for neural network computation.
sample_rate:
Sample rate of the training data used to train the model.
feature_dim:
Dimension of the feature used to train the model.
enable_endpoint_detection:
True to enable endpoint detection. False to disable endpoint
detection.
rule1_min_trailing_silence:
Used only when enable_endpoint_detection is True. If the duration
of trailing silence in seconds is larger than this value, we assume
an endpoint is detected.
rule2_min_trailing_silence:
Used only when enable_endpoint_detection is True. If we have decoded
something that is nonsilence and if the duration of trailing silence
in seconds is larger than this value, we assume an endpoint is
detected.
rule3_min_utterance_length:
Used only when enable_endpoint_detection is True. If the utterance
length in seconds is larger than this value, we assume an endpoint
is detected.
decoding_method:
The only valid value is greedy_search.
provider:
onnxruntime execution providers. Valid values are: cpu, cuda, coreml.
"""
self = cls.__new__(cls)
_assert_file_exists(tokens)
_assert_file_exists(model)
assert num_threads > 0, num_threads
zipformer2_ctc_config = OnlineZipformer2CtcModelConfig(model=model)
model_config = OnlineModelConfig(
zipformer2_ctc=zipformer2_ctc_config,
tokens=tokens,
num_threads=num_threads,
provider=provider,
)
feat_config = FeatureExtractorConfig(
sampling_rate=sample_rate,
feature_dim=feature_dim,
)
endpoint_config = EndpointConfig(
rule1_min_trailing_silence=rule1_min_trailing_silence,
rule2_min_trailing_silence=rule2_min_trailing_silence,
rule3_min_utterance_length=rule3_min_utterance_length,
)
recognizer_config = OnlineRecognizerConfig(
feat_config=feat_config,
model_config=model_config,
endpoint_config=endpoint_config,
enable_endpoint=enable_endpoint_detection,
decoding_method=decoding_method,
)
self.recognizer = _Recognizer(recognizer_config)
self.config = recognizer_config
return self
@classmethod
def from_wenet_ctc(
cls,
@@ -352,7 +450,6 @@ class OnlineRecognizer(object):
tokens=tokens,
num_threads=num_threads,
provider=provider,
model_type="wenet_ctc",
)
feat_config = FeatureExtractorConfig(

View File

@@ -143,6 +143,57 @@ class TestOnlineRecognizer(unittest.TestCase):
print(f"{wave_filename}\n{result}")
print("-" * 10)
def test_zipformer2_ctc(self):
m = "sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13"
for use_int8 in [True, False]:
name = (
"ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx"
if use_int8
else "ctc-epoch-20-avg-1-chunk-16-left-128.onnx"
)
model = f"{d}/{m}/{name}"
tokens = f"{d}/{m}/tokens.txt"
wave0 = f"{d}/{m}/test_wavs/DEV_T0000000000.wav"
wave1 = f"{d}/{m}/test_wavs/DEV_T0000000001.wav"
wave2 = f"{d}/{m}/test_wavs/DEV_T0000000002.wav"
if not Path(model).is_file():
print("skipping test_zipformer2_ctc()")
return
print(f"testing {model}")
recognizer = sherpa_onnx.OnlineRecognizer.from_zipformer2_ctc(
model=model,
tokens=tokens,
num_threads=1,
provider="cpu",
)
streams = []
waves = [wave0, wave1, wave2]
for wave in waves:
s = recognizer.create_stream()
samples, sample_rate = read_wave(wave)
s.accept_waveform(sample_rate, samples)
tail_paddings = np.zeros(int(0.2 * sample_rate), dtype=np.float32)
s.accept_waveform(sample_rate, tail_paddings)
s.input_finished()
streams.append(s)
while True:
ready_list = []
for s in streams:
if recognizer.is_ready(s):
ready_list.append(s)
if len(ready_list) == 0:
break
recognizer.decode_streams(ready_list)
results = [recognizer.get_result(s) for s in streams]
for wave_filename, result in zip(waves, results):
print(f"{wave_filename}\n{result}")
print("-" * 10)
def test_wenet_ctc(self):
models = [
"sherpa-onnx-zh-wenet-aishell",