// sherpa-onnx/csrc/online-wenet-ctc-model.cc // // Copyright (c) 2023 Xiaomi Corporation #include "sherpa-onnx/csrc/online-wenet-ctc-model.h" #include #include #include #if __ANDROID_API__ >= 9 #include "android/asset_manager.h" #include "android/asset_manager_jni.h" #endif #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/session.h" #include "sherpa-onnx/csrc/text-utils.h" namespace sherpa_onnx { class OnlineWenetCtcModel::Impl { public: explicit Impl(const OnlineModelConfig &config) : config_(config), env_(ORT_LOGGING_LEVEL_ERROR), sess_opts_(GetSessionOptions(config)), allocator_{} { { auto buf = ReadFile(config.wenet_ctc.model); Init(buf.data(), buf.size()); } } #if __ANDROID_API__ >= 9 Impl(AAssetManager *mgr, const OnlineModelConfig &config) : config_(config), env_(ORT_LOGGING_LEVEL_WARNING), sess_opts_(GetSessionOptions(config)), allocator_{} { { auto buf = ReadFile(mgr, config.wenet_ctc.model); Init(buf.data(), buf.size()); } } #endif std::vector Forward(Ort::Value x, std::vector states) { Ort::Value &attn_cache = states[0]; Ort::Value &conv_cache = states[1]; Ort::Value &offset = states[2]; int32_t chunk_size = config_.wenet_ctc.chunk_size; int32_t left_chunks = config_.wenet_ctc.num_left_chunks; // build attn_mask std::array attn_mask_shape{1, 1, required_cache_size_ + chunk_size}; Ort::Value attn_mask = Ort::Value::CreateTensor( allocator_, attn_mask_shape.data(), attn_mask_shape.size()); bool *p = attn_mask.GetTensorMutableData(); int32_t chunk_idx = offset.GetTensorData()[0] / chunk_size - left_chunks; if (chunk_idx < left_chunks) { std::fill(p, p + required_cache_size_ - chunk_idx * chunk_size, 0); std::fill(p + required_cache_size_ - chunk_idx * chunk_size, p + attn_mask_shape[2], 1); } else { std::fill(p, p + attn_mask_shape[2], 1); } std::array inputs = {std::move(x), View(&offset), View(&required_cache_size_tensor_), std::move(attn_cache), std::move(conv_cache), std::move(attn_mask)}; auto out = sess_->Run({}, input_names_ptr_.data(), inputs.data(), inputs.size(), output_names_ptr_.data(), output_names_ptr_.size()); offset.GetTensorMutableData()[0] += out[0].GetTensorTypeAndShapeInfo().GetShape()[1]; out.push_back(std::move(offset)); return out; } int32_t VocabSize() const { return vocab_size_; } int32_t ChunkLength() const { // When chunk_size is 16, subsampling_factor_ is 4, right_context_ is 6, // the returned value is (16 - 1)*4 + 6 + 1 = 67 return (config_.wenet_ctc.chunk_size - 1) * subsampling_factor_ + right_context_ + 1; } int32_t ChunkShift() const { return config_.wenet_ctc.chunk_size * subsampling_factor_; } OrtAllocator *Allocator() const { return allocator_; } // Return a vector containing 3 tensors // - attn_cache // - conv_cache // - offset std::vector GetInitStates() { std::vector ans; ans.reserve(3); ans.push_back(View(&attn_cache_)); ans.push_back(View(&conv_cache_)); int64_t offset_shape = 1; Ort::Value offset = Ort::Value::CreateTensor(allocator_, &offset_shape, 1); offset.GetTensorMutableData()[0] = required_cache_size_; ans.push_back(std::move(offset)); return ans; } private: void Init(void *model_data, size_t model_data_length) { sess_ = std::make_unique(env_, model_data, model_data_length, sess_opts_); GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); // get meta data Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); if (config_.debug) { std::ostringstream os; PrintModelMetadata(os, meta_data); SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); } Ort::AllocatorWithDefaultOptions allocator; // used in the macro below SHERPA_ONNX_READ_META_DATA(head_, "head"); SHERPA_ONNX_READ_META_DATA(num_blocks_, "num_blocks"); SHERPA_ONNX_READ_META_DATA(output_size_, "output_size"); SHERPA_ONNX_READ_META_DATA(cnn_module_kernel_, "cnn_module_kernel"); SHERPA_ONNX_READ_META_DATA(right_context_, "right_context"); SHERPA_ONNX_READ_META_DATA(subsampling_factor_, "subsampling_factor"); SHERPA_ONNX_READ_META_DATA(vocab_size_, "vocab_size"); required_cache_size_ = config_.wenet_ctc.chunk_size * config_.wenet_ctc.num_left_chunks; InitStates(); } void InitStates() { std::array attn_cache_shape{ num_blocks_, head_, required_cache_size_, output_size_ / head_ * 2}; attn_cache_ = Ort::Value::CreateTensor( allocator_, attn_cache_shape.data(), attn_cache_shape.size()); Fill(&attn_cache_, 0); std::array conv_cache_shape{num_blocks_, 1, output_size_, cnn_module_kernel_ - 1}; conv_cache_ = Ort::Value::CreateTensor( allocator_, conv_cache_shape.data(), conv_cache_shape.size()); Fill(&conv_cache_, 0); int64_t shape = 1; required_cache_size_tensor_ = Ort::Value::CreateTensor(allocator_, &shape, 1); required_cache_size_tensor_.GetTensorMutableData()[0] = required_cache_size_; } private: OnlineModelConfig config_; Ort::Env env_; Ort::SessionOptions sess_opts_; Ort::AllocatorWithDefaultOptions allocator_; std::unique_ptr sess_; std::vector input_names_; std::vector input_names_ptr_; std::vector output_names_; std::vector output_names_ptr_; int32_t head_ = 0; int32_t num_blocks_ = 0; int32_t output_size_ = 0; int32_t cnn_module_kernel_ = 0; int32_t right_context_ = 0; int32_t subsampling_factor_ = 0; int32_t vocab_size_ = 0; int32_t required_cache_size_ = 0; Ort::Value attn_cache_{nullptr}; Ort::Value conv_cache_{nullptr}; Ort::Value required_cache_size_tensor_{nullptr}; }; OnlineWenetCtcModel::OnlineWenetCtcModel(const OnlineModelConfig &config) : impl_(std::make_unique(config)) {} #if __ANDROID_API__ >= 9 OnlineWenetCtcModel::OnlineWenetCtcModel(AAssetManager *mgr, const OnlineModelConfig &config) : impl_(std::make_unique(mgr, config)) {} #endif OnlineWenetCtcModel::~OnlineWenetCtcModel() = default; std::vector OnlineWenetCtcModel::Forward( Ort::Value x, std::vector states) const { return impl_->Forward(std::move(x), std::move(states)); } int32_t OnlineWenetCtcModel::VocabSize() const { return impl_->VocabSize(); } int32_t OnlineWenetCtcModel::ChunkLength() const { return impl_->ChunkLength(); } int32_t OnlineWenetCtcModel::ChunkShift() const { return impl_->ChunkShift(); } OrtAllocator *OnlineWenetCtcModel::Allocator() const { return impl_->Allocator(); } std::vector OnlineWenetCtcModel::GetInitStates() const { return impl_->GetInitStates(); } std::vector OnlineWenetCtcModel::StackStates( std::vector> states) const { if (states.size() != 1) { SHERPA_ONNX_LOGE("wenet CTC model supports only batch_size==1. Given: %d", static_cast(states.size())); } return std::move(states[0]); } std::vector> OnlineWenetCtcModel::UnStackStates( std::vector states) const { std::vector> ans(1); ans[0] = std::move(states); return ans; } } // namespace sherpa_onnx