Add config for TensorRT and CUDA execution provider (#992)
Signed-off-by: manickavela1998@gmail.com <manickavela1998@gmail.com> Signed-off-by: manickavela1998@gmail.com <manickavela.arumugam@uniphore.com>
This commit is contained in:
@@ -87,6 +87,7 @@ set(sources
|
||||
packed-sequence.cc
|
||||
pad-sequence.cc
|
||||
parse-options.cc
|
||||
provider-config.cc
|
||||
provider.cc
|
||||
resample.cc
|
||||
session.cc
|
||||
|
||||
@@ -16,6 +16,7 @@ void OnlineModelConfig::Register(ParseOptions *po) {
|
||||
wenet_ctc.Register(po);
|
||||
zipformer2_ctc.Register(po);
|
||||
nemo_ctc.Register(po);
|
||||
provider_config.Register(po);
|
||||
|
||||
po->Register("tokens", &tokens, "Path to tokens.txt");
|
||||
|
||||
@@ -29,9 +30,6 @@ void OnlineModelConfig::Register(ParseOptions *po) {
|
||||
po->Register("debug", &debug,
|
||||
"true to print model information while loading it.");
|
||||
|
||||
po->Register("provider", &provider,
|
||||
"Specify a provider to use: cpu, cuda, coreml");
|
||||
|
||||
po->Register("modeling-unit", &modeling_unit,
|
||||
"The modeling unit of the model, commonly used units are bpe, "
|
||||
"cjkchar, cjkchar+bpe, etc. Currently, it is needed only when "
|
||||
@@ -87,6 +85,10 @@ bool OnlineModelConfig::Validate() const {
|
||||
return nemo_ctc.Validate();
|
||||
}
|
||||
|
||||
if (!provider_config.Validate()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return transducer.Validate();
|
||||
}
|
||||
|
||||
@@ -99,11 +101,11 @@ std::string OnlineModelConfig::ToString() const {
|
||||
os << "wenet_ctc=" << wenet_ctc.ToString() << ", ";
|
||||
os << "zipformer2_ctc=" << zipformer2_ctc.ToString() << ", ";
|
||||
os << "nemo_ctc=" << nemo_ctc.ToString() << ", ";
|
||||
os << "provider_config=" << provider_config.ToString() << ", ";
|
||||
os << "tokens=\"" << tokens << "\", ";
|
||||
os << "num_threads=" << num_threads << ", ";
|
||||
os << "warm_up=" << warm_up << ", ";
|
||||
os << "debug=" << (debug ? "True" : "False") << ", ";
|
||||
os << "provider=\"" << provider << "\", ";
|
||||
os << "model_type=\"" << model_type << "\", ";
|
||||
os << "modeling_unit=\"" << modeling_unit << "\", ";
|
||||
os << "bpe_vocab=\"" << bpe_vocab << "\")";
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "sherpa-onnx/csrc/online-transducer-model-config.h"
|
||||
#include "sherpa-onnx/csrc/online-wenet-ctc-model-config.h"
|
||||
#include "sherpa-onnx/csrc/online-zipformer2-ctc-model-config.h"
|
||||
#include "sherpa-onnx/csrc/provider-config.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
@@ -20,11 +21,11 @@ struct OnlineModelConfig {
|
||||
OnlineWenetCtcModelConfig wenet_ctc;
|
||||
OnlineZipformer2CtcModelConfig zipformer2_ctc;
|
||||
OnlineNeMoCtcModelConfig nemo_ctc;
|
||||
ProviderConfig provider_config;
|
||||
std::string tokens;
|
||||
int32_t num_threads = 1;
|
||||
int32_t warm_up = 0;
|
||||
bool debug = false;
|
||||
std::string provider = "cpu";
|
||||
|
||||
// Valid values:
|
||||
// - conformer, conformer transducer from icefall
|
||||
@@ -50,8 +51,9 @@ struct OnlineModelConfig {
|
||||
const OnlineWenetCtcModelConfig &wenet_ctc,
|
||||
const OnlineZipformer2CtcModelConfig &zipformer2_ctc,
|
||||
const OnlineNeMoCtcModelConfig &nemo_ctc,
|
||||
const ProviderConfig &provider_config,
|
||||
const std::string &tokens, int32_t num_threads,
|
||||
int32_t warm_up, bool debug, const std::string &provider,
|
||||
int32_t warm_up, bool debug,
|
||||
const std::string &model_type,
|
||||
const std::string &modeling_unit,
|
||||
const std::string &bpe_vocab)
|
||||
@@ -60,11 +62,11 @@ struct OnlineModelConfig {
|
||||
wenet_ctc(wenet_ctc),
|
||||
zipformer2_ctc(zipformer2_ctc),
|
||||
nemo_ctc(nemo_ctc),
|
||||
provider_config(provider_config),
|
||||
tokens(tokens),
|
||||
num_threads(num_threads),
|
||||
warm_up(warm_up),
|
||||
debug(debug),
|
||||
provider(provider),
|
||||
model_type(model_type),
|
||||
modeling_unit(modeling_unit),
|
||||
bpe_vocab(bpe_vocab) {}
|
||||
|
||||
143
sherpa-onnx/csrc/provider-config.cc
Normal file
143
sherpa-onnx/csrc/provider-config.cc
Normal file
@@ -0,0 +1,143 @@
|
||||
// sherpa-onnx/csrc/provider-config.cc
|
||||
//
|
||||
// Copyright (c) 2024 Uniphore (Author: Manickavela)
|
||||
|
||||
#include "sherpa-onnx/csrc/provider-config.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "sherpa-onnx/csrc/file-utils.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Registers the command-line option of the CUDA execution provider.
//
// @param po  Option parser; the parsed value of
//            --cuda-cudnn-conv-algo-search is stored in
//            cudnn_conv_algo_search.
void CudaConfig::Register(ParseOptions *po) {
  po->Register("cuda-cudnn-conv-algo-search", &cudnn_conv_algo_search,
               "CuDNN convolution algorithm search");
}
|
||||
|
||||
bool CudaConfig::Validate() const {
|
||||
if (cudnn_conv_algo_search < 1 || cudnn_conv_algo_search > 3) {
|
||||
SHERPA_ONNX_LOGE("cudnn_conv_algo_search: '%d' is not a valid option."
|
||||
"Options : [1,3]. Check OnnxRT docs",
|
||||
cudnn_conv_algo_search);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a one-line description of this config in the form
// CudaConfig(cudnn_conv_algo_search=<value>).
std::string CudaConfig::ToString() const {
  std::ostringstream repr;
  repr << "CudaConfig("
       << "cudnn_conv_algo_search=" << cudnn_conv_algo_search << ")";
  return repr.str();
}
|
||||
|
||||
// Registers every --trt-* command-line option with the given parser.
// Each option is bound to the member of the same name, so parsing the
// command line fills in this struct.
//
// @param po  Option parser that receives the registrations.
void TensorrtConfig::Register(ParseOptions *po) {
  ParseOptions &opts = *po;

  // Memory limit and graph partitioning.
  opts.Register("trt-max-workspace-size", &trt_max_workspace_size,
                "Set TensorRT EP GPU memory usage limit.");
  opts.Register("trt-max-partition-iterations", &trt_max_partition_iterations,
                "Limit partitioning iterations for model conversion.");
  opts.Register("trt-min-subgraph-size", &trt_min_subgraph_size,
                "Set minimum size for subgraphs in partitioning.");

  // Precision and build logging.
  opts.Register("trt-fp16-enable", &trt_fp16_enable,
                "Enable FP16 precision for faster performance.");
  opts.Register("trt-detailed-build-log", &trt_detailed_build_log,
                "Enable detailed logging of build steps.");

  // Engine and timing caches.
  opts.Register("trt-engine-cache-enable", &trt_engine_cache_enable,
                "Enable caching of TensorRT engines.");
  opts.Register("trt-timing-cache-enable", &trt_timing_cache_enable,
                "Enable use of timing cache to speed up builds.");
  opts.Register("trt-engine-cache-path", &trt_engine_cache_path,
                "Set path to store cached TensorRT engines.");
  opts.Register("trt-timing-cache-path", &trt_timing_cache_path,
                "Set path for storing timing cache.");

  // Debugging aid.
  opts.Register("trt-dump-subgraphs", &trt_dump_subgraphs,
                "Dump optimized subgraphs for debugging.");
}
|
||||
|
||||
bool TensorrtConfig::Validate() const {
|
||||
if (trt_max_workspace_size < 0) {
|
||||
SHERPA_ONNX_LOGE("trt_max_workspace_size: %d is not valid.",
|
||||
trt_max_workspace_size);
|
||||
return false;
|
||||
}
|
||||
if (trt_max_partition_iterations < 0) {
|
||||
SHERPA_ONNX_LOGE("trt_max_partition_iterations: %d is not valid.",
|
||||
trt_max_partition_iterations);
|
||||
return false;
|
||||
}
|
||||
if (trt_min_subgraph_size < 0) {
|
||||
SHERPA_ONNX_LOGE("trt_min_subgraph_size: %d is not valid.",
|
||||
trt_min_subgraph_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string TensorrtConfig::ToString() const {
|
||||
std::ostringstream os;
|
||||
|
||||
os << "TensorrtConfig(";
|
||||
os << "trt_max_workspace_size=" << trt_max_workspace_size << ", ";
|
||||
os << "trt_max_partition_iterations="
|
||||
<< trt_max_partition_iterations << ", ";
|
||||
os << "trt_min_subgraph_size=" << trt_min_subgraph_size << ", ";
|
||||
os << "trt_fp16_enable=\""
|
||||
<< (trt_fp16_enable? "True" : "False") << "\", ";
|
||||
os << "trt_detailed_build_log=\""
|
||||
<< (trt_detailed_build_log? "True" : "False") << "\", ";
|
||||
os << "trt_engine_cache_enable=\""
|
||||
<< (trt_engine_cache_enable? "True" : "False") << "\", ";
|
||||
os << "trt_engine_cache_path=\""
|
||||
<< trt_engine_cache_path.c_str() << "\", ";
|
||||
os << "trt_timing_cache_enable=\""
|
||||
<< (trt_timing_cache_enable? "True" : "False") << "\", ";
|
||||
os << "trt_timing_cache_path=\""
|
||||
<< trt_timing_cache_path.c_str() << "\",";
|
||||
os << "trt_dump_subgraphs=\""
|
||||
<< (trt_dump_subgraphs? "True" : "False") << "\" )";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// Registers the provider-related command-line options: first the options
// of the nested CUDA and TensorRT configs, then --device and --provider.
//
// @param po  Option parser that receives the registrations.
void ProviderConfig::Register(ParseOptions *po) {
  cuda_config.Register(po);
  trt_config.Register(po);

  po->Register("device", &device, "GPU device index for CUDA and Trt EP");
  // "trt" is listed because Validate() treats it as the provider name
  // that selects trt_config.
  po->Register("provider", &provider,
               "Specify a provider to use: cpu, cuda, coreml, trt");
}
|
||||
|
||||
bool ProviderConfig::Validate() const {
|
||||
if (device < 0) {
|
||||
SHERPA_ONNX_LOGE("device: '%d' is invalid.", device);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (provider == "cuda" && !cuda_config.Validate()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (provider == "trt" && !trt_config.Validate()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a one-line description of this config, including the string
// forms of the nested CUDA and TensorRT configs.
std::string ProviderConfig::ToString() const {
  std::ostringstream repr;

  repr << "ProviderConfig("
       << "device=" << device << ", "
       << "provider=\"" << provider << "\", "
       << "cuda_config=" << cuda_config.ToString() << ", "
       << "trt_config=" << trt_config.ToString() << ")";

  return repr.str();
}
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
95
sherpa-onnx/csrc/provider-config.h
Normal file
95
sherpa-onnx/csrc/provider-config.h
Normal file
@@ -0,0 +1,95 @@
|
||||
// sherpa-onnx/csrc/provider-config.h
|
||||
//
|
||||
// Copyright (c) 2024 Uniphore (Author: Manickavela)
|
||||
|
||||
#ifndef SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
|
||||
#define SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/parse-options.h"
|
||||
#include "sherpa-onnx/csrc/macros.h"
|
||||
#include "onnxruntime_cxx_api.h" // NOLINT
|
||||
|
||||
namespace sherpa_onnx {
|
||||
|
||||
struct CudaConfig {
|
||||
int32_t cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
|
||||
|
||||
CudaConfig() = default;
|
||||
explicit CudaConfig(int32_t cudnn_conv_algo_search)
|
||||
: cudnn_conv_algo_search(cudnn_conv_algo_search) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
struct TensorrtConfig {
|
||||
int32_t trt_max_workspace_size = 2147483647;
|
||||
int32_t trt_max_partition_iterations = 10;
|
||||
int32_t trt_min_subgraph_size = 5;
|
||||
bool trt_fp16_enable = true;
|
||||
bool trt_detailed_build_log = false;
|
||||
bool trt_engine_cache_enable = true;
|
||||
bool trt_timing_cache_enable = true;
|
||||
std::string trt_engine_cache_path = ".";
|
||||
std::string trt_timing_cache_path = ".";
|
||||
bool trt_dump_subgraphs = false;
|
||||
|
||||
TensorrtConfig() = default;
|
||||
TensorrtConfig(int32_t trt_max_workspace_size,
|
||||
int32_t trt_max_partition_iterations,
|
||||
int32_t trt_min_subgraph_size,
|
||||
bool trt_fp16_enable,
|
||||
bool trt_detailed_build_log,
|
||||
bool trt_engine_cache_enable,
|
||||
bool trt_timing_cache_enable,
|
||||
const std::string &trt_engine_cache_path,
|
||||
const std::string &trt_timing_cache_path,
|
||||
bool trt_dump_subgraphs)
|
||||
: trt_max_workspace_size(trt_max_workspace_size),
|
||||
trt_max_partition_iterations(trt_max_partition_iterations),
|
||||
trt_min_subgraph_size(trt_min_subgraph_size),
|
||||
trt_fp16_enable(trt_fp16_enable),
|
||||
trt_detailed_build_log(trt_detailed_build_log),
|
||||
trt_engine_cache_enable(trt_engine_cache_enable),
|
||||
trt_timing_cache_enable(trt_timing_cache_enable),
|
||||
trt_engine_cache_path(trt_engine_cache_path),
|
||||
trt_timing_cache_path(trt_timing_cache_path),
|
||||
trt_dump_subgraphs(trt_dump_subgraphs) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
struct ProviderConfig {
|
||||
TensorrtConfig trt_config;
|
||||
CudaConfig cuda_config;
|
||||
std::string provider = "cpu";
|
||||
int32_t device = 0;
|
||||
// device only used for cuda and trt
|
||||
|
||||
ProviderConfig() = default;
|
||||
ProviderConfig(const std::string &provider,
|
||||
int32_t device)
|
||||
: provider(provider), device(device) {}
|
||||
ProviderConfig(const TensorrtConfig &trt_config,
|
||||
const CudaConfig &cuda_config,
|
||||
const std::string &provider,
|
||||
int32_t device)
|
||||
: trt_config(trt_config), cuda_config(cuda_config),
|
||||
provider(provider), device(device) {}
|
||||
|
||||
void Register(ParseOptions *po);
|
||||
bool Validate() const;
|
||||
|
||||
std::string ToString() const;
|
||||
};
|
||||
|
||||
} // namespace sherpa_onnx
|
||||
|
||||
#endif // SHERPA_ONNX_CSRC_PROVIDER_CONFIG_H_
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "sherpa-onnx/csrc/provider-config.h"
|
||||
namespace sherpa_onnx {
|
||||
|
||||
// Please refer to
|
||||
|
||||
@@ -32,11 +32,13 @@ static void OrtStatusFailure(OrtStatus *status, const char *s) {
|
||||
}
|
||||
|
||||
static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
|
||||
std::string provider_str) {
|
||||
Provider p = StringToProvider(std::move(provider_str));
|
||||
const std::string &provider_str,
|
||||
const ProviderConfig *provider_config = nullptr) {
|
||||
Provider p = StringToProvider(provider_str);
|
||||
|
||||
Ort::SessionOptions sess_opts;
|
||||
sess_opts.SetIntraOpNumThreads(num_threads);
|
||||
|
||||
sess_opts.SetInterOpNumThreads(num_threads);
|
||||
|
||||
std::vector<std::string> available_providers = Ort::GetAvailableProviders();
|
||||
@@ -64,26 +66,51 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
|
||||
break;
|
||||
}
|
||||
case Provider::kTRT: {
|
||||
if (provider_config == nullptr) {
|
||||
SHERPA_ONNX_LOGE("Tensorrt support for Online models ony,"
|
||||
"Must be extended for offline and others");
|
||||
exit(1);
|
||||
}
|
||||
auto trt_config = provider_config->trt_config;
|
||||
struct TrtPairs {
|
||||
const char *op_keys;
|
||||
const char *op_values;
|
||||
};
|
||||
|
||||
auto device_id = std::to_string(provider_config->device);
|
||||
auto trt_max_workspace_size =
|
||||
std::to_string(trt_config.trt_max_workspace_size);
|
||||
auto trt_max_partition_iterations =
|
||||
std::to_string(trt_config.trt_max_partition_iterations);
|
||||
auto trt_min_subgraph_size =
|
||||
std::to_string(trt_config.trt_min_subgraph_size);
|
||||
auto trt_fp16_enable =
|
||||
std::to_string(trt_config.trt_fp16_enable);
|
||||
auto trt_detailed_build_log =
|
||||
std::to_string(trt_config.trt_detailed_build_log);
|
||||
auto trt_engine_cache_enable =
|
||||
std::to_string(trt_config.trt_engine_cache_enable);
|
||||
auto trt_timing_cache_enable =
|
||||
std::to_string(trt_config.trt_timing_cache_enable);
|
||||
auto trt_dump_subgraphs =
|
||||
std::to_string(trt_config.trt_dump_subgraphs);
|
||||
|
||||
std::vector<TrtPairs> trt_options = {
|
||||
{"device_id", "0"},
|
||||
{"trt_max_workspace_size", "2147483648"},
|
||||
{"trt_max_partition_iterations", "10"},
|
||||
{"trt_min_subgraph_size", "5"},
|
||||
{"trt_fp16_enable", "0"},
|
||||
{"trt_detailed_build_log", "0"},
|
||||
{"trt_engine_cache_enable", "1"},
|
||||
{"trt_engine_cache_path", "."},
|
||||
{"trt_timing_cache_enable", "1"},
|
||||
{"trt_timing_cache_path", "."}};
|
||||
{"device_id", device_id.c_str()},
|
||||
{"trt_max_workspace_size", trt_max_workspace_size.c_str()},
|
||||
{"trt_max_partition_iterations", trt_max_partition_iterations.c_str()},
|
||||
{"trt_min_subgraph_size", trt_min_subgraph_size.c_str()},
|
||||
{"trt_fp16_enable", trt_fp16_enable.c_str()},
|
||||
{"trt_detailed_build_log", trt_detailed_build_log.c_str()},
|
||||
{"trt_engine_cache_enable", trt_engine_cache_enable.c_str()},
|
||||
{"trt_engine_cache_path", trt_config.trt_engine_cache_path.c_str()},
|
||||
{"trt_timing_cache_enable", trt_timing_cache_enable.c_str()},
|
||||
{"trt_timing_cache_path", trt_config.trt_timing_cache_path.c_str()},
|
||||
{"trt_dump_subgraphs", trt_dump_subgraphs.c_str()}
|
||||
};
|
||||
// ToDo : Trt configs
|
||||
// "trt_int8_enable"
|
||||
// "trt_int8_use_native_calibration_table"
|
||||
// "trt_dump_subgraphs"
|
||||
|
||||
std::vector<const char *> option_keys, option_values;
|
||||
for (const TrtPairs &pair : trt_options) {
|
||||
@@ -122,10 +149,18 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
|
||||
"CUDAExecutionProvider") != available_providers.end()) {
|
||||
// The CUDA provider is available, proceed with setting the options
|
||||
OrtCUDAProviderOptions options;
|
||||
options.device_id = 0;
|
||||
// Default OrtCudnnConvAlgoSearchExhaustive is extremely slow
|
||||
options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
|
||||
// set more options on need
|
||||
|
||||
if (provider_config != nullptr) {
|
||||
options.device_id = provider_config->device;
|
||||
options.cudnn_conv_algo_search =
|
||||
OrtCudnnConvAlgoSearch(provider_config->cuda_config
|
||||
.cudnn_conv_algo_search);
|
||||
} else {
|
||||
options.device_id = 0;
|
||||
// Default OrtCudnnConvAlgoSearchExhaustive is extremely slow
|
||||
options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchHeuristic;
|
||||
// set more options on need
|
||||
}
|
||||
sess_opts.AppendExecutionProvider_CUDA(options);
|
||||
} else {
|
||||
SHERPA_ONNX_LOGE(
|
||||
@@ -184,7 +219,8 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
|
||||
}
|
||||
|
||||
Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config) {
|
||||
return GetSessionOptionsImpl(config.num_threads, config.provider);
|
||||
return GetSessionOptionsImpl(config.num_threads,
|
||||
config.provider_config.provider, &config.provider_config);
|
||||
}
|
||||
|
||||
Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config) {
|
||||
|
||||
Reference in New Issue
Block a user