/** * Copyright (c) 2024 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ /*! * \file fallback.h * \brief */ #ifndef ACLNNFALLBACK_OPAPI_H_ #define ACLNNFALLBACK_OPAPI_H_ #include #include #include #include #include #include "aclnn/aclnn_base.h" #include "fallback_comm.h" #include "error/ops_error.h" #include "runtime/base.h" namespace fallback { using namespace std; using namespace gert; using namespace ge; using namespace std; namespace std_utils { template struct index_sequence {}; template struct make_index_sequence_helper : make_index_sequence_helper {}; template struct make_index_sequence_helper<0, Is...> { using type = index_sequence; }; template using make_index_sequence = typename make_index_sequence_helper::type; } using aclOpExecutor = struct aclOpExecutor; using aclTensor = struct aclTensor; using aclScalar = struct aclScalar; using aclIntArray = struct aclIntArray; using aclFloatArray = struct aclFloatArray; using aclBoolArray = struct aclBoolArray; using aclTensorList = struct aclTensorList; using _aclCreateTensor = aclTensor* (*)(const int64_t* view_dims, uint64_t view_dims_num, aclDataType data_type, const int64_t* stride, int64_t offset, aclFormat format, const int64_t* storage_dims, uint64_t storage_dims_num, void* tensor_data); using _aclCreateScalar = aclScalar* (*)(void* value, aclDataType data_type); using _aclCreateIntArray = aclIntArray* (*)(const int64_t* value, uint64_t size); using _aclCreateFloatArray = aclFloatArray* (*)(const float* value, uint64_t size); using _aclCreateBoolArray = aclBoolArray* (*)(const bool* value, uint64_t size); using _aclCreateTensorList = aclTensorList* (*)(const aclTensor* const* value, uint64_t size); using _aclDestroyTensor = int (*)(const aclTensor* tensor); using _aclDestroyScalar = int (*)(const aclScalar* scalar); using _aclDestroyIntArray = int (*)(const aclIntArray* array); using _aclDestroyFloatArray = int (*)(const aclFloatArray* array); using _aclDestroyBoolArray = int (*)(const aclBoolArray* array); using _aclDestroyTensorList = int (*)(const aclTensorList* array); #define GET_OP_API_FUNC(apiName) reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName)) inline const char* GetOpApiLibName(void) { return "libopapi.so"; } inline const char* GetCustOpApiLibName(void) { return "libcust_opapi.so"; } inline void* GetOpApiFuncAddrInLib(void* handler, const char* libName, const char* apiName) { auto funcAddr = dlsym(handler, apiName); if (funcAddr == nullptr) { OPS_LOG_W("aclnnfallback", "dlsym %s from %s failed, error:%s.", apiName, libName, dlerror()); } return funcAddr; } inline void* GetOpApiLibHandler(const char* libName) { auto handler = dlopen(libName, RTLD_LAZY); if (handler == nullptr) { OPS_LOG_W("aclnnfallback", "dlopen %s failed, error:%s.", libName, dlerror()); } return handler; } inline void* GetAclnnArrdByApiName(const char *apiName) { vector libs = {"libaclnn_ops_infer.so", "libaclnn_ops_train.so", "libaclnn_math.so", "libaclnn_rand.so", "libaclnn_sparse.so", "libaclnn_fft.so"}; for (const auto &libName : libs) { static auto libHandler = GetOpApiLibHandler(libName.c_str()); if (libHandler != nullptr) { auto funcAddr = GetOpApiFuncAddrInLib(libHandler, libName.c_str(), apiName); if (funcAddr != nullptr) { return funcAddr; } } } OPS_LOG_E("aclnnfallback", "api %s can't find in any aclnn lib.", apiName); return nullptr; } inline void* GetOpApiFuncAddr(const char* apiName) { static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName()); if (custOpApiHandler != nullptr) { auto funcAddr = GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName); if (funcAddr != nullptr) { return funcAddr; } } static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName()); if (opApiHandler != nullptr) { auto funcAddr = GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); if (funcAddr != nullptr) { return funcAddr; } } OPS_LOG_D("aclnnfallback", "opapi lib is not exist,will use aclnn lib."); return GetAclnnArrdByApiName(apiName); } inline aclTensor* ConvertType(aclTensor* ge_tensor) { return ge_tensor; } inline aclIntArray* ConvertType(const std::vector &arr) { if (arr.empty()) { return nullptr; } static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray); auto array = aclCreateIntArray(arr.data(), arr.size()); return array; } inline aclDataType GetConvertType(const gert::Tensor* ge_tensor) { // convert data type auto dataType_ge = ge_tensor->GetDataType(); auto dataType = aclDataType::ACL_FLOAT16; if (dataType_ge == DT_FLOAT) { dataType = aclDataType::ACL_FLOAT; } else if (dataType_ge == DT_BF16) { dataType = aclDataType::ACL_BF16; } else if (dataType_ge == DT_BOOL) { dataType = aclDataType::ACL_BOOL; } else if (dataType_ge == DT_INT64) { dataType = aclDataType::ACL_INT64; } else if (dataType_ge == DT_INT32) { dataType = aclDataType::ACL_INT32; } else if (dataType_ge == DT_UINT64) { dataType = aclDataType::ACL_UINT64; } else if (dataType_ge == DT_UINT32) { dataType = aclDataType::ACL_UINT32; } else if (dataType_ge == DT_INT8) { dataType = aclDataType::ACL_INT8; } else if (dataType_ge == DT_UINT8) { dataType = aclDataType::ACL_UINT8; } else if (dataType_ge == DT_INT4) { dataType = aclDataType::ACL_INT4; } else { dataType = aclDataType::ACL_FLOAT16; } return dataType; } inline aclTensor* ConvertType(const gert::Tensor* ge_tensor) { if (ge_tensor == nullptr) { return nullptr; } static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); OPS_ERR_IF(aclCreateTensor == nullptr, OPS_LOG_E("aclnnfallback", "aclCreateTensor nullptr"), return nullptr); void* device_addr = nullptr; auto tensor_place = ge_tensor->GetPlacement(); device_addr = const_cast(ge_tensor->GetAddr()); auto dataType = GetConvertType(ge_tensor); OPS_LOG_D("aclnnfallback", "aclCreateTensor: tensor type is %d", dataType); // convert shape auto gert_shape = ge_tensor->GetStorageShape(); std::vector shape; for (size_t i = 0; i < gert_shape.GetDimNum(); ++i) { shape.push_back(gert_shape.GetDim(i)); } // 计算连续tensor的strides std::vector strides(shape.size(), 1); for (int64_t i = shape.size() - 2; i >= 0; i--) { strides[i] = shape[i + 1] * strides[i + 1]; } aclTensor* out = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND, shape.data(), shape.size(), device_addr); OPS_ERR_IF(out == nullptr, OPS_LOG_E("aclnnfallback", "out nullptr"), return nullptr); return out; } inline aclTensorList* ConvertType(std::vector& ge_tenserList) { OPS_ERR_IF(ge_tenserList.size() == 0, OPS_LOG_E("aclnnfallback", "ge_tenserList size 0"), return nullptr); static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList); OPS_ERR_IF(aclCreateTensorList == nullptr, OPS_LOG_E("aclnnfallback", "ge_tenserList size 0"), return nullptr); std::vector tmp; for (size_t i = 0; i < ge_tenserList.size(); i++) { auto t_acl = ConvertType(ge_tenserList[i]); tmp.push_back(t_acl); } aclTensorList* tensorList = aclCreateTensorList(tmp.data(), tmp.size()); return tensorList; } template inline aclScalar* ConvertScalarType(T value) { static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar); OPS_ERR_IF(aclCreateScalar == nullptr, OPS_LOG_E("aclnnfallback", "aclCreateScalar nullptr"), return nullptr); if (typeid(value) == typeid(float)) { return aclCreateScalar(&value, aclDataType::ACL_FLOAT); } return nullptr; } template T ConvertType(T value) { return value; } inline aclTensor* ConvertMmType(const gert::Tensor* ge_tensor, bool transpose, bool enable_NZ=false) { if (ge_tensor == nullptr) { return nullptr; } auto gert_shape = ge_tensor->GetStorageShape(); if (gert_shape.GetDimNum() <= 1) { return ConvertType(ge_tensor); } static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); OPS_ERR_IF(aclCreateTensor == nullptr, OPS_LOG_E("aclnnfallback", "aclCreateTensor nullptr"), return nullptr); void* device_addr = const_cast(ge_tensor->GetAddr()); // convert data type auto dataType_ge = ge_tensor->GetDataType(); auto dataType = ToAclDataType(dataType_ge); // convert shape std::vector shape; for (size_t i = 0; i < gert_shape.GetDimNum(); ++i) { shape.push_back(gert_shape.GetDim(i)); } // 计算连续tensor的strides std::vector strides(shape.size(), 1); for (int64_t i = shape.size() - 2; i >= 0; i--) { strides[i] = shape[i + 1] * strides[i + 1]; } auto viewShape = shape; // 对于transpose后的tensor对后两维度进行strides, viewShape转换 if (transpose) { // dimM 为倒数第二维, dimN 为倒数第一维度 auto dimM = shape.size() - 2; auto dimN = shape.size() - 1; auto swap = strides[dimN]; strides[dimN] = strides[dimM]; strides[dimM] = swap; // 修改viewShape viewShape[dimN] = shape[dimM]; viewShape[dimM] = shape[dimN]; } auto acl_format = aclFormat::ACL_FORMAT_ND; if (enable_NZ && GetPrimaryFormat(ge_tensor->GetStorageFormat()) == ge::Format::FORMAT_FRACTAL_NZ) { acl_format = aclFormat::ACL_FORMAT_FRACTAL_NZ; } aclTensor* out = aclCreateTensor(viewShape.data(), shape.size(), dataType, strides.data(), 0, acl_format, shape.data(), shape.size(), device_addr); OPS_ERR_IF(out == nullptr, OPS_LOG_E("aclnnfallback", "out nullptr"), return nullptr); return out; } inline void Release(aclTensor* p) { static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); OPS_ERR_IF(aclDestroyTensor == nullptr, OPS_LOG_E("aclnnfallback", "aclDestroyTensor is null"), return); aclDestroyTensor(p); } inline void Release(aclScalar* p) { static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar); OPS_ERR_IF(aclDestroyScalar == nullptr, OPS_LOG_E("aclnnfallback", "aclDestroyScalar is null"), return); aclDestroyScalar(p); } inline void Release(aclIntArray* p) { static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray); OPS_ERR_IF(aclDestroyIntArray == nullptr, OPS_LOG_E("aclnnfallback", "aclDestroyIntArray is null"), return); aclDestroyIntArray(p); } inline void Release(aclBoolArray* p) { static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray); OPS_ERR_IF(aclDestroyBoolArray == nullptr, OPS_LOG_E("aclnnfallback", "aclDestroyBoolArray is null"), return); aclDestroyBoolArray(p); } inline void Release(aclTensorList* p) { static const auto aclDestroyTensorList = GET_OP_API_FUNC(aclDestroyTensorList); OPS_ERR_IF(aclDestroyTensorList == nullptr, OPS_LOG_E("aclnnfallback", "aclDestroyTensorList is null"), return); aclDestroyTensorList(p); } template void Release(T value) { (void)value; } template void CallRelease(Tuple t, std_utils::index_sequence) { (void)std::initializer_list{(Release(std::get(t)), 0)...}; } template void ReleaseConvertTypes(Tuple& t) { static constexpr auto size = std::tuple_size::value; CallRelease(t, std_utils::make_index_sequence{}); } template auto ConvertTypes(Ts&... args) -> decltype(std::make_tuple(ConvertType(args)...)) { auto tp = std::make_tuple(ConvertType(args)...); return tp; } template auto call(Function f, Tuple t, std_utils::index_sequence) -> int { return f(std::get(t)...); } template auto call(Function f, Tuple t) -> int { static constexpr auto size = std::tuple_size::value; return call(f, t, std_utils::make_index_sequence{}); } template auto ConvertToOpApiFunc(const Tuple& params, void* opApiAddr, std_utils::index_sequence) -> int (*)(typename std::decay(params))>::type...) { using OpApiFunc = int (*)(typename std::decay(params))>::type...); auto func = reinterpret_cast(opApiAddr); return func; } template auto ConvertToOpApiFunc(const Tuple& params, void* opApiAddr) -> typename std::enable_if::value != 0, decltype(ConvertToOpApiFunc(params, opApiAddr, std_utils::make_index_sequence::value>{}))>::type { static constexpr auto size = std::tuple_size::value; return ConvertToOpApiFunc(params, opApiAddr, std_utils::make_index_sequence{}); } template class ConvertedParams { public: ConvertedParams(Tuple&& convertedParams) : convertedParams_(std::move(convertedParams)){}; ConvertedParams(ConvertedParams&& other) : convertedParams_(std::move(other.convertedParams_)) { other.validParams_ = false; }; ConvertedParams& operator=(ConvertedParams&& other) { if (this == &other) { return *this; } convertedParams_ = std::move(other.convertedParams_); validParams_ = true; other.validParams_ = false; return *this; } ConvertedParams() = delete; ConvertedParams(const ConvertedParams& other) = delete; ConvertedParams& operator=(const ConvertedParams& other) = delete; ~ConvertedParams() { if (validParams_) { ReleaseConvertTypes(convertedParams_); } } const Tuple& GetConvertedParams() const { return convertedParams_; } private: Tuple convertedParams_; bool validParams_{true}; }; using InitHugeMemThreadLocal = int (*)(void*, bool); using UnInitHugeMemThreadLocal = void (*)(void*, bool); using ReleaseHugeMem = void (*)(void*, bool); using PTAGetExecCache = aclOpExecutor* (*)(uint64_t, uint64_t*); using InitPTACacheThreadLocal = void (*)(); using SetPTAHashKey = void (*)(uint64_t); using CanUsePTACache = bool (*)(const char*); using ResetCacheThreadLocal = void (*)(); #define EXEC_OPAPI_CMD(aclnn_api, ...) \ ({ \ static auto ret = GRAPH_SUCCESS; \ do { \ static const auto ResetCacheThreadLocalAddr = GetOpApiFuncAddr("ResetCacheThreadLocal"); \ static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ if (getWorkspaceSizeFuncAddr == nullptr || opApiFuncAddr == nullptr || ResetCacheThreadLocalAddr == nullptr) { \ OPS_LOG_E("aclnnfallback", "%s or %s not in %s or %s or ResetCacheThreadLocal not found.", \ #aclnn_api "GetWorkspaceSize", #aclnn_api, GetOpApiLibName(), GetOpApiLibName()); \ ret = GRAPH_FAILED; \ break; \ } \ auto ResetCacheThreadLocalFunc = reinterpret_cast(ResetCacheThreadLocalAddr); \ ResetCacheThreadLocalFunc(); \ uint64_t workspace_size = 0; \ uint64_t* workspace_size_addr = &workspace_size; \ aclOpExecutor* executor = nullptr; \ aclOpExecutor** executor_addr = &executor; \ auto converted_params = ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \ static auto getWorkspaceSizeFunc = ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ if (workspace_status != 0) { \ OPS_LOG_E("aclnnfallback", "call %s failed:", #aclnn_api); \ ret = GRAPH_FAILED; \ break; \ } \ void* workspace_addr = nullptr; \ if (workspace_size > 0) { \ workspace_addr = host_api_ctx->MallocWorkspace(workspace_size); \ if (workspace_addr == nullptr) { \ OPS_LOG_E("aclnnfallback", "call %s allocate workspace failed", #aclnn_api); \ ret = GRAPH_FAILED; \ break; \ } \ } \ auto acl_stream = host_api_ctx->GetStream(); \ auto acl_call = [converted_params, workspace_addr, workspace_size, host_api_ctx, acl_stream, \ executor]() -> int { \ using OpApiFunc = int (*)(void*, uint64_t, aclOpExecutor*, const aclrtStream); \ OpApiFunc opApiFunc = reinterpret_cast(opApiFuncAddr); \ auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \ ReleaseConvertTypes(converted_params); \ host_api_ctx->FreeWorkspace(); \ if (api_ret != 0) { \ OPS_LOG_E("aclnnfallback", "call %s allocate workspace failed api_ret: %d", #aclnn_api, api_ret); \ return GRAPH_FAILED; \ } \ return api_ret; \ }; \ \ ret = acl_call(); \ } while (false); \ (ret); \ }) } // namespace fallback #endif // ACLNNFALLBACK_OPAPI_H_