From 13e110eee4ba0e79fa08d24d9b95a4e13476608e Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Tue, 5 Dec 2023 11:42:18 +0800
Subject: [PATCH 1/6] support aclnn

---
 ads/common/ops/csrc/op_api_common.h | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 ads/common/ops/csrc/op_api_common.h

diff --git a/ads/common/ops/csrc/op_api_common.h b/ads/common/ops/csrc/op_api_common.h
new file mode 100644
index 00000000..e69de29b
-- Gitee

From e20405f98e0735ccbd3fab55d7efd21a9eb9a188 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Tue, 5 Dec 2023 11:49:51 +0800
Subject: [PATCH 2/6] support aclnn

---
 ads/common/__init__.py              |   3 +-
 ads/common/ops/abs.py               |   6 +
 ads/common/ops/csrc/AbsOpApi.cpp    |  13 +
 ads/common/ops/csrc/OpApiCommon.h   | 572 ++++++++++++++++++++++++++++
 ads/common/ops/csrc/functions.h     |   1 +
 ads/common/ops/csrc/op_api_common.h |   0
 ads/common/ops/csrc/pybind.cpp      |   1 +
 7 files changed, 595 insertions(+), 1 deletion(-)
 create mode 100644 ads/common/ops/abs.py
 create mode 100644 ads/common/ops/csrc/AbsOpApi.cpp
 create mode 100644 ads/common/ops/csrc/OpApiCommon.h
 delete mode 100644 ads/common/ops/csrc/op_api_common.h

diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index 0249033a..60441e1e 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -1 +1,2 @@
-from .ops.scatter_max import scatter_max
\ No newline at end of file
+from .ops.scatter_max import scatter_max
+from .ops.abs import abs
\ No newline at end of file

diff --git a/ads/common/ops/abs.py b/ads/common/ops/abs.py
new file mode 100644
index 00000000..d1fbd785
--- /dev/null
+++ b/ads/common/ops/abs.py
@@ -0,0 +1,6 @@
+import torch
+
+import torch_npu
+import ads_c
+
+abs = ads_c.abs
\ No newline at end of file

diff --git a/ads/common/ops/csrc/AbsOpApi.cpp b/ads/common/ops/csrc/AbsOpApi.cpp
new file mode 100644
index 00000000..9e651b44
--- /dev/null
+++ b/ads/common/ops/csrc/AbsOpApi.cpp
@@ -0,0 +1,13 @@
+#include <ATen/ATen.h>
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "OpApiCommon.h"
+#include "functions.h"
+
+at::Tensor abs(const at::Tensor& self) {
+  // construct the output tensor of the NPU
+  at::Tensor result = at::empty(self.sizes(), self.options());
+
+  // calculate the output result of the NPU
+  EXEC_NPU_CMD(aclnnAbs, self, result);
+  return result;
+}

diff --git a/ads/common/ops/csrc/OpApiCommon.h b/ads/common/ops/csrc/OpApiCommon.h
new file mode 100644
index 00000000..81c19a60
--- /dev/null
+++ b/ads/common/ops/csrc/OpApiCommon.h
@@ -0,0 +1,572 @@
+#ifndef __OP_API_COMMON_H__
+#define __OP_API_COMMON_H__
+
+#include <dlfcn.h>
+#include <array>
+#include <cstring>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <acl/acl_base.h>
+#include <acl/acl_rt.h>
+
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "torch_npu/csrc/framework/interface/EnvVariables.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+
+#define NPU_NAME_SPACE at_npu::native
+
+typedef struct aclOpExecutor aclOpExecutor;
+typedef struct aclTensor aclTensor;
+typedef struct aclScalar aclScalar;
+typedef struct aclIntArray aclIntArray;
+typedef struct aclFloatArray aclFloatArray;
+typedef struct aclBoolArray aclBoolArray;
+typedef struct aclTensorList aclTensorList;
+
+typedef aclTensor *(*_aclCreateTensor)(
+    const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,
+    const int64_t *stride, int64_t offset, aclFormat format,
+    const int64_t *storage_dims, uint64_t storage_dims_num,
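+    // view_dims/strides describe the tensor as the operator sees it;
+    // storage_dims describes the flat device allocation backing it.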
+    void *tensor_data);
+typedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type);
+typedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size);
+typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value,
+                                               uint64_t size);
+typedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size);
+typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value,
+                                               uint64_t size);
+
+typedef int (*_aclDestroyTensor)(const aclTensor *tensor);
+typedef int (*_aclDestroyScalar)(const aclScalar *scalar);
+typedef int (*_aclDestroyIntArray)(const aclIntArray *array);
+typedef int (*_aclDestroyFloatArray)(const aclFloatArray *array);
+typedef int (*_aclDestroyBoolArray)(const aclBoolArray *array);
+typedef int (*_aclDestroyTensorList)(const aclTensorList *array);
+
+constexpr int kHashBufSize = 8192;
+constexpr int kHashBufMaxSize = kHashBufSize + 1024;
+extern thread_local char g_hashBuf[kHashBufSize];
+extern thread_local int g_hashOffset;
+
+#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \
+  _(at::ScalarType::Byte, ACL_UINT8)                \
+  _(at::ScalarType::Char, ACL_INT8)                 \
+  _(at::ScalarType::Short, ACL_INT16)               \
+  _(at::ScalarType::Int, ACL_INT32)                 \
+  _(at::ScalarType::Long, ACL_INT64)                \
+  _(at::ScalarType::Half, ACL_FLOAT16)              \
+  _(at::ScalarType::Float, ACL_FLOAT)               \
+  _(at::ScalarType::Double, ACL_DOUBLE)             \
+  _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED)  \
+  _(at::ScalarType::ComplexFloat, ACL_COMPLEX64)    \
+  _(at::ScalarType::ComplexDouble, ACL_COMPLEX128)  \
+  _(at::ScalarType::Bool, ACL_BOOL)                 \
+  _(at::ScalarType::QInt8, ACL_DT_UNDEFINED)        \
+  _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED)       \
+  _(at::ScalarType::QInt32, ACL_DT_UNDEFINED)       \
+  _(at::ScalarType::BFloat16, ACL_BF16)             \
+  _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED)     \
+  _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED)     \
+  _(at::ScalarType::Undefined, ACL_DT_UNDEFINED)    \
+  _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)
+
+constexpr aclDataType kATenScalarTypeToAclDataTypeTable
+    [static_cast<int32_t>(at::ScalarType::NumOptions) + 1] = {
+#define DEFINE_ENUM(_1, n) n,
+    AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM)
+#undef DEFINE_ENUM
+};
+
+#define GET_OP_API_FUNC(apiName) \
+  reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName))
+
+#define MEMCPY_TO_BUF(data_expression, size_expression)                \
+  if (g_hashOffset + (size_expression) > kHashBufSize) {               \
+    g_hashOffset = kHashBufMaxSize;                                    \
+    return;                                                            \
+  }                                                                    \
+  memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression);  \
+  g_hashOffset += size_expression;
+
+inline const char *GetOpApiLibName(void) { return "libopapi.so"; }
+
+inline const char *GetCustOpApiLibName(void) { return "libcust_opapi.so"; }
+
+inline void *GetOpApiFuncAddrInLib(void *handler, const char *libName,
+                                   const char *apiName) {
+  auto funcAddr = dlsym(handler, apiName);
+  if (funcAddr == nullptr) {
+    ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName,
+                dlerror());
+  }
+  return funcAddr;
+}
+
+inline void *GetOpApiLibHandler(const char *libName) {
+  auto handler = dlopen(libName, RTLD_LAZY);
+  if (handler == nullptr) {
+    ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror());
+  }
+  return handler;
+}
+
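+// Symbol resolution order: the custom-op library (libcust_opapi.so) is
+// searched first, so user-built kernels can override libopapi.so.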
+inline void *GetOpApiFuncAddr(const char *apiName) {
+  static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName());
+  if (custOpApiHandler != nullptr) {
+    auto funcAddr =
+        GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);
+    if (funcAddr != nullptr) {
+      return funcAddr;
+    }
+  }
+
+  static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName());
+  if (opApiHandler == nullptr) {
+    return nullptr;
+  }
+  return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName);
+}
+
+inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) {
+  c10::Scalar expScalar;
+  const at::Tensor *aclInput = &tensor;
+  if (aclInput->scalar_type() == at::ScalarType::Double) {
+    double value = *(double *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::Long) {
+    int64_t value = *(int64_t *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::Float) {
+    float value = *(float *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::Int) {
+    int value = *(int *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::Half) {
+    c10::Half value = *(c10::Half *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::Bool) {
+    int8_t value = *(int8_t *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::ComplexDouble) {
+    c10::complex<double> value = *(c10::complex<double> *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) {
+    c10::complex<float> value = *(c10::complex<float> *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) {
+    c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr();
+    c10::Scalar scalar(value);
+    expScalar = scalar;
+  }
+  return expScalar;
+}
+
+inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) {
+  at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory();
+  int deviceIndex = 0;
+  return cpuPinMemTensor.to(
+      c10::Device(at_npu::key::NativeDeviceType, deviceIndex),
+      cpuPinMemTensor.scalar_type(), true, true);
+}
+
+inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar,
+                                     at::ScalarType scalar_data_type) {
+  return CopyTensorHostToDevice(
+      scalar_to_tensor(cpu_scalar).to(scalar_data_type));
+}
+
+inline aclTensor *ConvertType(const at::Tensor &at_tensor) {
+  static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
+  if (aclCreateTensor == nullptr) {
+    return nullptr;
+  }
+
+  if (!at_tensor.defined()) {
+    return nullptr;
+  }
+  at::ScalarType scalar_data_type = at_tensor.scalar_type();
+  aclDataType acl_data_type =
+      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];
+  TORCH_CHECK(
+      acl_data_type != ACL_DT_UNDEFINED,
+      std::string(c10::toString(scalar_data_type)) + " has not been supported")
+  c10::SmallVector<int64_t, 5> storageDims;
+  // if acl_data_type is ACL_STRING, storageDims is empty.
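+  // Otherwise the storage is described as a flat 1-D buffer of
+  // nbytes / itemsize elements.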
+  auto itemsize = at_tensor.itemsize();
+  if (itemsize == 0) {
+    AT_ERROR("When ConvertType, tensor item size cannot be zero.");
+    return nullptr;
+  }
+  if (acl_data_type != ACL_STRING) {
+    storageDims.push_back(at_tensor.storage().nbytes() / itemsize);
+  }
+
+  const auto dimNum = at_tensor.sizes().size();
+  aclFormat format = ACL_FORMAT_ND;
+  switch (dimNum) {
+    case 3:
+      format = ACL_FORMAT_NCL;
+      break;
+    case 4:
+      format = ACL_FORMAT_NCHW;
+      break;
+    case 5:
+      format = ACL_FORMAT_NCDHW;
+      break;
+    default:
+      format = ACL_FORMAT_ND;
+  }
+
+  if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
+    c10::Scalar expScalar = ConvertTensorToScalar(at_tensor);
+    at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type);
+    return aclCreateTensor(
+        aclInput.sizes().data(), aclInput.sizes().size(), acl_data_type,
+        aclInput.strides().data(), aclInput.storage_offset(), format,
+        storageDims.data(), storageDims.size(),
+        const_cast<void *>(aclInput.storage().data()));
+  }
+
+  auto acl_tensor = aclCreateTensor(
+      at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type,
+      at_tensor.strides().data(), at_tensor.storage_offset(), format,
+      storageDims.data(), storageDims.size(),
+      const_cast<void *>(at_tensor.storage().data()));
+  return acl_tensor;
+}
+
+inline aclScalar *ConvertType(const at::Scalar &at_scalar) {
+  static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar);
+  if (aclCreateScalar == nullptr) {
+    return nullptr;
+  }
+
+  at::ScalarType scalar_data_type = at_scalar.type();
+  aclDataType acl_data_type =
+      kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];
+  TORCH_CHECK(
+      acl_data_type != ACL_DT_UNDEFINED,
+      std::string(c10::toString(scalar_data_type)) + " has not been supported")
+  aclScalar *acl_scalar = nullptr;
+  switch (scalar_data_type) {
+    case at::ScalarType::Double: {
+      double value = at_scalar.toDouble();
+      acl_scalar = aclCreateScalar(&value, acl_data_type);
+      break;
+    }
+    case at::ScalarType::Long: {
+      int64_t value = at_scalar.toLong();
+      acl_scalar = aclCreateScalar(&value, acl_data_type);
+      break;
+    }
+    case at::ScalarType::Bool: {
+      bool value = at_scalar.toBool();
+      acl_scalar = aclCreateScalar(&value, acl_data_type);
+      break;
+    }
+    case at::ScalarType::ComplexDouble: {
+      auto value = at_scalar.toComplexDouble();
+      acl_scalar = aclCreateScalar(&value, acl_data_type);
+      break;
+    }
+    default:
+      acl_scalar = nullptr;
+      break;
+  }
+  return acl_scalar;
+}
+
+inline aclIntArray *ConvertType(const at::IntArrayRef &at_array) {
+  static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);
+  if (aclCreateIntArray == nullptr) {
+    return nullptr;
+  }
+  auto array = aclCreateIntArray(at_array.data(), at_array.size());
+  return array;
+}
+
+template <std::size_t N>
+inline aclBoolArray *ConvertType(const std::array<bool, N> &value) {
+  static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
+  if (aclCreateBoolArray == nullptr) {
+    return nullptr;
+  }
+
+  auto array = aclCreateBoolArray(value.data(), value.size());
+  return array;
+}
+
+inline aclBoolArray *ConvertType(const at::ArrayRef<bool> &value) {
+  static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
+  if (aclCreateBoolArray == nullptr) {
+    return nullptr;
+  }
+
+  auto array = aclCreateBoolArray(value.data(), value.size());
+  return array;
+}
+
+inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) {
+  static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);
+  if (aclCreateTensorList == nullptr) {
+    return nullptr;
+  }
+
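+  // Convert each at::Tensor into an aclTensor handle first, then wrap the
+  // whole vector in a single aclTensorList.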
+  std::vector<const aclTensor *> tensor_list(at_tensor_list.size());
+  for (size_t i = 0; i < at_tensor_list.size(); i++) {
+    tensor_list[i] = ConvertType(at_tensor_list[i]);
+  }
+  auto acl_tensor_list =
+      aclCreateTensorList(tensor_list.data(), tensor_list.size());
+  return acl_tensor_list;
+}
+
+inline aclTensor *ConvertType(const c10::optional<at::Tensor> &opt_tensor) {
+  if (opt_tensor.has_value() && opt_tensor.value().defined()) {
+    return ConvertType(opt_tensor.value());
+  }
+  return nullptr;
+}
+
+inline aclIntArray *ConvertType(
+    const c10::optional<at::IntArrayRef> &opt_array) {
+  if (opt_array.has_value()) {
+    return ConvertType(opt_array.value());
+  }
+  return nullptr;
+}
+
+inline aclScalar *ConvertType(const c10::optional<at::Scalar> &opt_scalar) {
+  if (opt_scalar.has_value()) {
+    return ConvertType(opt_scalar.value());
+  }
+  return nullptr;
+}
+
+inline aclDataType ConvertType(const at::ScalarType scalarType) {
+  return kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalarType)];
+}
+
+template <typename T>
+T ConvertType(T value) {
+  return value;
+}
+
+template <typename Tuple, size_t... I>
+auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr,
+                        std::index_sequence<I...>) {
+  typedef int (*OpApiFunc)(
+      typename std::decay<decltype(std::get<I>(params))>::type...);
+  auto func = reinterpret_cast<OpApiFunc>(opApiAddr);
+  return func;
+}
+
+template <typename Tuple>
+auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  return ConvertToOpApiFunc(params, opApiAddr,
+                            std::make_index_sequence<size>{});
+}
+
+inline void Release(aclTensor *p) {
+  static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor);
+  if (aclDestroyTensor == nullptr) {
+    return;
+  }
+  aclDestroyTensor(p);
+}
+
+inline void Release(aclScalar *p) {
+  static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar);
+  if (aclDestroyScalar == nullptr) {
+    return;
+  }
+  aclDestroyScalar(p);
+}
+
+inline void Release(aclIntArray *p) {
+  static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray);
+  if (aclDestroyIntArray == nullptr) {
+    return;
+  }
+
+  aclDestroyIntArray(p);
+}
+
+inline void Release(aclBoolArray *p) {
+  static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray);
+  if (aclDestroyBoolArray == nullptr) {
+    return;
+  }
+
+  aclDestroyBoolArray(p);
+}
+
+inline void Release(aclTensorList *p) {
+  static const auto aclDestroyTensorList =
+      GET_OP_API_FUNC(aclDestroyTensorList);
+  if (aclDestroyTensorList == nullptr) {
+    return;
+  }
+
+  aclDestroyTensorList(p);
+}
+
+template <typename T>
+void Release(T value) {
+  (void)value;
+}
+
+template <typename Tuple, size_t... I>
+void CallRelease(Tuple t, std::index_sequence<I...>) {
+  (void)std::initializer_list<int>{(Release(std::get<I>(t)), 0)...};
+}
+
+template <typename Tuple>
+void ReleaseConvertTypes(Tuple &t) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  CallRelease(t, std::make_index_sequence<size>{});
+}
+
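+// ConvertTypes maps every argument through the ConvertType overload set in
+// one pass, yielding the tuple that is later unpacked into the aclnn call.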
+template <typename... Ts>
+constexpr auto ConvertTypes(Ts &... args) {
+  return std::make_tuple(ConvertType(args)...);
+}
+
+template <typename Function, typename Tuple, size_t... I>
+auto call(Function f, Tuple t, std::index_sequence<I...>) {
+  return f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+auto call(Function f, Tuple t) {
+  static constexpr auto size = std::tuple_size<Tuple>::value;
+  return call(f, t, std::make_index_sequence<size>{});
+}
+
+template <std::size_t N>
+void AddParamToBuf(const std::array<bool, N> &value) {
+  MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool));
+}
+
+template <typename T>
+void AddParamToBuf(const T &value) {
+  MEMCPY_TO_BUF(&value, sizeof(T));
+}
+
+void AddParamToBuf(const at::Tensor &);
+void AddParamToBuf(const at::Scalar &);
+void AddParamToBuf(const at::IntArrayRef &);
+void AddParamToBuf(const at::ArrayRef<bool> &);
+void AddParamToBuf(const at::TensorList &);
+void AddParamToBuf(const c10::optional<at::Tensor> &);
+void AddParamToBuf(const c10::optional<at::IntArrayRef> &);
+void AddParamToBuf(const c10::optional<at::Scalar> &);
+void AddParamToBuf(const at::ScalarType);
+void AddParamToBuf(const std::string &);
+void AddParamToBuf();
+
+template <typename T, typename... Args>
+void AddParamToBuf(const T &arg, Args &... args) {
+  AddParamToBuf(arg);
+  AddParamToBuf(args...);
+}
+
+uint64_t CalcHashId();
+typedef int (*InitHugeMemThreadLocal)(void *, bool);
+typedef void (*UnInitHugeMemThreadLocal)(void *, bool);
+typedef void (*ReleaseHugeMem)(void *, bool);
+
+
+#define DO_COMPATIBILITY(aclnn_api, originCallExpression)                                                      \
+  do {                                                                                                         \
+    static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize");              \
+    static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api);                                            \
+    if (getWorkspaceSizeFuncAddr == nullptr || opApiFuncAddr == nullptr) {                                     \
+      ASCEND_LOGW("%s or %sGetWorkspaceSize not in %s, or %s not found. Will call %s", #aclnn_api, #aclnn_api, \
+                  GetOpApiLibName(), GetOpApiLibName(), #originCallExpression);                                \
+      return originCallExpression;                                                                             \
+    }                                                                                                          \
+  } while (0)
+
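+// EXEC_NPU_CMD implements the standard two-phase aclnn protocol:
+// 1) resolve <api>GetWorkspaceSize and <api> from libopapi.so,
+// 2) query the workspace size and allocate it as a byte tensor on the NPU,
+// 3) enqueue the kernel launch as a custom handler on the current stream.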
+#define EXEC_NPU_CMD(aclnn_api, ...)                                       \
+  do {                                                                     \
+    static const auto getWorkspaceSizeFuncAddr =                           \
+        GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize");                   \
+    static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api);        \
+    static const auto initMemAddr =                                        \
+        GetOpApiFuncAddr("InitHugeMemThreadLocal");                        \
+    static const auto unInitMemAddr =                                      \
+        GetOpApiFuncAddr("UnInitHugeMemThreadLocal");                      \
+    static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \
+    TORCH_CHECK(                                                           \
+        getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr,   \
+        #aclnn_api, " or ", #aclnn_api "GetWorkspaceSize", " not in ",     \
+        GetOpApiLibName(), ", or ", GetOpApiLibName(), "not found.");      \
+    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);        \
+    uint64_t workspace_size = 0;                                           \
+    uint64_t *workspace_size_addr = &workspace_size;                       \
+    aclOpExecutor *executor = nullptr;                                     \
+    aclOpExecutor **executor_addr = &executor;                             \
+    InitHugeMemThreadLocal initMemFunc =                                   \
+        reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr);             \
+    UnInitHugeMemThreadLocal unInitMemFunc =                               \
+        reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr);         \
+    if (initMemFunc) {                                                     \
+      initMemFunc(nullptr, false);                                         \
+    }                                                                      \
+    auto converted_params =                                                \
+        ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr);     \
+    static auto getWorkspaceSizeFunc =                                     \
+        ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr);    \
+    auto workspace_status = call(getWorkspaceSizeFunc, converted_params);  \
+    TORCH_CHECK(workspace_status == 0,                                     \
+                "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \
+    void *workspace_addr = nullptr;                                        \
+    if (workspace_size != 0) {                                             \
+      at::TensorOptions options =                                          \
+          at::TensorOptions(torch_npu::utils::get_npu_device_type());      \
+      auto workspace_tensor =                                              \
+          at::empty({workspace_size}, options.dtype(at::kByte));           \
+      workspace_addr = const_cast<void *>(workspace_tensor.storage().data()); \
+    }                                                                      \
+    auto acl_call = [converted_params, workspace_addr, workspace_size,     \
+                     acl_stream, executor]() -> int {                      \
+      typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *,          \
+                               const aclrtStream);                         \
+      OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);    \
+      auto api_ret =                                                       \
+          opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \
+      TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:",     \
+                  aclGetRecentErrMsg());                                   \
+      ReleaseConvertTypes(converted_params);                               \
+      ReleaseHugeMem releaseMemFunc =                                      \
+          reinterpret_cast<ReleaseHugeMem>(releaseMemAddr);                \
+      if (releaseMemFunc) {                                                \
+        releaseMemFunc(nullptr, false);                                    \
+      }                                                                    \
+      return api_ret;                                                      \
+    };                                                                     \
+    at_npu::native::OpCommand cmd;                                         \
+    cmd.Name(#aclnn_api);                                                  \
+    cmd.SetCustomHandler(acl_call);                                        \
+    cmd.Run();                                                             \
+    if (unInitMemFunc) {                                                   \
+      unInitMemFunc(nullptr, false);                                       \
+    }                                                                      \
+  } while (false)
+
+#endif  // __OP_API_COMMON_H__

diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index d5b69716..aa3e82b9 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -5,3 +5,4 @@
 void init_common(pybind11::module &m);
 std::tuple<at::Tensor, at::Tensor> npu_scatter_max(const at::Tensor& updates, const at::Tensor& indices, c10::optional<at::Tensor> out);
 at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments);
+at::Tensor abs(const at::Tensor& self);

diff --git a/ads/common/ops/csrc/op_api_common.h b/ads/common/ops/csrc/op_api_common.h
deleted file mode 100644
index e69de29b..00000000

diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index 7feba1da..3bc51477 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -5,4 +5,5 @@ void init_common(pybind11::module &m)
 {
     m.def("npu_scatter_max", &npu_scatter_max);
     m.def("npu_scatter_max_backward", &npu_scatter_max_backward);
+    m.def("abs", &abs);
 }
-- Gitee

From 5b992ae08c2758d1b69f49957818c022101d7689 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Tue, 5 Dec 2023 15:04:06 +0800
Subject: [PATCH 3/6] support aclnn

---
 ads/common/ops/csrc/AbsOpApi.cpp  |  13 +-
 ads/common/ops/csrc/OpApiCommon.h | 868 +++++++++++++++---------------
 2 files changed, 449 insertions(+), 432 deletions(-)

diff --git a/ads/common/ops/csrc/AbsOpApi.cpp b/ads/common/ops/csrc/AbsOpApi.cpp
index 9e651b44..e4187c16 100644
--- a/ads/common/ops/csrc/AbsOpApi.cpp
+++ b/ads/common/ops/csrc/AbsOpApi.cpp
@@ -3,11 +3,12 @@
 #include "OpApiCommon.h"
 #include "functions.h"
 
-at::Tensor abs(const at::Tensor& self) {
-  // construct the output tensor of the NPU
-  at::Tensor result = at::empty(self.sizes(), self.options());
+at::Tensor abs(const at::Tensor& self)
+{
+    // construct the output tensor of the NPU
+    at::Tensor result = at::empty(self.sizes(), self.options());
 
-  // calculate the output result of the NPU
-  EXEC_NPU_CMD(aclnnAbs, self, result);
-  return result;
+    // calculate the output result of the NPU
+    EXEC_NPU_CMD(aclnnAbs, self, result);
+    return result;
 }

diff --git a/ads/common/ops/csrc/OpApiCommon.h b/ads/common/ops/csrc/OpApiCommon.h
index 81c19a60..f3d876b6 100644
--- a/ads/common/ops/csrc/OpApiCommon.h
+++ b/ads/common/ops/csrc/OpApiCommon.h
@@ -1,6 +1,3 @@
-#ifndef __OP_API_COMMON_H__
-#define __OP_API_COMMON_H__
-
 #include <dlfcn.h>
 #include <array>
 #include <cstring>
@@ -30,17 +27,14 @@ typedef struct aclFloatArray aclFloatArray;
 typedef struct aclBoolArray aclBoolArray;
 typedef struct aclTensorList aclTensorList;
 
-typedef aclTensor *(*_aclCreateTensor)(
-    const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,
-    const int64_t *stride, int64_t offset, aclFormat format,
-    const int64_t *storage_dims, uint64_t storage_dims_num, void *tensor_data);
+typedef aclTensor *(*_aclCreateTensor)(const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,
+    const int64_t *stride, int64_t offset, aclFormat format, const int64_t *storage_dims, uint64_t storage_dims_num,
+    void *tensor_data);
 typedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type);
 typedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size);
-typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value,
-                                               uint64_t size);
+typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value, uint64_t size);
 typedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size);
-typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value,
-                                               uint64_t size);
+typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value, uint64_t size);
 
 typedef int (*_aclDestroyTensor)(const aclTensor *tensor);
 typedef int (*_aclDestroyScalar)(const aclScalar *scalar);
@@ -54,417 +48,447 @@ constexpr int kHashBufMaxSize = kHashBufSize + 1024;
 extern thread_local char g_hashBuf[kHashBufSize];
 extern thread_local int g_hashOffset;
 
-#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \
-  _(at::ScalarType::Byte, ACL_UINT8)                \
-  _(at::ScalarType::Char, ACL_INT8)                 \
-  _(at::ScalarType::Short, ACL_INT16)               \
-  _(at::ScalarType::Int, ACL_INT32)                 \
-  _(at::ScalarType::Long, ACL_INT64)                \
-  _(at::ScalarType::Half, ACL_FLOAT16)              \
-  _(at::ScalarType::Float, ACL_FLOAT)               \
-  _(at::ScalarType::Double, ACL_DOUBLE)             \
-  _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED)  \
-  _(at::ScalarType::ComplexFloat,
ACL_COMPLEX64) \ - _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \ - _(at::ScalarType::Bool, ACL_BOOL) \ - _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \ - _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \ - _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \ - _(at::ScalarType::BFloat16, ACL_BF16) \ - _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \ - _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \ - _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \ - _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED) - -constexpr aclDataType kATenScalarTypeToAclDataTypeTable - [static_cast(at::ScalarType::NumOptions) + 1] = { +#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \ + _(at::ScalarType::Byte, ACL_UINT8) \ + _(at::ScalarType::Char, ACL_INT8) \ + _(at::ScalarType::Short, ACL_INT16) \ + _(at::ScalarType::Int, ACL_INT32) \ + _(at::ScalarType::Long, ACL_INT64) \ + _(at::ScalarType::Half, ACL_FLOAT16) \ + _(at::ScalarType::Float, ACL_FLOAT) \ + _(at::ScalarType::Double, ACL_DOUBLE) \ + _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED) \ + _(at::ScalarType::ComplexFloat, ACL_COMPLEX64) \ + _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \ + _(at::ScalarType::Bool, ACL_BOOL) \ + _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \ + _(at::ScalarType::BFloat16, ACL_BF16) \ + _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \ + _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \ + _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \ + _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED) + +constexpr aclDataType kATenScalarTypeToAclDataTypeTable[static_cast(at::ScalarType::NumOptions) + 1] = { #define DEFINE_ENUM(_1, n) n, - AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM) + AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM) #undef DEFINE_ENUM }; -#define GET_OP_API_FUNC(apiName) \ - reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName)) - -#define MEMCPY_TO_BUF(data_expression, size_expression) \ - if (g_hashOffset + (size_expression) > kHashBufSize) { \ - g_hashOffset = kHashBufMaxSize; \ - return; \ - } \ - memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \ - g_hashOffset += size_expression; - -inline const char *GetOpApiLibName(void) { return "libopapi.so"; } - -inline const char *GetCustOpApiLibName(void) { return "libcust_opapi.so"; } - -inline void *GetOpApiFuncAddrInLib(void *handler, const char *libName, - const char *apiName) { - auto funcAddr = dlsym(handler, apiName); - if (funcAddr == nullptr) { - ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName, - dlerror()); - } - return funcAddr; -} - -inline void *GetOpApiLibHandler(const char *libName) { - auto handler = dlopen(libName, RTLD_LAZY); - if (handler == nullptr) { - ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror()); - } - return handler; -} - -inline void *GetOpApiFuncAddr(const char *apiName) { - static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName()); - if (custOpApiHandler != nullptr) { - auto funcAddr = - GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName); - if (funcAddr != nullptr) { - return funcAddr; +#define GET_OP_API_FUNC(apiName) reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName)) + +#define MEMCPY_TO_BUF(data_expression, size_expression) \ + if (g_hashOffset + (size_expression) > kHashBufSize) { \ + g_hashOffset = kHashBufMaxSize; \ + return; \ + } \ + memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \ + g_hashOffset += size_expression; + +inline 
const char *GetOpApiLibName(void) +{ + return "libopapi.so"; +} + +inline const char *GetCustOpApiLibName(void) +{ + return "libcust_opapi.so"; +} + +inline void *GetOpApiFuncAddrInLib(void *handler, const char *libName, const char *apiName) +{ + auto funcAddr = dlsym(handler, apiName); + if (funcAddr == nullptr) { + ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName, dlerror()); } - } + return funcAddr; +} - static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName()); - if (opApiHandler == nullptr) { - return nullptr; - } - return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); -} - -inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) { - c10::Scalar expScalar; - const at::Tensor *aclInput = &tensor; - if (aclInput->scalar_type() == at::ScalarType::Double) { - double value = *(double *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::Long) { - int64_t value = *(int64_t *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::Float) { - float value = *(float *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::Int) { - int value = *(int *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::Half) { - c10::Half value = *(c10::Half *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::Bool) { - int8_t value = *(int8_t *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::ComplexDouble) { - c10::complex value = *(c10::complex *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) { - c10::complex value = *(c10::complex *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) { - c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr(); - c10::Scalar scalar(value); - expScalar = scalar; - } - return expScalar; -} - -inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) { - at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory(); - int deviceIndex = 0; - return cpuPinMemTensor.to( - c10::Device(at_npu::key::NativeDeviceType, deviceIndex), - cpuPinMemTensor.scalar_type(), true, true); -} - -inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar, - at::ScalarType scalar_data_type) { - return CopyTensorHostToDevice( - scalar_to_tensor(cpu_scalar).to(scalar_data_type)); -} - -inline aclTensor *ConvertType(const at::Tensor &at_tensor) { - static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); - if (aclCreateTensor == nullptr) { - return nullptr; - } +inline void *GetOpApiLibHandler(const char *libName) +{ + auto handler = dlopen(libName, RTLD_LAZY); + if (handler == nullptr) { + ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror()); + } + return handler; +} - if (!at_tensor.defined()) { - return nullptr; - } - at::ScalarType scalar_data_type = at_tensor.scalar_type(); - aclDataType acl_data_type = - kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; - TORCH_CHECK( - acl_data_type != ACL_DT_UNDEFINED, - std::string(c10::toString(scalar_data_type)) + " has 
not been supported") - c10::SmallVector storageDims; - // if acl_data_type is ACL_STRING, storageDims is empty. - auto itemsize = at_tensor.itemsize(); - if (itemsize == 0) { - AT_ERROR("When ConvertType, tensor item size of cannot be zero."); - return nullptr; - } - if (acl_data_type != ACL_STRING) { - storageDims.push_back(at_tensor.storage().nbytes() / itemsize); - } - - const auto dimNum = at_tensor.sizes().size(); - aclFormat format = ACL_FORMAT_ND; - switch (dimNum) { - case 3: - format = ACL_FORMAT_NCL; - break; - case 4: - format = ACL_FORMAT_NCHW; - break; - case 5: - format = ACL_FORMAT_NCDHW; - break; - default: - format = ACL_FORMAT_ND; - } - - if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) { - c10::Scalar expScalar = ConvertTensorToScalar(at_tensor); - at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type); - return aclCreateTensor( - aclInput.sizes().data(), aclInput.sizes().size(), acl_data_type, - aclInput.strides().data(), aclInput.storage_offset(), format, - storageDims.data(), storageDims.size(), const_cast(aclInput.storage().data())); - } - - auto acl_tensor = aclCreateTensor( - at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type, - at_tensor.strides().data(), at_tensor.storage_offset(), format, - storageDims.data(), storageDims.size(), const_cast(at_tensor.storage().data())); - return acl_tensor; -} - -inline aclScalar *ConvertType(const at::Scalar &at_scalar) { - static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar); - if (aclCreateScalar == nullptr) { - return nullptr; - } - - at::ScalarType scalar_data_type = at_scalar.type(); - aclDataType acl_data_type = - kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; - TORCH_CHECK( - acl_data_type != ACL_DT_UNDEFINED, - std::string(c10::toString(scalar_data_type)) + " has not been supported") - aclScalar *acl_scalar = nullptr; - switch (scalar_data_type) { - case at::ScalarType::Double: { - double value = at_scalar.toDouble(); - acl_scalar = aclCreateScalar(&value, acl_data_type); - break; +inline void *GetOpApiFuncAddr(const char *apiName) +{ + static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName()); + if (custOpApiHandler != nullptr) { + auto funcAddr = GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName); + if (funcAddr != nullptr) { + return funcAddr; + } + } + + static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName()); + if (opApiHandler == nullptr) { + return nullptr; + } + return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); +} + +inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) +{ + c10::Scalar expScalar; + const at::Tensor *aclInput = &tensor; + if (aclInput->scalar_type() == at::ScalarType::Double) { + double value = *(double *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::Long) { + int64_t value = *(int64_t *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::Float) { + float value = *(float *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::Int) { + int value = *(int *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::Half) { + c10::Half value = *(c10::Half *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if 
(aclInput->scalar_type() == at::ScalarType::Bool) { + int8_t value = *(int8_t *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::ComplexDouble) { + c10::complex value = *(c10::complex *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) { + c10::complex value = *(c10::complex *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) { + c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr(); + c10::Scalar scalar(value); + expScalar = scalar; + } + return expScalar; +} + +inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) +{ + at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory(); + int deviceIndex = 0; + return cpuPinMemTensor.to( + c10::Device(at_npu::key::NativeDeviceType, deviceIndex), cpuPinMemTensor.scalar_type(), true, true); +} + +inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type) +{ + return CopyTensorHostToDevice(scalar_to_tensor(cpu_scalar).to(scalar_data_type)); +} + +inline aclTensor *ConvertType(const at::Tensor &at_tensor) +{ + static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); + if (aclCreateTensor == nullptr) { + return nullptr; } - case at::ScalarType::Long: { - int64_t value = at_scalar.toLong(); - acl_scalar = aclCreateScalar(&value, acl_data_type); - break; + + if (!at_tensor.defined()) { + return nullptr; + } + at::ScalarType scalar_data_type = at_tensor.scalar_type(); + aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; + TORCH_CHECK( + acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported") + c10::SmallVector storageDims; + // if acl_data_type is ACL_STRING, storageDims is empty. 
+ auto itemsize = at_tensor.itemsize(); + if (itemsize == 0) { + AT_ERROR("When ConvertType, tensor item size of cannot be zero."); + return nullptr; } - case at::ScalarType::Bool: { - bool value = at_scalar.toBool(); - acl_scalar = aclCreateScalar(&value, acl_data_type); - break; + if (acl_data_type != ACL_STRING) { + storageDims.push_back(at_tensor.storage().nbytes() / itemsize); } - case at::ScalarType::ComplexDouble: { - auto value = at_scalar.toComplexDouble(); - acl_scalar = aclCreateScalar(&value, acl_data_type); - break; + + const auto dimNum = at_tensor.sizes().size(); + aclFormat format = ACL_FORMAT_ND; + switch (dimNum) { + case 3: + format = ACL_FORMAT_NCL; + break; + case 4: + format = ACL_FORMAT_NCHW; + break; + case 5: + format = ACL_FORMAT_NCDHW; + break; + default: + format = ACL_FORMAT_ND; } - default: - acl_scalar = nullptr; - break; - } - return acl_scalar; + + if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) { + c10::Scalar expScalar = ConvertTensorToScalar(at_tensor); + at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type); + return aclCreateTensor(aclInput.sizes().data(), + aclInput.sizes().size(), + acl_data_type, + aclInput.strides().data(), + aclInput.storage_offset(), + format, + storageDims.data(), + storageDims.size(), + const_cast(aclInput.storage().data())); + } + + auto acl_tensor = aclCreateTensor(at_tensor.sizes().data(), + at_tensor.sizes().size(), + acl_data_type, + at_tensor.strides().data(), + at_tensor.storage_offset(), + format, + storageDims.data(), + storageDims.size(), + const_cast(at_tensor.storage().data())); + return acl_tensor; +} + +inline aclScalar *ConvertType(const at::Scalar &at_scalar) +{ + static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar); + if (aclCreateScalar == nullptr) { + return nullptr; + } + + at::ScalarType scalar_data_type = at_scalar.type(); + aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; + TORCH_CHECK( + acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported") + aclScalar *acl_scalar = nullptr; + switch (scalar_data_type) { + case at::ScalarType::Double: { + double value = at_scalar.toDouble(); + acl_scalar = aclCreateScalar(&value, acl_data_type); + break; + } + case at::ScalarType::Long: { + int64_t value = at_scalar.toLong(); + acl_scalar = aclCreateScalar(&value, acl_data_type); + break; + } + case at::ScalarType::Bool: { + bool value = at_scalar.toBool(); + acl_scalar = aclCreateScalar(&value, acl_data_type); + break; + } + case at::ScalarType::ComplexDouble: { + auto value = at_scalar.toComplexDouble(); + acl_scalar = aclCreateScalar(&value, acl_data_type); + break; + } + default: + acl_scalar = nullptr; + break; + } + return acl_scalar; } -inline aclIntArray *ConvertType(const at::IntArrayRef &at_array) { - static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray); - if (aclCreateIntArray == nullptr) { - return nullptr; - } - auto array = aclCreateIntArray(at_array.data(), at_array.size()); - return array; +inline aclIntArray *ConvertType(const at::IntArrayRef &at_array) +{ + static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray); + if (aclCreateIntArray == nullptr) { + return nullptr; + } + auto array = aclCreateIntArray(at_array.data(), at_array.size()); + return array; } template -inline aclBoolArray *ConvertType(const std::array &value) { - static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); - if (aclCreateBoolArray 
== nullptr) { - return nullptr; - } +inline aclBoolArray *ConvertType(const std::array &value) +{ + static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); + if (aclCreateBoolArray == nullptr) { + return nullptr; + } - auto array = aclCreateBoolArray(value.data(), value.size()); - return array; + auto array = aclCreateBoolArray(value.data(), value.size()); + return array; } -inline aclBoolArray *ConvertType(const at::ArrayRef &value) { - static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); - if (aclCreateBoolArray == nullptr) { - return nullptr; - } +inline aclBoolArray *ConvertType(const at::ArrayRef &value) +{ + static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); + if (aclCreateBoolArray == nullptr) { + return nullptr; + } - auto array = aclCreateBoolArray(value.data(), value.size()); - return array; + auto array = aclCreateBoolArray(value.data(), value.size()); + return array; } -inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) { - static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList); - if (aclCreateTensorList == nullptr) { - return nullptr; - } +inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) +{ + static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList); + if (aclCreateTensorList == nullptr) { + return nullptr; + } - std::vector tensor_list(at_tensor_list.size()); - for (size_t i = 0; i < at_tensor_list.size(); i++) { - tensor_list[i] = ConvertType(at_tensor_list[i]); - } - auto acl_tensor_list = - aclCreateTensorList(tensor_list.data(), tensor_list.size()); - return acl_tensor_list; + std::vector tensor_list(at_tensor_list.size()); + for (size_t i = 0; i < at_tensor_list.size(); i++) { + tensor_list[i] = ConvertType(at_tensor_list[i]); + } + auto acl_tensor_list = aclCreateTensorList(tensor_list.data(), tensor_list.size()); + return acl_tensor_list; } -inline aclTensor *ConvertType(const c10::optional &opt_tensor) { - if (opt_tensor.has_value() && opt_tensor.value().defined()) { - return ConvertType(opt_tensor.value()); - } - return nullptr; +inline aclTensor *ConvertType(const c10::optional &opt_tensor) +{ + if (opt_tensor.has_value() && opt_tensor.value().defined()) { + return ConvertType(opt_tensor.value()); + } + return nullptr; } -inline aclIntArray *ConvertType( - const c10::optional &opt_array) { - if (opt_array.has_value()) { - return ConvertType(opt_array.value()); - } - return nullptr; +inline aclIntArray *ConvertType(const c10::optional &opt_array) +{ + if (opt_array.has_value()) { + return ConvertType(opt_array.value()); + } + return nullptr; } -inline aclScalar *ConvertType(const c10::optional &opt_scalar) { - if (opt_scalar.has_value()) { - return ConvertType(opt_scalar.value()); - } - return nullptr; +inline aclScalar *ConvertType(const c10::optional &opt_scalar) +{ + if (opt_scalar.has_value()) { + return ConvertType(opt_scalar.value()); + } + return nullptr; } -inline aclDataType ConvertType(const at::ScalarType scalarType) { - return kATenScalarTypeToAclDataTypeTable[static_cast(scalarType)]; +inline aclDataType ConvertType(const at::ScalarType scalarType) +{ + return kATenScalarTypeToAclDataTypeTable[static_cast(scalarType)]; } template -T ConvertType(T value) { - return value; +T ConvertType(T value) +{ + return value; } template -auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr, - std::index_sequence) { - typedef int (*OpApiFunc)( - typename std::decay(params))>::type...); - auto func = 
reinterpret_cast(opApiAddr); - return func; +auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr, std::index_sequence) +{ + typedef int (*OpApiFunc)(typename std::decay(params))>::type...); + auto func = reinterpret_cast(opApiAddr); + return func; } template -auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr) { - static constexpr auto size = std::tuple_size::value; - return ConvertToOpApiFunc(params, opApiAddr, - std::make_index_sequence{}); +auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr) +{ + static constexpr auto size = std::tuple_size::value; + return ConvertToOpApiFunc(params, opApiAddr, std::make_index_sequence{}); } -inline void Release(aclTensor *p) { - static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); - if (aclDestroyTensor == nullptr) { - return; - } - aclDestroyTensor(p); +inline void Release(aclTensor *p) +{ + static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); + if (aclDestroyTensor == nullptr) { + return; + } + aclDestroyTensor(p); } -inline void Release(aclScalar *p) { - static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar); - if (aclDestroyScalar == nullptr) { - return; - } - aclDestroyScalar(p); +inline void Release(aclScalar *p) +{ + static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar); + if (aclDestroyScalar == nullptr) { + return; + } + aclDestroyScalar(p); } -inline void Release(aclIntArray *p) { - static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray); - if (aclDestroyIntArray == nullptr) { - return; - } +inline void Release(aclIntArray *p) +{ + static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray); + if (aclDestroyIntArray == nullptr) { + return; + } - aclDestroyIntArray(p); + aclDestroyIntArray(p); } -inline void Release(aclBoolArray *p) { - static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray); - if (aclDestroyBoolArray == nullptr) { - return; - } +inline void Release(aclBoolArray *p) +{ + static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray); + if (aclDestroyBoolArray == nullptr) { + return; + } - aclDestroyBoolArray(p); + aclDestroyBoolArray(p); } -inline void Release(aclTensorList *p) { - static const auto aclDestroyTensorList = - GET_OP_API_FUNC(aclDestroyTensorList); - if (aclDestroyTensorList == nullptr) { - return; - } +inline void Release(aclTensorList *p) +{ + static const auto aclDestroyTensorList = GET_OP_API_FUNC(aclDestroyTensorList); + if (aclDestroyTensorList == nullptr) { + return; + } - aclDestroyTensorList(p); + aclDestroyTensorList(p); } template -void Release(T value) { - (void)value; +void Release(T value) +{ + (void)value; } template -void CallRelease(Tuple t, std::index_sequence) { - (void)std::initializer_list{(Release(std::get(t)), 0)...}; +void CallRelease(Tuple t, std::index_sequence) +{ + (void)std::initializer_list{(Release(std::get(t)), 0)...}; } template -void ReleaseConvertTypes(Tuple &t) { - static constexpr auto size = std::tuple_size::value; - CallRelease(t, std::make_index_sequence{}); +void ReleaseConvertTypes(Tuple &t) +{ + static constexpr auto size = std::tuple_size::value; + CallRelease(t, std::make_index_sequence{}); } template -constexpr auto ConvertTypes(Ts &... 
args) { - return std::make_tuple(ConvertType(args)...); +constexpr auto ConvertTypes(Ts &...args) +{ + return std::make_tuple(ConvertType(args)...); } template -auto call(Function f, Tuple t, std::index_sequence) { - return f(std::get(t)...); +auto call(Function f, Tuple t, std::index_sequence) +{ + return f(std::get(t)...); } template -auto call(Function f, Tuple t) { - static constexpr auto size = std::tuple_size::value; - return call(f, t, std::make_index_sequence{}); +auto call(Function f, Tuple t) +{ + static constexpr auto size = std::tuple_size::value; + return call(f, t, std::make_index_sequence{}); } template -void AddParamToBuf(const std::array &value) { - MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool)); +void AddParamToBuf(const std::array &value) +{ + MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool)); } template -void AddParamToBuf(const T &value) { - MEMCPY_TO_BUF(&value, sizeof(T)); +void AddParamToBuf(const T &value) +{ + MEMCPY_TO_BUF(&value, sizeof(T)); } void AddParamToBuf(const at::Tensor &); @@ -480,9 +504,10 @@ void AddParamToBuf(const string &); void AddParamToBuf(); template -void AddParamToBuf(const T &arg, Args &... args) { - AddParamToBuf(arg); - AddParamToBuf(args...); +void AddParamToBuf(const T &arg, Args &...args) +{ + AddParamToBuf(arg); + AddParamToBuf(args...); } uint64_t CalcHashId(); @@ -490,83 +515,74 @@ typedef int (*InitHugeMemThreadLocal)(void *, bool); typedef void (*UnInitHugeMemThreadLocal)(void *, bool); typedef void (*ReleaseHugeMem)(void *, bool); - -#define DO_COMPATIBILITY(aclnn_api, originCallExpression) \ - do { \ - static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ - static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ - if (getWorkspaceSizeFuncAddr == nullptr || opApiFuncAddr == nullptr) { \ - ASCEND_LOGW("%s or %sGetWorkspaceSize not in %s, or %s not found. Will call %s", #aclnn_api, #aclnn_api, \ - GetOpApiLibName(), GetOpApiLibName(), #originCallExpression); \ - return originCallExpression; \ - } \ +#define DO_COMPATIBILITY(aclnn_api, originCallExpression) \ + do { \ + static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ + static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ + if (getWorkspaceSizeFuncAddr == nullptr || opApiFuncAddr == nullptr) { \ + ASCEND_LOGW("%s or %sGetWorkspaceSize not in %s, or %s not found. Will call %s", \ + #aclnn_api, \ + #aclnn_api, \ + GetOpApiLibName(), \ + GetOpApiLibName(), \ + #originCallExpression); \ + return originCallExpression; \ + } \ } while (0) -#define EXEC_NPU_CMD(aclnn_api, ...) 
\ - do { \ - static const auto getWorkspaceSizeFuncAddr = \ - GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ - static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ - static const auto initMemAddr = \ - GetOpApiFuncAddr("InitHugeMemThreadLocal"); \ - static const auto unInitMemAddr = \ - GetOpApiFuncAddr("UnInitHugeMemThreadLocal"); \ - static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \ - TORCH_CHECK( \ - getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr, \ - #aclnn_api, " or ", #aclnn_api "GetWorkspaceSize", " not in ", \ - GetOpApiLibName(), ", or ", GetOpApiLibName(), "not found."); \ - auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ - uint64_t workspace_size = 0; \ - uint64_t *workspace_size_addr = &workspace_size; \ - aclOpExecutor *executor = nullptr; \ - aclOpExecutor **executor_addr = &executor; \ - InitHugeMemThreadLocal initMemFunc = \ - reinterpret_cast(initMemAddr); \ - UnInitHugeMemThreadLocal unInitMemFunc = \ - reinterpret_cast(unInitMemAddr); \ - if (initMemFunc) { \ - initMemFunc(nullptr, false); \ - } \ - auto converted_params = \ - ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \ - static auto getWorkspaceSizeFunc = \ - ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ - auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ - TORCH_CHECK(workspace_status == 0, \ - "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \ - void *workspace_addr = nullptr; \ - if (workspace_size != 0) { \ - at::TensorOptions options = \ - at::TensorOptions(torch_npu::utils::get_npu_device_type()); \ - auto workspace_tensor = \ - at::empty({workspace_size}, options.dtype(at::kByte)); \ - workspace_addr = const_cast(workspace_tensor.storage().data()); \ - } \ - auto acl_call = [converted_params, workspace_addr, workspace_size, \ - acl_stream, executor]() -> int { \ - typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, \ - const aclrtStream); \ - OpApiFunc opApiFunc = reinterpret_cast(opApiFuncAddr); \ - auto api_ret = \ - opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \ - TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:", \ - aclGetRecentErrMsg()); \ - ReleaseConvertTypes(converted_params); \ - ReleaseHugeMem releaseMemFunc = \ - reinterpret_cast(releaseMemAddr); \ - if (releaseMemFunc) { \ - releaseMemFunc(nullptr, false); \ - } \ - return api_ret; \ - }; \ - at_npu::native::OpCommand cmd; \ - cmd.Name(#aclnn_api); \ - cmd.SetCustomHandler(acl_call); \ - cmd.Run(); \ - if (unInitMemFunc) { \ - unInitMemFunc(nullptr, false); \ - } \ - } while (false) - -#endif // __OP_API_COMMON_H__ +#define EXEC_NPU_CMD(aclnn_api, ...) 
\ + do { \ + static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ + static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ + static const auto initMemAddr = GetOpApiFuncAddr("InitHugeMemThreadLocal"); \ + static const auto unInitMemAddr = GetOpApiFuncAddr("UnInitHugeMemThreadLocal"); \ + static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \ + TORCH_CHECK(getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr, \ + #aclnn_api, \ + " or ", \ + #aclnn_api "GetWorkspaceSize", \ + " not in ", \ + GetOpApiLibName(), \ + ", or ", \ + GetOpApiLibName(), \ + "not found."); \ + auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ + uint64_t workspace_size = 0; \ + uint64_t *workspace_size_addr = &workspace_size; \ + aclOpExecutor *executor = nullptr; \ + aclOpExecutor **executor_addr = &executor; \ + InitHugeMemThreadLocal initMemFunc = reinterpret_cast(initMemAddr); \ + UnInitHugeMemThreadLocal unInitMemFunc = reinterpret_cast(unInitMemAddr); \ + if (initMemFunc) { \ + initMemFunc(nullptr, false); \ + } \ + auto converted_params = ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \ + static auto getWorkspaceSizeFunc = ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ + auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ + TORCH_CHECK(workspace_status == 0, "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \ + void *workspace_addr = nullptr; \ + if (workspace_size != 0) { \ + at::TensorOptions options = at::TensorOptions(torch_npu::utils::get_npu_device_type()); \ + auto workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte)); \ + workspace_addr = const_cast(workspace_tensor.storage().data()); \ + } \ + auto acl_call = [converted_params, workspace_addr, workspace_size, acl_stream, executor]() -> int { \ + typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, const aclrtStream); \ + OpApiFunc opApiFunc = reinterpret_cast(opApiFuncAddr); \ + auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \ + TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \ + ReleaseConvertTypes(converted_params); \ + ReleaseHugeMem releaseMemFunc = reinterpret_cast(releaseMemAddr); \ + if (releaseMemFunc) { \ + releaseMemFunc(nullptr, false); \ + } \ + return api_ret; \ + }; \ + at_npu::native::OpCommand cmd; \ + cmd.Name(#aclnn_api); \ + cmd.SetCustomHandler(acl_call); \ + cmd.Run(); \ + if (unInitMemFunc) { \ + unInitMemFunc(nullptr, false); \ + } \ + } while (false) -- Gitee From c2c57ba44dd8decbf631e00dc7c7e09ebbef0535 Mon Sep 17 00:00:00 2001 From: zhanhao Date: Tue, 5 Dec 2023 15:18:58 +0800 Subject: [PATCH 4/6] support aclnn --- ads/common/__init__.py | 2 +- ads/common/ops/abs.py | 2 +- tests/test_abs.py | 63 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 tests/test_abs.py diff --git a/ads/common/__init__.py b/ads/common/__init__.py index 60441e1e..b4b8d293 100644 --- a/ads/common/__init__.py +++ b/ads/common/__init__.py @@ -1,2 +1,2 @@ from .ops.scatter_max import scatter_max -from .ops.abs import abs \ No newline at end of file +from .ops.abs import abs diff --git a/ads/common/ops/abs.py b/ads/common/ops/abs.py index d1fbd785..4b41e808 100644 --- a/ads/common/ops/abs.py +++ b/ads/common/ops/abs.py @@ -3,4 +3,4 @@ import torch import torch_npu import ads_c -abs = ads_c.abs \ No newline at end of file +abs = 
ads_c.abs

diff --git a/tests/test_abs.py b/tests/test_abs.py
new file mode 100644
index 00000000..583a5e35
--- /dev/null
+++ b/tests/test_abs.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import torch_npu
+
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestAbs(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.abs(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = ads.common.abs(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_abs_shape_format_fp16(self, device="npu"):
+        format_list = [0, 3]
+        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_abs_shape_format_fp32(self, device="npu"):
+        format_list = [0, 3]
+        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+if __name__ == "__main__":
+    run_tests()
-- Gitee

From f88019334528bca6f91f7ed09541ce9b365124be Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Tue, 5 Dec 2023 15:31:26 +0800
Subject: [PATCH 5/6] support aclnn

---
 tests/test_abs.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/tests/test_abs.py b/tests/test_abs.py
index 583a5e35..d06641c4 100644
--- a/tests/test_abs.py
+++ b/tests/test_abs.py
@@ -34,7 +34,7 @@ class TestAbs(TestCase):
 
     def test_abs_shape_format_fp16(self, device="npu"):
         format_list = [0, 3]
-        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        shape_list = [[5]]
         shape_format = [
             [np.float16, i, j] for i in format_list for j in shape_list
         ]
@@ -46,18 +46,6 @@ class TestAbs(TestCase):
             cpu_output = cpu_output.astype(np.float16)
             self.assertRtolEqual(cpu_output, npu_output)
 
-    def test_abs_shape_format_fp32(self, device="npu"):
-        format_list = [0, 3]
-        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
 
 if __name__ == "__main__":
     run_tests()
-- Gitee
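Note: AbsOpApi.cpp calls EXEC_NPU_CMD unconditionally and therefore requires the aclnn symbols to be present in libopapi.so. For an operator that needs a graceful fallback when the aclnn entry points are missing, OpApiCommon.h also provides DO_COMPATIBILITY, which this series defines but never uses. Below is a minimal sketch of how a follow-up operator could combine the two; the op name aclnnAdd and the fallback function are hypothetical and not part of this series:

#include <ATen/ATen.h>
#include "OpApiCommon.h"
#include "functions.h"

// Hypothetical pre-aclnn implementation, used when the symbols are absent.
at::Tensor add_fallback(const at::Tensor& self, const at::Tensor& other);

at::Tensor add(const at::Tensor& self, const at::Tensor& other)
{
    // Returns add_fallback(...) if aclnnAdd or aclnnAddGetWorkspaceSize
    // cannot be resolved from libopapi.so / libcust_opapi.so.
    DO_COMPATIBILITY(aclnnAdd, add_fallback(self, other));

    at::Tensor result = at::empty(self.sizes(), self.options());
    EXEC_NPU_CMD(aclnnAdd, self, other, result);
    return result;
}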
From 9f0cb5474d60d555bf670c5b4ea1aeb01018ec4a Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Tue, 5 Dec 2023 20:00:54 +0800
Subject: [PATCH 6/6] support aclnn

---
 ads/common/ops/csrc/OpApiCommon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ads/common/ops/csrc/OpApiCommon.h b/ads/common/ops/csrc/OpApiCommon.h
index f3d876b6..717543ac 100644
--- a/ads/common/ops/csrc/OpApiCommon.h
+++ b/ads/common/ops/csrc/OpApiCommon.h
@@ -566,7 +566,7 @@ typedef void (*ReleaseHugeMem)(void *, bool);
         auto workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte));                     \
         workspace_addr = const_cast<void *>(workspace_tensor.storage().data());                            \
     }                                                                                                      \
-    auto acl_call = [converted_params, workspace_addr, workspace_size, acl_stream, executor]() -> int {    \
+    auto acl_call = [converted_params, workspace_addr, workspace_size, acl_stream, executor]()->int {      \
         typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, const aclrtStream);                    \
         OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr);                                  \
         auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream);                    \
-- Gitee
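Note: the tuple machinery in OpApiCommon.h can be exercised without ACL or torch_npu at all. The self-contained sketch below shows the pattern EXEC_NPU_CMD relies on: pack the arguments into a tuple, derive a function-pointer type from the element types, and unpack with std::index_sequence. fakeGetWorkspaceSize is invented for illustration and merely stands in for an aclnnXxxGetWorkspaceSize entry point.

#include <cstdint>
#include <cstdio>
#include <tuple>
#include <utility>

template <typename Function, typename Tuple, size_t... I>
auto call(Function f, Tuple t, std::index_sequence<I...>) {
  return f(std::get<I>(t)...);
}

template <typename Function, typename Tuple>
auto call(Function f, Tuple t) {
  static constexpr auto size = std::tuple_size<Tuple>::value;
  return call(f, t, std::make_index_sequence<size>{});
}

// Stand-in for an aclnn workspace query: reports the required size through
// an out-pointer, analogous to aclnnAbsGetWorkspaceSize.
int fakeGetWorkspaceSize(int64_t n, uint64_t *workspace_size) {
  *workspace_size = static_cast<uint64_t>(n) * 32;
  return 0;
}

int main() {
  uint64_t workspace_size = 0;
  // EXEC_NPU_CMD builds the equivalent tuple via
  // ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr).
  auto params = std::make_tuple(int64_t{8}, &workspace_size);
  int status = call(fakeGetWorkspaceSize, params);
  std::printf("status=%d workspace=%llu bytes\n", status,
              (unsigned long long)workspace_size);
  return status;
}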