From bfc84a28fe0ec83d03790f8580bf106245b84316 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 15:29:00 +0800
Subject: [PATCH 1/8] support aclnn

---
 ads/common/ops/csrc/AbsOpApi.cpp  |  13 +
 ads/common/ops/csrc/OpApiCommon.h | 588 ++++++++++++++++++++++++++++++
 setup.py                          |   5 +
 3 files changed, 606 insertions(+)
 create mode 100644 ads/common/ops/csrc/AbsOpApi.cpp
 create mode 100644 ads/common/ops/csrc/OpApiCommon.h

diff --git a/ads/common/ops/csrc/AbsOpApi.cpp b/ads/common/ops/csrc/AbsOpApi.cpp
new file mode 100644
index 00000000..71c1e8d7
--- /dev/null
+++ b/ads/common/ops/csrc/AbsOpApi.cpp
@@ -0,0 +1,13 @@
+#include <ATen/ATen.h>
+#include "OpApiCommon.h"
+#include "functions.h"
+
+at::Tensor abs(const at::Tensor& self)
+{
+    // construct the output tensor of the NPU
+    at::Tensor result = at::empty(self.sizes(), self.options());
+
+    // calculate the output result of the NPU
+    EXEC_NPU_CMD(aclnnAbs, self, result);
+    return result;
+}
\ No newline at end of file
diff --git a/ads/common/ops/csrc/OpApiCommon.h b/ads/common/ops/csrc/OpApiCommon.h
new file mode 100644
index 00000000..b93a97a4
--- /dev/null
+++ b/ads/common/ops/csrc/OpApiCommon.h
@@ -0,0 +1,588 @@
+#include <dlfcn.h>
+#include <cstring>
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <acl/acl.h>
+#include <acl/acl_base.h>
+
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "torch_npu/csrc/framework/interface/EnvVariables.h"
+#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
+#include "torch_npu/csrc/framework/utils/OpPreparation.h"
+
+#define NPU_NAME_SPACE at_npu::native
+
+typedef struct aclOpExecutor aclOpExecutor;
+typedef struct aclTensor aclTensor;
+typedef struct aclScalar aclScalar;
+typedef struct aclIntArray aclIntArray;
+typedef struct aclFloatArray aclFloatArray;
+typedef struct aclBoolArray aclBoolArray;
+typedef struct aclTensorList aclTensorList;
+
+typedef aclTensor *(*_aclCreateTensor)(const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type,
+    const int64_t *stride, int64_t offset, aclFormat format, const int64_t *storage_dims, uint64_t storage_dims_num,
+    void *tensor_data);
+typedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type);
+typedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size);
+typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value, uint64_t size);
+typedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size);
+typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value, uint64_t size);
+
+typedef int (*_aclDestroyTensor)(const aclTensor *tensor);
+typedef int (*_aclDestroyScalar)(const aclScalar *scalar);
+typedef int (*_aclDestroyIntArray)(const aclIntArray *array);
+typedef int (*_aclDestroyFloatArray)(const aclFloatArray *array);
+typedef int (*_aclDestroyBoolArray)(const aclBoolArray *array);
+typedef int (*_aclDestroyTensorList)(const aclTensorList *array);
+
+constexpr int kHashBufSize = 8192;
+constexpr int kHashBufMaxSize = kHashBufSize + 1024;
+extern thread_local char g_hashBuf[kHashBufSize];
+extern thread_local int g_hashOffset;
+
+#define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \
+    _(at::ScalarType::Byte, ACL_UINT8) \
+    _(at::ScalarType::Char, ACL_INT8) \
+    _(at::ScalarType::Short, ACL_INT16) \
+    _(at::ScalarType::Int, ACL_INT32) \
+    _(at::ScalarType::Long, ACL_INT64) \
+    _(at::ScalarType::Half, ACL_FLOAT16) \
+    _(at::ScalarType::Float, ACL_FLOAT) \
+    _(at::ScalarType::Double, ACL_DOUBLE) \
+    _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::ComplexFloat, ACL_COMPLEX64) \
+    _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \
+    _(at::ScalarType::Bool, ACL_BOOL) \
+    _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::BFloat16, ACL_BF16) \
+    _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \
+    _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED)
+
+constexpr aclDataType kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(at::ScalarType::NumOptions) + 1] = {
+#define DEFINE_ENUM(_1, n) n,
+    AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM)
+#undef DEFINE_ENUM
+};
+
+#define GET_OP_API_FUNC(apiName) reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName))
+
+#define MEMCPY_TO_BUF(data_expression, size_expression) \
+    if (g_hashOffset + (size_expression) > kHashBufSize) { \
+        g_hashOffset = kHashBufMaxSize; \
+        return; \
+    } \
+    memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \
+    g_hashOffset += size_expression;
+
+inline const char *GetOpApiLibName(void)
+{
+    return "libopapi.so";
+}
+
+inline const char *GetCustOpApiLibName(void)
+{
+    return "libcust_opapi.so";
+}
+
+inline void *GetOpApiFuncAddrInLib(void *handler, const char *libName, const char *apiName)
+{
+    auto funcAddr = dlsym(handler, apiName);
+    if (funcAddr == nullptr) {
+        ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName, dlerror());
+    }
+    return funcAddr;
+}
+
+inline void *GetOpApiLibHandler(const char *libName)
+{
+    auto handler = dlopen(libName, RTLD_LAZY);
+    if (handler == nullptr) {
+        ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror());
+    }
+    return handler;
+}
+
+inline void *GetOpApiFuncAddr(const char *apiName)
+{
+    static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName());
+    if (custOpApiHandler != nullptr) {
+        auto funcAddr = GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName);
+        if (funcAddr != nullptr) {
+            return funcAddr;
+        }
+    }
+
+    static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName());
+    if (opApiHandler == nullptr) {
+        return nullptr;
+    }
+    return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName);
+}
+
+inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor)
+{
+    c10::Scalar expScalar;
+    const at::Tensor *aclInput = &tensor;
+    if (aclInput->scalar_type() == at::ScalarType::Double) {
+        double value = *(double *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::Long) {
+        int64_t value = *(int64_t *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::Float) {
+        float value = *(float *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::Int) {
+        int value = *(int *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::Half) {
+        c10::Half value = *(c10::Half *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::Bool) {
+        int8_t value = *(int8_t *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::ComplexDouble) {
+        c10::complex<double> value = *(c10::complex<double> *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) {
+        c10::complex<float> value = *(c10::complex<float> *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) {
+        c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr();
+        c10::Scalar scalar(value);
+        expScalar = scalar;
+    }
+    return expScalar;
+}
+
+inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor)
+{
+    at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory();
+    int deviceIndex = 0;
+    return cpuPinMemTensor.to(
+        c10::Device(at_npu::key::NativeDeviceType, deviceIndex), cpuPinMemTensor.scalar_type(), true, true);
+}
+
+inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type)
+{
+    return CopyTensorHostToDevice(scalar_to_tensor(cpu_scalar).to(scalar_data_type));
+}
+
+inline aclTensor *ConvertType(const at::Tensor &at_tensor)
+{
+    static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor);
+    if (aclCreateTensor == nullptr) {
+        return nullptr;
+    }
+
+    if (!at_tensor.defined()) {
+        return nullptr;
+    }
+    at::ScalarType scalar_data_type = at_tensor.scalar_type();
+    aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];
+    TORCH_CHECK(
+        acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported");
+    c10::SmallVector<int64_t, 5> storageDims;
+    // if acl_data_type is ACL_STRING, storageDims is empty.
+    auto itemsize = at_tensor.itemsize();
+    if (itemsize == 0) {
+        AT_ERROR("When ConvertType, tensor item size cannot be zero.");
+        return nullptr;
+    }
+    if (acl_data_type != ACL_STRING) {
+        storageDims.push_back(at_tensor.storage().nbytes() / itemsize);
+    }
+
+    const auto dimNum = at_tensor.sizes().size();
+    aclFormat format = ACL_FORMAT_ND;
+    switch (dimNum) {
+        case 3:
+            format = ACL_FORMAT_NCL;
+            break;
+        case 4:
+            format = ACL_FORMAT_NCHW;
+            break;
+        case 5:
+            format = ACL_FORMAT_NCDHW;
+            break;
+        default:
+            format = ACL_FORMAT_ND;
+    }
+
+    if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) {
+        c10::Scalar expScalar = ConvertTensorToScalar(at_tensor);
+        at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type);
+        return aclCreateTensor(aclInput.sizes().data(),
+            aclInput.sizes().size(),
+            acl_data_type,
+            aclInput.strides().data(),
+            aclInput.storage_offset(),
+            format,
+            storageDims.data(),
+            storageDims.size(),
+            const_cast<void *>(aclInput.storage().data()));
+    }
+
+    auto acl_tensor = aclCreateTensor(at_tensor.sizes().data(),
+        at_tensor.sizes().size(),
+        acl_data_type,
+        at_tensor.strides().data(),
+        at_tensor.storage_offset(),
+        format,
+        storageDims.data(),
+        storageDims.size(),
+        const_cast<void *>(at_tensor.storage().data()));
+    return acl_tensor;
+}
+
+inline aclScalar *ConvertType(const at::Scalar &at_scalar)
+{
+    static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar);
+    if (aclCreateScalar == nullptr) {
+        return nullptr;
+    }
+
+    at::ScalarType scalar_data_type = at_scalar.type();
+    aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalar_data_type)];
+    TORCH_CHECK(
+        acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported");
+    aclScalar *acl_scalar = nullptr;
+    switch (scalar_data_type) {
+        case at::ScalarType::Double: {
+            double value = at_scalar.toDouble();
+            acl_scalar = aclCreateScalar(&value, acl_data_type);
+            break;
+        }
+        case at::ScalarType::Long: {
+            int64_t value = at_scalar.toLong();
+            acl_scalar = aclCreateScalar(&value, acl_data_type);
+            break;
+        }
+        case at::ScalarType::Bool: {
+            bool value = at_scalar.toBool();
+            acl_scalar = aclCreateScalar(&value, acl_data_type);
+            break;
+        }
+        case at::ScalarType::ComplexDouble: {
+            auto value = at_scalar.toComplexDouble();
+            acl_scalar = aclCreateScalar(&value, acl_data_type);
+            break;
+        }
+        default:
+            acl_scalar = nullptr;
+            break;
+    }
+    return acl_scalar;
+}
+
+inline aclIntArray *ConvertType(const at::IntArrayRef &at_array)
+{
+    static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray);
+    if (aclCreateIntArray == nullptr) {
+        return nullptr;
+    }
+    auto array = aclCreateIntArray(at_array.data(), at_array.size());
+    return array;
+}
+
+template <std::size_t N>
+inline aclBoolArray *ConvertType(const std::array<bool, N> &value)
+{
+    static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
+    if (aclCreateBoolArray == nullptr) {
+        return nullptr;
+    }
+
+    auto array = aclCreateBoolArray(value.data(), value.size());
+    return array;
+}
+
+inline aclBoolArray *ConvertType(const at::ArrayRef<bool> &value)
+{
+    static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray);
+    if (aclCreateBoolArray == nullptr) {
+        return nullptr;
+    }
+
+    auto array = aclCreateBoolArray(value.data(), value.size());
+    return array;
+}
+
+inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list)
+{
+    static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList);
+    if (aclCreateTensorList == nullptr) {
+        return nullptr;
+    }
+
+    std::vector<const aclTensor *> tensor_list(at_tensor_list.size());
+    for (size_t i = 0; i < at_tensor_list.size(); i++) {
+        tensor_list[i] = ConvertType(at_tensor_list[i]);
+    }
+    auto acl_tensor_list = aclCreateTensorList(tensor_list.data(), tensor_list.size());
+    return acl_tensor_list;
+}
+
+inline aclTensor *ConvertType(const c10::optional<at::Tensor> &opt_tensor)
+{
+    if (opt_tensor.has_value() && opt_tensor.value().defined()) {
+        return ConvertType(opt_tensor.value());
+    }
+    return nullptr;
+}
+
+inline aclIntArray *ConvertType(const c10::optional<at::IntArrayRef> &opt_array)
+{
+    if (opt_array.has_value()) {
+        return ConvertType(opt_array.value());
+    }
+    return nullptr;
+}
+
+inline aclScalar *ConvertType(const c10::optional<at::Scalar> &opt_scalar)
+{
+    if (opt_scalar.has_value()) {
+        return ConvertType(opt_scalar.value());
+    }
+    return nullptr;
+}
+
+inline aclDataType ConvertType(const at::ScalarType scalarType)
+{
+    return kATenScalarTypeToAclDataTypeTable[static_cast<int64_t>(scalarType)];
+}
+
+template <typename T>
+T ConvertType(T value)
+{
+    return value;
+}
+
+template <typename Tuple, size_t... I>
+auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr, std::index_sequence<I...>)
+{
+    typedef int (*OpApiFunc)(typename std::decay<decltype(std::get<I>(params))>::type...);
+    auto func = reinterpret_cast<OpApiFunc>(opApiAddr);
+    return func;
+}
+
+template <typename Tuple>
+auto ConvertToOpApiFunc(const Tuple &params, void *opApiAddr)
+{
+    static constexpr auto size = std::tuple_size<Tuple>::value;
+    return ConvertToOpApiFunc(params, opApiAddr, std::make_index_sequence<size>{});
+}
+
+inline void Release(aclTensor *p)
+{
+    static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor);
+    if (aclDestroyTensor == nullptr) {
+        return;
+    }
+    aclDestroyTensor(p);
+}
+
+inline void Release(aclScalar *p)
+{
+    static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar);
+    if (aclDestroyScalar == nullptr) {
+        return;
+    }
+    aclDestroyScalar(p);
+}
+
+inline void Release(aclIntArray *p)
+{
+    static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray);
+    if (aclDestroyIntArray == nullptr) {
+        return;
+    }
+
+    aclDestroyIntArray(p);
+}
+
+inline void Release(aclBoolArray *p)
+{
+    static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray);
+    if (aclDestroyBoolArray == nullptr) {
+        return;
+    }
+
+    aclDestroyBoolArray(p);
+}
+
+inline void Release(aclTensorList *p)
+{
+    static const auto aclDestroyTensorList = GET_OP_API_FUNC(aclDestroyTensorList);
+    if (aclDestroyTensorList == nullptr) {
+        return;
+    }
+
+    aclDestroyTensorList(p);
+}
+
+template <typename T>
+void Release(T value)
+{
+    (void)value;
+}
+
+template <typename Tuple, size_t... I>
+void CallRelease(Tuple t, std::index_sequence<I...>)
+{
+    (void)std::initializer_list<int>{(Release(std::get<I>(t)), 0)...};
+}
+
+template <typename Tuple>
+void ReleaseConvertTypes(Tuple &t)
+{
+    static constexpr auto size = std::tuple_size<Tuple>::value;
+    CallRelease(t, std::make_index_sequence<size>{});
+}
+
+template <typename... Ts>
+constexpr auto ConvertTypes(Ts &...args)
+{
+    return std::make_tuple(ConvertType(args)...);
+}
+
+template <typename Function, typename Tuple, size_t... I>
+auto call(Function f, Tuple t, std::index_sequence<I...>)
+{
+    return f(std::get<I>(t)...);
+}
+
+template <typename Function, typename Tuple>
+auto call(Function f, Tuple t)
+{
+    static constexpr auto size = std::tuple_size<Tuple>::value;
+    return call(f, t, std::make_index_sequence<size>{});
+}
+
+template <std::size_t N>
+void AddParamToBuf(const std::array<bool, N> &value)
+{
+    MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool));
+}
+
+template <typename T>
+void AddParamToBuf(const T &value)
+{
+    MEMCPY_TO_BUF(&value, sizeof(T));
+}
+
+void AddParamToBuf(const at::Tensor &);
+void AddParamToBuf(const at::Scalar &);
+void AddParamToBuf(const at::IntArrayRef &);
+void AddParamToBuf(const at::ArrayRef<bool> &);
+void AddParamToBuf(const at::TensorList &);
+void AddParamToBuf(const c10::optional<at::Tensor> &);
+void AddParamToBuf(const c10::optional<at::IntArrayRef> &);
+void AddParamToBuf(const c10::optional<at::Scalar> &);
+void AddParamToBuf(const at::ScalarType);
+void AddParamToBuf(const std::string &);
+void AddParamToBuf();
+
+template <typename T, typename... Args>
+void AddParamToBuf(const T &arg, Args &...args)
+{
+    AddParamToBuf(arg);
+    AddParamToBuf(args...);
+}
+
+uint64_t CalcHashId();
+typedef int (*InitHugeMemThreadLocal)(void *, bool);
+typedef void (*UnInitHugeMemThreadLocal)(void *, bool);
+typedef void (*ReleaseHugeMem)(void *, bool);
+
+#define DO_COMPATIBILITY(aclnn_api, originCallExpression) \
+    do { \
+        static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \
+        static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \
+        if (getWorkspaceSizeFuncAddr == nullptr || opApiFuncAddr == nullptr) { \
+            ASCEND_LOGW("%s or %sGetWorkspaceSize not in %s, or %s not found. Will call %s", \
+                #aclnn_api, \
+                #aclnn_api, \
+                GetOpApiLibName(), \
+                GetOpApiLibName(), \
+                #originCallExpression); \
+            return originCallExpression; \
+        } \
+    } while (0)
+
+#define EXEC_NPU_CMD(aclnn_api, ...) \
+    do { \
+        static const auto getWorkspaceSizeFuncAddr = GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \
+        static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \
+        static const auto initMemAddr = GetOpApiFuncAddr("InitHugeMemThreadLocal"); \
+        static const auto unInitMemAddr = GetOpApiFuncAddr("UnInitHugeMemThreadLocal"); \
+        static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \
+        TORCH_CHECK(getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr, \
+            #aclnn_api, \
+            " or ", \
+            #aclnn_api "GetWorkspaceSize", \
+            " not in ", \
+            GetOpApiLibName(), \
+            ", or ", \
+            GetOpApiLibName(), \
+            " not found."); \
+        auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \
+        uint64_t workspace_size = 0; \
+        uint64_t *workspace_size_addr = &workspace_size; \
+        aclOpExecutor *executor = nullptr; \
+        aclOpExecutor **executor_addr = &executor; \
+        InitHugeMemThreadLocal initMemFunc = reinterpret_cast<InitHugeMemThreadLocal>(initMemAddr); \
+        UnInitHugeMemThreadLocal unInitMemFunc = reinterpret_cast<UnInitHugeMemThreadLocal>(unInitMemAddr); \
+        if (initMemFunc) { \
+            initMemFunc(nullptr, false); \
+        } \
+        auto converted_params = ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \
+        static auto getWorkspaceSizeFunc = ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \
+        auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \
+        TORCH_CHECK(workspace_status == 0, "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \
+        void *workspace_addr = nullptr; \
+        if (workspace_size != 0) { \
+            at::TensorOptions options = at::TensorOptions(torch_npu::utils::get_npu_device_type()); \
+            auto workspace_tensor = at::empty({workspace_size}, options.dtype(at::kByte)); \
+            workspace_addr = const_cast<void *>(workspace_tensor.storage().data()); \
+        } \
+        auto acl_call = [converted_params, workspace_addr, workspace_size, acl_stream, executor]() -> int { \
+            typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, const aclrtStream); \
+            OpApiFunc opApiFunc = reinterpret_cast<OpApiFunc>(opApiFuncAddr); \
+            auto api_ret = opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \
+            TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \
+            ReleaseConvertTypes(converted_params); \
+            ReleaseHugeMem releaseMemFunc = reinterpret_cast<ReleaseHugeMem>(releaseMemAddr); \
+            if (releaseMemFunc) { \
+                releaseMemFunc(nullptr, false); \
+            } \
+            return api_ret; \
+        }; \
+        at_npu::native::OpCommand cmd; \
+        cmd.Name(#aclnn_api); \
+        cmd.SetCustomHandler(acl_call); \
+        cmd.Run(); \
+        if (unInitMemFunc) { \
+            unInitMemFunc(nullptr, false); \
+        } \
+    } while (false)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2ff09279..dd652ae7 100644
--- a/setup.py
+++ b/setup.py
@@ -8,10 +8,15 @@
 source_file = []
 source_file += glob.glob(os.path.join("./ads/common/ops/csrc/", "*.cpp"))
 source_file += glob.glob(os.path.join("./bind/", "*.cpp"))
+torch_npu_dir = extension.PYTORCH_NPU_INSTALL_PATH
+include_dirs = []
+include_dirs.append(torch_npu_dir + "/include/third_party/acl/inc/")
+
 
 exts = []
 ext1 = extension.NpuExtension(
     name="ads_c",
     sources=source_file,
+    include_dirs=include_dirs,
 )
 exts.append(ext1)
-- 
Gitee

From 310c25880ea58d63faac723817c3ccc93851a350 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 15:53:44 +0800
Subject: [PATCH 2/8] support aclnn

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index dd652ae7..81c2d51c 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@
 ext1 = extension.NpuExtension(
     name="ads_c",
     sources=source_file,
     include_dirs=include_dirs,
+    extra_compile_args=['-D__FILENAME__=\"$$(notdir $$(abspath $$<))\"',],
 )
 exts.append(ext1)
-- 
Gitee

From a32f20d18ed7fd06e922ce2f9d04520ee00fb031 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 15:57:03 +0800
Subject: [PATCH 3/8] support aclnn

---
 ads/common/ops/csrc/AbsOpApi.cpp | 2 +-
 ads/common/ops/csrc/functions.h  | 3 ++-
 ads/common/ops/csrc/pybind.cpp   | 2 ++
 ads/common/ops/npu_abs.py        | 5 +++++
 4 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 ads/common/ops/npu_abs.py

diff --git a/ads/common/ops/csrc/AbsOpApi.cpp b/ads/common/ops/csrc/AbsOpApi.cpp
index 71c1e8d7..99963302 100644
--- a/ads/common/ops/csrc/AbsOpApi.cpp
+++ b/ads/common/ops/csrc/AbsOpApi.cpp
@@ -2,7 +2,7 @@
 #include "OpApiCommon.h"
 #include "functions.h"
 
-at::Tensor abs(const at::Tensor& self)
+at::Tensor npu_abs(const at::Tensor& self)
 {
     // construct the output tensor of the NPU
     at::Tensor result = at::empty(self.sizes(), self.options());
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index daa82f4d..243774ab 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -41,4 +41,5 @@ at::Tensor npu_yolo_boxes_encode(
 at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim);
 at::Tensor npu_rotary_mul(const at::Tensor &self, const at::Tensor &r1, const at::Tensor &r2);
 at::Tensor npu_silu(const at::Tensor& self);
-at::Tensor& npu_silu_(at::Tensor& self);
\ No newline at end of file
+at::Tensor& npu_silu_(at::Tensor& self);
+at::Tensor npu_abs(const at::Tensor& self);
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index c4383ac8..4eb1cf6f 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -41,4 +41,6 @@ void init_common(pybind11::module &m)
 
     // rotary mul
     m.def("npu_rotary_mul", &npu_rotary_mul);
+
+    m.def("npu_abs", &npu_abs);
 }
diff --git a/ads/common/ops/npu_abs.py b/ads/common/ops/npu_abs.py
new file mode 100644
index 00000000..72eb3a0a
--- /dev/null
+++ b/ads/common/ops/npu_abs.py
@@ -0,0 +1,5 @@
+import torch
+import torch_npu
+import ads_c
+
+npu_abs = ads_c.npu_abs
\ No newline at end of file
-- 
Gitee

From cbd704b5587f3c5526d4fc444e6cc6eb871465c1 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 16:00:44 +0800
Subject: [PATCH 4/8] support aclnn

---
 tests/test_abs.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 tests/test_abs.py

diff --git a/tests/test_abs.py b/tests/test_abs.py
new file mode 100644
index 00000000..d06641c4
--- /dev/null
+++ b/tests/test_abs.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import torch_npu
+
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestAbs(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.abs(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = ads.common.abs(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_abs_shape_format_fp16(self, device="npu"):
+        format_list = [0, 3]
+        shape_list = [[5]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+if __name__ == "__main__":
+    run_tests()
-- 
Gitee

From 525905a4a0c838da041db9048020afe7b5b98c9a Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 16:01:53 +0800
Subject: [PATCH 5/8] support aclnn

---
 tests/test_abs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_abs.py b/tests/test_abs.py
index d06641c4..a8aa5960 100644
--- a/tests/test_abs.py
+++ b/tests/test_abs.py
@@ -27,7 +27,7 @@ class TestAbs(TestCase):
         return output
 
     def npu_op_exec(self, input1):
-        output = ads.common.abs(input1)
+        output = ads.common.npu_abs(input1)
         output = output.to("cpu")
         output = output.numpy()
         return output
-- 
Gitee

From 82401d7926ec99b2b3703f6ded7d3130875d2816 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 16:02:58 +0800
Subject: [PATCH 6/8] support aclnn

---
 ads/common/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index ec5a1f3a..30ad1c8c 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -12,4 +12,5 @@ from .ops.yolo_boxes_encode import npu_yolo_boxes_encode
 from .ops.scatter import npu_scatter
 from .ops.silu import npu_silu
 from .ops.silu import npu_silu_
-from .ops.rotary_mul import npu_rotary_mul
\ No newline at end of file
+from .ops.rotary_mul import npu_rotary_mul
+from .ops.npu_abs import npu_abs
\ No newline at end of file
-- 
Gitee

From 61dc82d378c9e97fba8cd1fe3408535e46afea2a Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 16:24:25 +0800
Subject: [PATCH 7/8] support aclnn

---
 ads/common/__init__.py            | 2 +-
 ads/common/ops/csrc/AbsOpApi.cpp  | 2 +-
 ads/common/ops/csrc/OpApiCommon.h | 2 +-
 ads/common/ops/npu_abs.py         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index 30ad1c8c..c4dcc62e 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -13,4 +13,4 @@ from .ops.scatter import npu_scatter
 from .ops.silu import npu_silu
 from .ops.silu import npu_silu_
 from .ops.rotary_mul import npu_rotary_mul
-from .ops.npu_abs import npu_abs
\ No newline at end of file
+from .ops.npu_abs import npu_abs
diff --git a/ads/common/ops/csrc/AbsOpApi.cpp b/ads/common/ops/csrc/AbsOpApi.cpp
index 99963302..824b60ce 100644
--- a/ads/common/ops/csrc/AbsOpApi.cpp
+++ b/ads/common/ops/csrc/AbsOpApi.cpp
@@ -10,4 +10,4 @@ at::Tensor npu_abs(const at::Tensor& self)
     // calculate the output result of the NPU
     EXEC_NPU_CMD(aclnnAbs, self, result);
     return result;
-}
\ No newline at end of file
+}
diff --git a/ads/common/ops/csrc/OpApiCommon.h b/ads/common/ops/csrc/OpApiCommon.h
index b93a97a4..717543ac 100644
--- a/ads/common/ops/csrc/OpApiCommon.h
+++ b/ads/common/ops/csrc/OpApiCommon.h
@@ -585,4 +585,4 @@ typedef void (*ReleaseHugeMem)(void *, bool);
         if (unInitMemFunc) { \
             unInitMemFunc(nullptr, false); \
         } \
-    } while (false)
\ No newline at end of file
+    } while (false)
diff --git a/ads/common/ops/npu_abs.py b/ads/common/ops/npu_abs.py
index 72eb3a0a..62f02b84 100644
--- a/ads/common/ops/npu_abs.py
+++ b/ads/common/ops/npu_abs.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_abs = ads_c.npu_abs
\ No newline at end of file
+npu_abs = ads_c.npu_abs
-- 
Gitee

From 04dd7155f5a7d26e895903972fa519683f74a5a1 Mon Sep 17 00:00:00 2001
From: zhanhao
Date: Thu, 7 Dec 2023 18:51:16 +0800
Subject: [PATCH 8/8] support aclnn

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 81c2d51c..13484c88 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 ext1 = extension.NpuExtension(
     name="ads_c",
     sources=source_file,
     include_dirs=include_dirs,
-    extra_compile_args=['-D__FILENAME__=\"$$(notdir $$(abspath $$<))\"',],
+    extra_compile_args=['-D__FILENAME__=\"$$(notdir $$(abspath $$<))\"'],
 )
 exts.append(ext1)
-- 
Gitee
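
Reviewer note: taken together, the series builds the aclnn-backed extension (patches 1-2), exposes it to Python as ads.common.npu_abs (patches 3, 6-7), and covers it with tests/test_abs.py (patches 4-5, 8). As a quick orientation, a minimal smoke check of the resulting binding might look like the sketch below. It is an illustration, not part of the series: it assumes the extension has been rebuilt via setup.py on a machine with an available NPU device, and it mirrors the fp16-vs-float32 comparison used in tests/test_abs.py.

    import numpy as np
    import torch
    import torch_npu   # registers the NPU backend used by the extension
    import ads.common  # exposes the npu_abs binding added by this series

    x_cpu = torch.randn(5)      # float32 reference input on the CPU
    x_npu = x_cpu.half().npu()  # fp16 copy on the NPU, as in the test

    expected = torch.abs(x_cpu).numpy().astype(np.float16)  # CPU reference
    actual = ads.common.npu_abs(x_npu).cpu().numpy()        # aclnnAbs via EXEC_NPU_CMD

    np.testing.assert_allclose(actual, expected, rtol=1e-3)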