diff --git a/ads/common/ops/csrc/VecPoolBackwardKernelNpu.cpp b/ads/common/ops/csrc/VecPoolBackwardKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8166d86b74dc534e05e311f9f45feda3c107a76e
--- /dev/null
+++ b/ads/common/ops/csrc/VecPoolBackwardKernelNpu.cpp
@@ -0,0 +1,31 @@
+// Copyright (c) 2024 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include "csrc/OpApiCommon.h"
+#include "functions.h"
+
+at::Tensor vec_pool_backward(const at::Tensor &grad_new_features,
+                             const at::Tensor &point_cnt_of_grid,
+                             const at::Tensor &grouped_idxs,
+                             const int32_t n,
+                             const int32_t num_c_in)
+{
+    auto output_size = {static_cast<int64_t>(n), static_cast<int64_t>(num_c_in)};
+    at::Tensor out = at::zeros(output_size, grad_new_features.options());
+    EXEC_NPU_CMD(aclnnVecPoolGrad, grad_new_features, point_cnt_of_grid, grouped_idxs, n, num_c_in, out);
+    return out;
+}
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index 8ca1e3094aa76ee17bb8af5d7ec18309af295bfb..3b8464e7305de29274c98f8fb0ad5c71742fa0d4 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -163,4 +163,10 @@ at::Tensor DynamicVoxelization(
 std::tuple<at::Tensor, at::Tensor> nms3d_normal(const at::Tensor &boxes, double nms_overlap_thresh);
 std::tuple<at::Tensor, at::Tensor> nms3d(const at::Tensor &boxes, double threshold);
+
+at::Tensor vec_pool_backward(const at::Tensor &grad_new_features,
+                             const at::Tensor &point_cnt_of_grid,
+                             const at::Tensor &grouped_idxs,
+                             const int32_t n,
+                             const int32_t num_c_in);
 
 #endif // COMMON_OPS_CSRC_FUNCTIONS_H_
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index 985d8fbd2feae1b298d19cd8c330acbf5611ec8a..c1aff56abb77ba33af5bc22311a0b7c7173348d6 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -100,4 +100,7 @@ void init_common(pybind11::module &m)
 
     // ads_nms3d
     m.def("nms3d", &nms3d);
+
+    // vec_pool
+    m.def("vec_pool_backward", &vec_pool_backward);
 }
diff --git a/ads/common/ops/kernels/op_host/vec_pool_grad.cpp b/ads/common/ops/kernels/op_host/vec_pool_grad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9c875b086345fb810735336268d445919b3ec17
--- /dev/null
+++ b/ads/common/ops/kernels/op_host/vec_pool_grad.cpp
@@ -0,0 +1,155 @@
+#include "vec_pool_grad_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+
+using namespace std;
+
+namespace optiling {
+static int32_t GetCeilInt(int32_t value1, int32_t value2)
+{
+    if (value2 == 0) {
+        return value1;
+    }
+    return static_cast<int32_t>((value1 + value2 - 1) / value2);
+}
+
+static ge::graphStatus VecPoolGradTilingFunc(gert::TilingContext *context)
+{
+    VecPoolGradTilingData tiling;
+    auto platformInfo = context->GetPlatformInfo();
+    if (platformInfo == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
+    static uint32_t coreNum = ascendcPlatform.GetCoreNumAiv();
+
+    auto groupedIdxsShape = context->GetInputTensor(2)->GetStorageShape();
+    uint32_t numMaxSumPoints = groupedIdxsShape.GetDim(0);
+    uint32_t formerCoreGroups = GetCeilInt(numMaxSumPoints, coreNum);                  // groups per main core
+    uint32_t usedCoreNum = GetCeilInt(numMaxSumPoints, formerCoreGroups);              // number of cores actually used
+    uint32_t lastCoreGroups = numMaxSumPoints - (usedCoreNum - 1) * formerCoreGroups;  // groups on the tail core
+    if (formerCoreGroups == 0) {
+        return ge::GRAPH_FAILED;
+    }
+    uint32_t formerCoreData = formerCoreGroups * 3 * 4;
+    uint32_t lastCoreData = lastCoreGroups * 3 * 4;
+
+    auto gradNewFeaturesShape = context->GetInputTensor(0)->GetStorageShape();
+    auto pointCntOfGridShape = context->GetInputTensor(1)->GetStorageShape();
+    uint32_t cOut = gradNewFeaturesShape.GetDim(1);
+    uint32_t numTotalGrids = pointCntOfGridShape.GetDim(1);
+    uint32_t numCEachGrid = cOut / numTotalGrids;
+    uint64_t pointCntOfGridUbSize = 32;
+    uint64_t gradNewFeaturesUbSize = GetCeilInt(numCEachGrid * sizeof(float), 32) * 32;
+    uint64_t gradSupportFeaturesUbSize = gradNewFeaturesUbSize;
+    uint64_t usedUbSize = pointCntOfGridUbSize + gradNewFeaturesUbSize + gradSupportFeaturesUbSize;
+
+    uint64_t availableUbSize;
+    auto platformInfoptr = context->GetPlatformInfo();
+    if (platformInfoptr == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    auto ascendplatformInfo = platform_ascendc::PlatformAscendC(platformInfoptr);
+    ascendplatformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, availableUbSize);
+    availableUbSize = (availableUbSize - usedUbSize - 20 * 1024) / 96 * 96;  // UB bytes left for groupedIdxs, i.e. the main-block size, rounded down to a multiple of 96 bytes
+    uint32_t formerTilingNum = GetCeilInt(formerCoreData, availableUbSize);
+    uint32_t mainGroups = availableUbSize / 4 / 3;
+    uint32_t copyTail = formerCoreData % availableUbSize;
+    uint32_t formerTailGroups = copyTail / 4 / 3;
+    uint32_t lastTilingNum = GetCeilInt(lastCoreData, availableUbSize);
+    uint32_t lastCopyTail = lastCoreData % availableUbSize;
+    uint32_t lastTailGroups = lastCopyTail / 4 / 3;
+
+    context->SetBlockDim(usedCoreNum);
+    uint32_t m = gradNewFeaturesShape.GetDim(0);
+    auto attrs = context->GetAttrs();
+    if (attrs == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    uint32_t n = *(attrs->GetAttrPointer<int32_t>(0));
+    uint32_t cIn = *(attrs->GetAttrPointer<int32_t>(1));
+    tiling.set_formerCoreGroups(formerCoreGroups);
+    tiling.set_formerCoreData(formerCoreData);
+    tiling.set_usedCoreNum(usedCoreNum);
+    tiling.set_availableUbSize(availableUbSize);      // UB bytes reserved for groupedIdxs
+    tiling.set_mainGroups(mainGroups);
+    tiling.set_copyLoop(formerTilingNum - 1);         // tilingNum - 1 on the main cores
+    tiling.set_copyTail(copyTail);                    // tail-block size on the main cores
+    tiling.set_formerTailGroups(formerTailGroups);    // groups in the tail block of the main cores
+    tiling.set_lastCopyLoop(lastTilingNum - 1);       // tilingNum - 1 on the tail core
+    tiling.set_lastCopyTail(lastCopyTail);            // tail-block size on the tail core
+    tiling.set_lastTailGroups(lastTailGroups);        // groups in the tail block of the tail core
+    tiling.set_m(m);
+    tiling.set_cOut(cOut);
+    tiling.set_numTotalGrids(numTotalGrids);
+    tiling.set_numCEachGrid(numCEachGrid);
+    tiling.set_numMaxSumPoints(numMaxSumPoints);
+    tiling.set_n(n);
+    tiling.set_cIn(cIn);
+
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+}
+
+namespace ge {
+static ge::graphStatus InferShape(gert::InferShapeContext* context)
+{
+    auto attrs = context->GetAttrs();
+    if (attrs == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    const int32_t* n = attrs->GetAttrPointer<int32_t>(0);
+    const int32_t* cIn = attrs->GetAttrPointer<int32_t>(1);
+
+    gert::Shape* gradSupportFeaturesShape = context->GetOutputShape(0);
+    if (gradSupportFeaturesShape == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    gradSupportFeaturesShape->AppendDim(*n);
+    gradSupportFeaturesShape->AppendDim(*cIn);
+    return GRAPH_SUCCESS;
+}
+}
+
+namespace ops {
+class VecPoolGrad : public OpDef {
+public:
+    explicit VecPoolGrad(const char* name) : OpDef(name)
+    {
+        this->Input("grad_new_features")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Input("point_cnt_of_grid")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Input("grouped_idxs")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Output("grad_support_features")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Attr("n").Int();
+        this->Attr("num_c_in").Int();
+
+        this->SetInferShape(ge::InferShape);
+
+        this->AICore()
+            .SetTiling(optiling::VecPoolGradTilingFunc);
+        this->AICore().AddConfig("ascend910b");
+    }
+};
+
+OP_ADD(VecPoolGrad);
+}
diff --git a/ads/common/ops/kernels/op_host/vec_pool_grad_tiling.h b/ads/common/ops/kernels/op_host/vec_pool_grad_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..d95e61a669161b4f54a3a850f9577b5b934057fb
--- /dev/null
+++ b/ads/common/ops/kernels/op_host/vec_pool_grad_tiling.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#ifndef VEC_POOL_GRAD_TILING_H
+#define VEC_POOL_GRAD_TILING_H
+
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(VecPoolGradTilingData)
+    TILING_DATA_FIELD_DEF(uint32_t, formerCoreGroups)
+    TILING_DATA_FIELD_DEF(uint32_t, formerCoreData)
+    TILING_DATA_FIELD_DEF(uint32_t, usedCoreNum)
+    TILING_DATA_FIELD_DEF(uint32_t, availableUbSize)
+    TILING_DATA_FIELD_DEF(uint32_t, mainGroups)
+    TILING_DATA_FIELD_DEF(uint32_t, copyLoop)
+    TILING_DATA_FIELD_DEF(uint32_t, copyTail)
+    TILING_DATA_FIELD_DEF(uint32_t, formerTailGroups)
+    TILING_DATA_FIELD_DEF(uint32_t, lastCopyLoop)
+    TILING_DATA_FIELD_DEF(uint32_t, lastCopyTail)
+    TILING_DATA_FIELD_DEF(uint32_t, lastTailGroups)
+    TILING_DATA_FIELD_DEF(uint32_t, m)
+    TILING_DATA_FIELD_DEF(uint32_t, cOut)
+    TILING_DATA_FIELD_DEF(uint32_t, numTotalGrids)
+    TILING_DATA_FIELD_DEF(uint32_t, numCEachGrid)
+    TILING_DATA_FIELD_DEF(uint32_t, numMaxSumPoints)
+    TILING_DATA_FIELD_DEF(uint32_t, n)
+    TILING_DATA_FIELD_DEF(uint32_t, cIn)
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(VecPoolGrad, VecPoolGradTilingData)
+} // namespace optiling
+
+#endif // VEC_POOL_GRAD_TILING_H
diff --git a/ads/common/ops/kernels/op_kernel/vec_pool_grad.cpp b/ads/common/ops/kernels/op_kernel/vec_pool_grad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4ca3dddcbc0daaed441c1555d51b9d1a656eb19
--- /dev/null
+++ b/ads/common/ops/kernels/op_kernel/vec_pool_grad.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+ *
+ */
+
+#include "kernel_operator.h"
+using namespace AscendC;
+
+constexpr int32_t M = 10;
+constexpr int32_t NUM_TOTAL_GRIDS = 3;
+constexpr int32_t C_OUT = 26;
+constexpr int32_t N = 15;
+constexpr int32_t C_IN = 31;
+constexpr int32_t NUM_MAX_SUM_POINTS = 24;
+constexpr int32_t NUM_C_EACH_GRID = C_OUT / NUM_TOTAL_GRIDS;
+
+constexpr int32_t TOTAL_LENGTH = NUM_MAX_SUM_POINTS * 3;
+constexpr int32_t USE_CORE_NUM = 1;
+constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM;
+constexpr int32_t TILE_NUM = 1;
+constexpr int32_t BUFFER_NUM = 1;
+constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM;
+
+
+class KernelVecPoolGrad {
+public:
+    __aicore__ inline KernelVecPoolGrad() {}
+
+    __aicore__ inline void Init(GM_ADDR grad_new_features, GM_ADDR point_cnt_of_grid,
+                                GM_ADDR grouped_idxs, GM_ADDR grad_support_features,
+                                VecPoolGradTilingData *tiling_data)
+    {
+        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
+        this->formerCoreGroups = tiling_data->formerCoreGroups;  // groups on each main core
+        this->formerCoreData = tiling_data->formerCoreData;      // data per main core, in bytes
+        this->usedCoreNum = tiling_data->usedCoreNum;            // number of cores in use
+        this->availableUbSize = tiling_data->availableUbSize;    // UB bytes reserved for grouped_idxs
+        this->mainGroups = tiling_data->mainGroups;              // groups per main block
+        this->copyLoop = tiling_data->copyLoop;                  // number of main blocks (loop count) on a main core
+        this->copyTail = tiling_data->copyTail;                  // bytes in the tail block of a main core
+        this->formerTailGroups = tiling_data->formerTailGroups;  // groups in the tail block of a main core
+        this->lastCopyLoop = tiling_data->lastCopyLoop;          // number of main blocks (loop count) on the tail core
+        this->lastCopyTail = tiling_data->lastCopyTail;          // bytes in the tail block of the tail core
+        this->lastTailGroups = tiling_data->lastTailGroups;      // groups in the tail block of the tail core
+        this->m = tiling_data->m;
+        this->cOut = tiling_data->cOut;
+        this->numTotalGrids = tiling_data->numTotalGrids;
+        this->numCEachGrid = tiling_data->numCEachGrid;
+        this->numMaxSumPoints = tiling_data->numMaxSumPoints;
+        this->n = tiling_data->n;
+        this->cIn = tiling_data->cIn;
+
+        gradNewFeaturesGM.SetGlobalBuffer((__gm__ float*)grad_new_features, this->m * this->cOut);
+        pointCntOfGridGM.SetGlobalBuffer((__gm__ int32_t*)point_cnt_of_grid, this->m * this->numTotalGrids);
+        groupedIdxsGM.SetGlobalBuffer((__gm__ int32_t*)grouped_idxs, this->numMaxSumPoints * 3);
+        gradSupportFeaturesGM.SetGlobalBuffer((__gm__ float*)grad_support_features, this->n * this->cIn);
+
+        pipe.InitBuffer(inQueueGroupedIdxs, BUFFER_NUM, availableUbSize);
+        pipe.InitBuffer(inQueuePointCntOfGrid, BUFFER_NUM, 32);
+        pipe.InitBuffer(inQueueGradNewFeatures, BUFFER_NUM, this->numCEachGrid * sizeof(float));
+        pipe.InitBuffer(outQueueGradSupportFeatures, BUFFER_NUM, this->numCEachGrid * sizeof(float));
+    }
+
+    __aicore__ inline void Process()
+    {
+        uint32_t coreId = GetBlockIdx();
+        if (coreId >= this->usedCoreNum) {
+            return;
+        }
+        if (coreId != (this->usedCoreNum - 1)) {  // main cores
+            for (int32_t i = 0; i < this->copyLoop; i++) {  // main blocks
+                // group counts for the main/tail blocks of the main and tail cores are precomputed by the tiling function
+                CopyIn(i, this->mainGroups);
+                Compute(this->mainGroups);
+            }
+            if (this->copyTail != 0) {  // tail block
+                CopyIn(this->copyLoop, this->formerTailGroups);
+                Compute(this->formerTailGroups);
+            }
+        } else {  // tail core
+            for (int32_t i = 0; i < this->lastCopyLoop; i++) {  // main blocks
+                CopyIn(i, this->mainGroups);
+                Compute(this->mainGroups);
+            }
+            if (this->lastCopyTail != 0) {  // tail block
+                CopyIn(this->lastCopyLoop, this->lastTailGroups);
+                Compute(this->lastTailGroups);
+            }
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress, uint32_t numGroups)
+    {
+        LocalTensor<int32_t> groupedIdxsLocal = inQueueGroupedIdxs.AllocTensor<int32_t>();
+        DataCopyExtParams groupedIdxsCopyParams{1, static_cast<uint32_t>(numGroups * 3 * sizeof(int32_t)), 0, 0, 0};
+        DataCopyPadExtParams<int32_t> groupedIdxsPadParams{false, 0, 0, 0};
+        // GM offsets are in int32 elements: this core starts at formerCoreGroups * 3, and every completed block covers mainGroups * 3
+        DataCopyPad(groupedIdxsLocal, groupedIdxsGM[GetBlockIdx() * this->formerCoreGroups * 3 + progress * this->mainGroups * 3], groupedIdxsCopyParams, groupedIdxsPadParams);
+        inQueueGroupedIdxs.EnQue(groupedIdxsLocal);
+    }
+
+    __aicore__ inline void Compute(uint32_t numGroups)
+    {
+        LocalTensor<int32_t> groupedIdxsLocal = inQueueGroupedIdxs.DeQue<int32_t>();
+        LocalTensor<int32_t> numTotalPtsLocal = inQueuePointCntOfGrid.AllocTensor<int32_t>();
+        LocalTensor<float> gradNewFeaturesLocal = inQueueGradNewFeatures.AllocTensor<float>();
+        LocalTensor<float> gradSupportFeaturesLocal = outQueueGradSupportFeatures.AllocTensor<float>();
+        event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->AllocEventID<HardEvent::V_MTE3>());
+        for (int32_t i = 0; i < numGroups; i++) {
+            int32_t idxOfSupportXyz = groupedIdxsLocal.GetValue(i * 3);
+            int32_t idxOfNewXyz = groupedIdxsLocal.GetValue(i * 3 + 1);
+            int32_t idxOfGridIdx = groupedIdxsLocal.GetValue(i * 3 + 2);
+
+            DataCopyExtParams pointCntOfGridCopyParams{1, sizeof(int32_t), 0, 0, 0};
+            DataCopyPadExtParams<int32_t> pointCntOfGridPadParams{false, 0, 0, 0};
+            DataCopyPad(numTotalPtsLocal, pointCntOfGridGM[idxOfNewXyz * this->numTotalGrids + idxOfGridIdx], pointCntOfGridCopyParams, pointCntOfGridPadParams);
+            int32_t num_total_pts = numTotalPtsLocal.GetValue(0);
+
+            float cur_grad = 1 / max(static_cast<float>(num_total_pts), 1.f);
+
+            DataCopyExtParams gradNewFeaturesCopyParams{1, static_cast<uint32_t>(this->numCEachGrid * sizeof(float)), 0, 0, 0};
+            DataCopyPadExtParams<float> gradNewFeaturesPadParams{false, 0, 0, 0};
+            DataCopyPad(gradNewFeaturesLocal, gradNewFeaturesGM[idxOfNewXyz * this->cOut + idxOfGridIdx * this->numCEachGrid], gradNewFeaturesCopyParams, gradNewFeaturesPadParams);
+
+            Muls(gradSupportFeaturesLocal, gradNewFeaturesLocal, cur_grad, this->numCEachGrid);
+
+            SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
+            WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
+            int32_t repeatTimes = this->cIn / this->numCEachGrid;
+            for (int32_t j = 0; j < repeatTimes; j++) {
+                SetAtomicAdd<float>();
+                DataCopyExtParams copyParams{1, static_cast<uint32_t>(this->numCEachGrid * sizeof(float)), 0, 0, 0};
+                DataCopyPad(gradSupportFeaturesGM[idxOfSupportXyz * this->cIn + j * this->numCEachGrid], gradSupportFeaturesLocal, copyParams);
+                SetAtomicNone();
+            }
+            int32_t tail = this->cIn % this->numCEachGrid;
+            if (tail != 0) {
+                SetAtomicAdd<float>();
+                DataCopyExtParams copyTailParams{1, static_cast<uint32_t>(tail * sizeof(float)), 0, 0, 0};
+                DataCopyPad(gradSupportFeaturesGM[idxOfSupportXyz * this->cIn + repeatTimes * this->numCEachGrid], gradSupportFeaturesLocal, copyTailParams);
+                SetAtomicNone();
+            }
+        }
+        inQueueGroupedIdxs.FreeTensor(groupedIdxsLocal);
+        inQueuePointCntOfGrid.FreeTensor(numTotalPtsLocal);
+        inQueueGradNewFeatures.FreeTensor(gradNewFeaturesLocal);
+        outQueueGradSupportFeatures.FreeTensor(gradSupportFeaturesLocal);
+    }
+
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inQueueGradNewFeatures, inQueuePointCntOfGrid, inQueueGroupedIdxs;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueGradSupportFeatures;
+    GlobalTensor<float> gradNewFeaturesGM, gradSupportFeaturesGM;
+    GlobalTensor<int32_t> pointCntOfGridGM, groupedIdxsGM;
+
+    uint32_t formerCoreGroups;
+    uint32_t formerCoreData;
+    uint32_t usedCoreNum;
+    uint32_t availableUbSize;
+    uint32_t mainGroups;
+    uint32_t copyLoop;
+    uint32_t copyTail;
+    uint32_t formerTailGroups;
+    uint32_t lastCopyLoop;
+    uint32_t lastCopyTail;
+    uint32_t lastTailGroups;
+    uint32_t m;
+    uint32_t cOut;
+    uint32_t numTotalGrids;
+    uint32_t numCEachGrid;
+    uint32_t numMaxSumPoints;
+    uint32_t n;
+    uint32_t cIn;
+};
+
+
+extern "C" __global__ __aicore__ void vec_pool_grad(GM_ADDR grad_new_features, GM_ADDR point_cnt_of_grid, GM_ADDR grouped_idxs, GM_ADDR grad_support_features, GM_ADDR workspace, GM_ADDR tiling)
+{
+    GET_TILING_DATA(tiling_data, tiling);
+    KernelVecPoolGrad op;
+    op.Init(grad_new_features, point_cnt_of_grid, grouped_idxs, grad_support_features, &tiling_data);
+    op.Process();
+}
+
+
+#ifndef __CCE_KT_TEST__
+// call of kernel function
+void vec_pool_grad_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* grad_new_features, uint8_t* point_cnt_of_grid, uint8_t* grouped_idxs, uint8_t* grad_support_features, uint8_t* workspace, uint8_t* tiling)
+{
+    vec_pool_grad<<<blockDim, l2ctrl, stream>>>(grad_new_features, point_cnt_of_grid, grouped_idxs, grad_support_features, workspace, tiling);
+}
+#endif
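For reviewers, below is a minimal host-side sketch of how the new `vec_pool_backward` entry point could be exercised; it is not part of the patch. The shapes mirror the illustrative constants in the kernel file (M = 10, C_OUT = 26, NUM_TOTAL_GRIDS = 3, NUM_MAX_SUM_POINTS = 24, n = 15, num_c_in = 31); the `main` harness and CPU tensor construction are assumptions, and on a real build the tensors would need to live on the NPU device for `EXEC_NPU_CMD(aclnnVecPoolGrad, ...)` to succeed.

```cpp
// Hypothetical usage sketch; only the vec_pool_backward declaration comes from this patch.
#include <ATen/ATen.h>
#include "functions.h"

int main()
{
    const int32_t n = 15;         // attr "n": number of support points
    const int32_t num_c_in = 31;  // attr "num_c_in": channels of grad_support_features
    // grad_new_features: (M, C_out) float32
    at::Tensor grad_new_features = at::randn({10, 26}, at::kFloat);
    // point_cnt_of_grid: (M, num_total_grids) int32
    at::Tensor point_cnt_of_grid = at::ones({10, 3}, at::kInt);
    // grouped_idxs: (num_max_sum_points, 3) int32 triplets of
    // (support point idx, new point idx, grid idx), as read by the kernel's Compute().
    at::Tensor grouped_idxs = at::zeros({24, 3}, at::kInt);
    // Dispatches aclnnVecPoolGrad and returns an (n, num_c_in) float32 gradient tensor.
    at::Tensor grad_support_features =
        vec_pool_backward(grad_new_features, point_cnt_of_grid, grouped_idxs, n, num_c_in);
    return 0;
}
```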