diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..379eefb8e9e8cbef43f5441cb6337a1d9f353f02 --- /dev/null +++ b/.clang-format @@ -0,0 +1,15 @@ +BasedOnStyle: Google +ColumnLimit: 120 +IndentWidth: 4 +SortIncludes: false + +# poniter aligment +DerivePointerAlignment: false +PointerAlignment: Left + +# single line layout +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false diff --git a/.gitignore b/.gitignore index 658649e038829e59312962f906c0013e1bf654b2..3d91b143a2f6b6a59385579c9dc1c14eed6ea0f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ __pycache__/ -.DS_Store \ No newline at end of file +.DS_Store diff --git a/ads/common/ops/bev_pool.py b/ads/common/ops/bev_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1cd2924ce14116fec3e02e275ac486da389a55 --- /dev/null +++ b/ads/common/ops/bev_pool.py @@ -0,0 +1,60 @@ +import ads_c +import torch + + +class BEVPool(torch.autograd.Function): + @staticmethod + def forward(ctx, feats, geom_feats, ranks, B, D, H, W): + kept = torch.ones(feats.shape[0], device=feats.device, dtype=torch.bool) + kept[1:] = ranks[1:] != ranks[:-1] + interval_starts = torch.where(kept)[0].int() + interval_lengths = torch.zeros_like(interval_starts, dtype=torch.int64) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = feats.shape[0] - interval_starts[-1] + geom_feats = geom_feats.int() + + out = ads_c.npu_bev_pool( + feats, + geom_feats, + interval_lengths, + interval_starts, + B, + D, + H, + W, + ) + + ctx.save_for_backward(interval_starts, interval_lengths, geom_feats) + ctx.saved_shapes = B, D, H, W + return out + + @staticmethod + def backward(ctx, grad_out): + interval_starts, interval_lengths, geom_feats = ctx.saved_tensors + B, D, H, W = ctx.saved_shapes + + grad_out = grad_out.contiguous() + grad_feats = ads_c.npu_bev_pool_backward( + grad_out, + geom_feats, + interval_lengths, + interval_starts, + B, + D, + H, + W, + ) + + return grad_feats, None, None, None, None, None, None + + +def bev_pool(feats, geom_feats, B, D, H, W): + assert feats.shape[0] == geom_feats.shape[0] + + ranks = geom_feats[:, 0] * (W * D * B) + geom_feats[:, 1] * (D * B) + geom_feats[:, 2] * B + geom_feats[:, 3] + indices = ranks.argsort() + feats, geom_feats, ranks = feats[indices], geom_feats[indices], ranks[indices] + + out = BEVPool.apply(feats, geom_feats, ranks, B, D, H, W) + out = out.permute(0, 4, 1, 2, 3).contiguous() + return out diff --git a/ads/common/ops/csrc/BEVPoolBackwardKernelNpu.cpp b/ads/common/ops/csrc/BEVPoolBackwardKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..78537c4019164d291c4ae7c05d65dc4b1d7e5b36 --- /dev/null +++ b/ads/common/ops/csrc/BEVPoolBackwardKernelNpu.cpp @@ -0,0 +1,55 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/OpCommand.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" +#include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/aten/CustomFunctions.h" +#include "functions.h" +#include "common.h" +#include "OpApiCommon.h" + +namespace { +constexpr int64_t N_IDX = 0; +constexpr int64_t C_IDX = 4; +constexpr int64_t N_INTERVAL_IDX = 0; +} // namespace +/* + * Function: pilar pooing, bev_pool_backward + * Args: + * grad_out: input grad, 5D tensor(b, d, h, w, c) + * geom_feats: input coords, 2D tensor(n, 4) + * interval_lengths: the number of points in each interval, 1D tensor(n_interval) + * interval_starts: starting position for pooled point, 1D tensor(n_interval) + * Return: + * grad_feats: output grad, 5D tensor(b, d, h, w, c) + */ +at::Tensor npu_bev_pool_backward(const at::Tensor grad_out, const at::Tensor geom_feats, + const at::Tensor interval_lengths, const at::Tensor interval_starts, int64_t b, + int64_t d, int64_t h, int64_t w) { + TORCH_CHECK(grad_out.dim() == 5, "grad_out must be 5D tensor(b, d, h, w, c)"); + TORCH_CHECK(geom_feats.dim() == 2, "coords must be 2D tensor(n, 4)"); + auto n = geom_feats.size(N_IDX); + auto c = grad_out.size(C_IDX); + auto n_interval = interval_lengths.size(N_INTERVAL_IDX); + TORCH_CHECK(interval_starts.size(N_INTERVAL_IDX) == n_interval, + "interval_starts and interval_lengths must have same size"); + + auto grad_feats = at::zeros({n, c}, grad_out.options()); + EXEC_NPU_CMD(aclnnBEVPoolGrad, grad_out, geom_feats, interval_lengths, interval_starts, b, d, h, w, n, c, + grad_feats); +} diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h index 1384a1538dd30108b58583a791c79ad0bf2690b1..e06051a56822471aca5d89ed631123435f068a15 100644 --- a/ads/common/ops/csrc/functions.h +++ b/ads/common/ops/csrc/functions.h @@ -14,124 +14,70 @@ #ifndef __FUNCTIONS_H__ #define __FUNCTIONS_H__ -#include #include +#include #include +#include #include #include -#include -void init_common(pybind11::module &m); +void init_common(pybind11::module& m); -std::tuple npu_scatter_max(const at::Tensor& updates, const at::Tensor& indices, c10::optional out); +std::tuple npu_scatter_max(const at::Tensor& updates, const at::Tensor& indices, + c10::optional out); at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments); -at::Tensor npu_rotated_box_decode(const at::Tensor &self, const at::Tensor &deltas, const at::Tensor &weight); -at::Tensor npu_rotated_box_encode( - const at::Tensor& self, - const at::Tensor& gtBox, - const at::Tensor& weight); -at::Tensor npu_rotated_iou( - const at::Tensor& boxes, - const at::Tensor& query_boxes, - bool trans, - int64_t mode, - bool is_cross, - double v_threshold, - double e_threshold); -at::Tensor npu_rotated_overlaps( - const at::Tensor& self, - const at::Tensor& query_boxes, - bool trans); +at::Tensor npu_rotated_box_decode(const at::Tensor& self, const at::Tensor& deltas, const at::Tensor& weight); +at::Tensor npu_rotated_box_encode(const at::Tensor& self, const at::Tensor& gtBox, const at::Tensor& weight); +at::Tensor npu_rotated_iou(const at::Tensor& boxes, const at::Tensor& query_boxes, bool trans, int64_t mode, + bool is_cross, double v_threshold, double e_threshold); +at::Tensor npu_rotated_overlaps(const at::Tensor& self, const at::Tensor& query_boxes, bool trans); at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); at::Tensor npu_sign_bits_pack(const at::Tensor& self, int64_t size); at::Tensor npu_sign_bits_unpack(py::args args); -at::Tensor npu_softmax_cross_entropy_with_logits(const at::Tensor &self, const at::Tensor &lables); +at::Tensor npu_softmax_cross_entropy_with_logits(const at::Tensor& self, const at::Tensor& lables); at::Tensor npu_stride_add(py::args args); -at::Tensor npu_transpose(const at::Tensor &self, at::IntArrayRef perm, bool require_contiguous); -at::Tensor npu_yolo_boxes_encode( - const at::Tensor& anchor_boxes, - const at::Tensor& gt_bboxes, - const at::Tensor& stride, - bool performance_mode); +at::Tensor npu_transpose(const at::Tensor& self, at::IntArrayRef perm, bool require_contiguous); +at::Tensor npu_yolo_boxes_encode(const at::Tensor& anchor_boxes, const at::Tensor& gt_bboxes, const at::Tensor& stride, + bool performance_mode); at::Tensor npu_scatter(const at::Tensor& self, const at::Tensor& indices, const at::Tensor& updates, int64_t dim); -at::Tensor npu_rotary_mul(const at::Tensor &self, const at::Tensor &r1, const at::Tensor &r2); +at::Tensor npu_rotary_mul(const at::Tensor& self, const at::Tensor& r1, const at::Tensor& r2); at::Tensor npu_silu(const at::Tensor& self); at::Tensor& npu_silu_(at::Tensor& self); at::Tensor npu_abs(const at::Tensor& self); at::Tensor npu_fast_gelu_backward(const at::Tensor& grad, const at::Tensor& self); at::Tensor npu_fast_gelu(const at::Tensor& self); -at::Tensor npu_anchor_response_flags(const at::Tensor& self, at::IntArrayRef featmap_size, at::IntArrayRef stride, int64_t num_base_anchors); -at::Tensor npu_bounding_box_decode( - const at::Tensor& rois, - const at::Tensor& deltas, - double means0, - double means1, - double means2, - double means3, - double stds0, - double stds1, - double stds2, - double stds3, - at::IntArrayRef max_shape, - double wh_ratio_clip); -at::Tensor npu_bounding_box_encode( - const at::Tensor& anchor_box, - const at::Tensor& ground_truth_box, - double means0, - double means1, - double means2, - double means3, - double stds0, - double stds1, - double stds2, - double stds3); +at::Tensor npu_anchor_response_flags(const at::Tensor& self, at::IntArrayRef featmap_size, at::IntArrayRef stride, + int64_t num_base_anchors); +at::Tensor npu_bounding_box_decode(const at::Tensor& rois, const at::Tensor& deltas, double means0, double means1, + double means2, double means3, double stds0, double stds1, double stds2, double stds3, + at::IntArrayRef max_shape, double wh_ratio_clip); +at::Tensor npu_bounding_box_encode(const at::Tensor& anchor_box, const at::Tensor& ground_truth_box, double means0, + double means1, double means2, double means3, double stds0, double stds1, + double stds2, double stds3); std::tuple npu_batch_nms( - const at::Tensor& self, - const at::Tensor& scores, - double score_threshold, - double iou_threshold, - int64_t max_size_per_class, - int64_t max_total_size, - bool change_coordinate_frame, - bool transpose_box); -at::Tensor npu_confusion_transpose( - const at::Tensor& self, - at::IntArrayRef perm, - at::IntArrayRef shape, - bool transpose_first); -at::Tensor npu_confusion_transpose_backward( - const at::Tensor& grad, - at::IntArrayRef perm, - at::IntArrayRef shape, - bool transpose_first); -at::Tensor npu_conv_transpose2d( - const at::Tensor& input, - const at::Tensor& weight, - const c10::optional& bias_opt, - at::IntArrayRef padding, - at::IntArrayRef output_padding, - at::IntArrayRef stride, - at::IntArrayRef dilation, - int64_t groups); + const at::Tensor& self, const at::Tensor& scores, double score_threshold, double iou_threshold, + int64_t max_size_per_class, int64_t max_total_size, bool change_coordinate_frame, bool transpose_box); +at::Tensor npu_confusion_transpose(const at::Tensor& self, at::IntArrayRef perm, at::IntArrayRef shape, + bool transpose_first); +at::Tensor npu_confusion_transpose_backward(const at::Tensor& grad, at::IntArrayRef perm, at::IntArrayRef shape, + bool transpose_first); +at::Tensor npu_conv_transpose2d(const at::Tensor& input, const at::Tensor& weight, + const c10::optional& bias_opt, at::IntArrayRef padding, + at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, + int64_t groups); at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size); at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result); -at::Tensor npu_moe_tutel( - const at::Tensor &self, - const at::Tensor &gates, - const at::Tensor &indices, - const at::Tensor &locations, - int64_t capacity); -at::Tensor npu_moe_tutel_data_backward( - const at::Tensor &y_grad, - const at::Tensor &gates, - const at::Tensor &indices, - const at::Tensor &locations); -at::Tensor npu_moe_tutel_gate_backward( - const at::Tensor &self, - const at::Tensor &y_grad, - const at::Tensor &indices, - const at::Tensor &locations); +at::Tensor npu_moe_tutel(const at::Tensor& self, const at::Tensor& gates, const at::Tensor& indices, + const at::Tensor& locations, int64_t capacity); +at::Tensor npu_moe_tutel_data_backward(const at::Tensor& y_grad, const at::Tensor& gates, const at::Tensor& indices, + const at::Tensor& locations); +at::Tensor npu_moe_tutel_gate_backward(const at::Tensor& self, const at::Tensor& y_grad, const at::Tensor& indices, + const at::Tensor& locations); -at::Tensor npu_ads_add(const at::Tensor &tensor1, const at::Tensor &tensor2); -#endif // __FUNCTIONS_H__ +at::Tensor npu_ads_add(const at::Tensor& tensor1, const at::Tensor& tensor2); +at::Tensor npu_bev_pool(const at::Tensor feats, const at::Tensor geom_feats, const at::Tensor interval_lengths, + const at::Tensor interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); +at::Tensor npu_bev_pool_backward(const at::Tensor grad_out, const at::Tensor geom_feats, const at::Tensor interval_lengths, + const at::Tensor interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); +#endif // __FUNCTIONS_H__ diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp index 5638ac6b09c986dba4e68e9fe99862475dbaa323..630f1a958cedd882c025cb570e19ca3cd97e752a 100644 --- a/ads/common/ops/csrc/pybind.cpp +++ b/ads/common/ops/csrc/pybind.cpp @@ -73,4 +73,8 @@ void init_common(pybind11::module &m) m.def("npu_moe_tutel_gate_backward", &npu_moe_tutel_gate_backward, "npu_moe_tutel_gate_backward NPU version"); // ads_add m.def("npu_ads_add", &npu_ads_add); + + // bev_pool + m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); + m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); } diff --git a/ads/common/ops/kernels/ads_op/op_host/CMakeLists.txt b/ads/common/ops/kernels/ads_op/op_host/CMakeLists.txt index 40dd51cfac524b0a9607b7d8b2813edd2210c509..b467eef89d52edbb521326ba3ad01afe5afacb47 100644 --- a/ads/common/ops/kernels/ads_op/op_host/CMakeLists.txt +++ b/ads/common/ops/kernels/ads_op/op_host/CMakeLists.txt @@ -1,4 +1,3 @@ - aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) opbuild(OPS_SRC ${ops_srcs} @@ -28,6 +27,8 @@ target_link_libraries(cust_op_proto PRIVATE set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0 ) + +# tiling add_library(cust_optiling SHARED ${ops_srcs}) target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) target_compile_options(cust_optiling PRIVATE diff --git a/ads/common/ops/kernels/ads_op/op_host/add_custom.cpp b/ads/common/ops/kernels/ads_op/op_host/add_custom.cpp index 3a578d4ed44e8102b5285948093a22d88d88fa4e..9924452155aa14feaf3d5bcdd411eddae31365e4 100644 --- a/ads/common/ops/kernels/ads_op/op_host/add_custom.cpp +++ b/ads/common/ops/kernels/ads_op/op_host/add_custom.cpp @@ -1,4 +1,3 @@ - #include "add_custom_tiling.h" #include "register/op_def_registry.h" diff --git a/ads/common/ops/kernels/ads_op/op_host/bev_pool.cpp b/ads/common/ops/kernels/ads_op/op_host/bev_pool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d872093af06d708738b19b210f7d5c1d1eba9681 --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_host/bev_pool.cpp @@ -0,0 +1,133 @@ +#include +#include +#include +#include "bev_pool_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace { +constexpr size_t INTERVAL_START_IDX = 2; +constexpr size_t B_IDX = 0; +constexpr size_t D_IDX = 1; +constexpr size_t H_IDX = 2; +constexpr size_t W_IDX = 3; +constexpr size_t N_IDX = 4; +constexpr size_t C_IDX = 5; +} // namespace +namespace optiling { +static ge::graphStatus TilingForBEVPoolGrad(gert::TilingContext* context) { + BEVPoolTilingData tiling; + auto platform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + auto coreNum = platform.GetCoreNum(); + + auto intervalShape = context->GetInputShape(INTERVAL_START_IDX); + int32_t nInterval = intervalShape->GetStorageShape().GetDim(0); + + int32_t usedCoreNum = std::min(int32_t(coreNum), nInterval); + tiling.set_usedCoreNum(usedCoreNum); + auto avgTaskNum = nInterval / usedCoreNum; + auto tailTaskNum = nInterval % usedCoreNum; + tiling.set_avgTaskNum(avgTaskNum); + tiling.set_tailTaskNum(tailTaskNum); + + auto attrs = context->GetAttrs(); + // TODO: check if attrs is null + auto getAttr = [attrs](size_t idx) -> int32_t { + auto ptr = attrs->GetInt(idx); + // TODO: check if ptr is null + return int32_t(*ptr); + }; + auto b = getAttr(B_IDX); + auto d = getAttr(D_IDX); + auto h = getAttr(H_IDX); + auto w = getAttr(W_IDX); + auto n = getAttr(N_IDX); + auto c = getAttr(C_IDX); + tiling.set_c(c); + tiling.set_stride0(w * c); + tiling.set_stride1(c); + tiling.set_stride2(h * w * c); + tiling.set_stride3(d * h * w * c); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static ge::graphStatus InferShapeForBEVPoolGrad(gert::InferShapeContext* context) { + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { + +/** + * @brief: BEVPoolGrad, the backward of bev_pool + * @par Inputs: + * grad_out: input grad, 5D tensor(b, d, h, w, c), dtype: float32, format: NDHWC, ND + * geom_feats: input coords, 2D tensor(n, 4), dtype: int32, format: ND + * interval_starts: starting position for pooled point, 1D tensor(n_interval), dtype: int32, format: ND + * interval_lengths: the number of points in each interval, 1D tensor(n_interval), dtype: int32, format: ND + * @par Outputs: + * grad_feats: output grad, 2D tensor(n, c), dtype: float32, format: ND + * @par Attributes: + * b: batch size, type: int + * d: depth, type: int + * w: width, type: int + * h: height, type: int + * n: number of points, type: int + * c: channels, type: int + **/ +class BEVPoolGrad : public OpDef { + public: + explicit BEVPoolGrad(const char* name) : OpDef(name) { + this->Input("grad_out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_NDHWC, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("geom_feats") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("interval_starts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("interval_lengths") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + + this->Output("grad_feats") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + + this->Attr("b").AttrType(REQUIRED).Int(); + this->Attr("d").AttrType(REQUIRED).Int(); + this->Attr("w").AttrType(REQUIRED).Int(); + this->Attr("h").AttrType(REQUIRED).Int(); + this->Attr("n").AttrType(REQUIRED).Int(); + this->Attr("c").AttrType(REQUIRED).Int(); + + this->SetInferShape(ge::InferShapeForBEVPoolGrad); + + this->AICore().SetTiling(optiling::TilingForBEVPoolGrad); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + this->AICore().AddConfig("ascend310p"); + } +}; + +OP_ADD(BEVPoolGrad); +} // namespace ops diff --git a/ads/common/ops/kernels/ads_op/op_host/bev_pool_tiling.h b/ads/common/ops/kernels/ads_op/op_host/bev_pool_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..3ba28f6888f7ca486a7f28638ef6d82ea86c6722 --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_host/bev_pool_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef BEVPool_TILING_H +#define BEVPool_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(BEVPoolTilingData) +TILING_DATA_FIELD_DEF(int32_t, usedCoreNum) +TILING_DATA_FIELD_DEF(int32_t, avgTaskNum) +TILING_DATA_FIELD_DEF(int32_t, tailTaskNum) +TILING_DATA_FIELD_DEF(int32_t, c) +TILING_DATA_FIELD_DEF(int32_t, stride0) +TILING_DATA_FIELD_DEF(int32_t, stride1) +TILING_DATA_FIELD_DEF(int32_t, stride2) +TILING_DATA_FIELD_DEF(int32_t, stride3) +END_TILING_DATA_DEF + +REGISTER_TILING_DATA_CLASS(BEVPool, BEVPoolTilingData) +REGISTER_TILING_DATA_CLASS(BEVPoolGrad, BEVPoolTilingData) + +} // namespace optiling +#endif // BEVPool_TILING_H diff --git a/ads/common/ops/kernels/ads_op/op_kernel/bev_pool_grad.cpp b/ads/common/ops/kernels/ads_op/op_kernel/bev_pool_grad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..113aa8d76da9f68b0fbfdac7e236944d735d66a3 --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_kernel/bev_pool_grad.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + */ +using namespace AscendC; +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +tmplate class TConjungateQue : public TQueBind {}; + +class KernelBEVPoolGrad { + public: + __aicore__ inline KernelBEVPoolGrad() = delete; + + __aicore__ inline KernelBEVPoolGrad(int32_t blkIdx, GM_ADDR gradOut, GM_ADDR geomFeats, GM_ADDR intervalStarts, + GM_ADDR intervalLengths, GM_ADDR gradFeats, + const BEVPoolTilingData& bevPoolTiling) { + blkIdx_ = blkIdx; + + stride0_ = bevPoolTiling.get_stride0(); + stride1_ = bevPoolTiling.get_stride1(); + stride2_ = bevPoolTiling.get_stride2(); + stride3_ = bevPoolTiling.get_stride3(); + c_ = bevPoolTiling.get_c(); + int32_t usedCoreNum = bevPoolTiling.get_usedCoreNum(); + int32_t avgTaskNum = bevPoolTiling.get_avgTaskNum(); + int32_t tailTaskNum = bevPoolTiling.get_tailTaskNum(); + it_ = TaskIterator(blkIdx_, usedCoreNum, avgTaskNum, tailTaskNum); + + cpOneParams_.blockLen = 1; + cpGradParams_.blockLen = DivCeil(c_, B32_BYTE_SIZE); + + sGm_.setGlobalBuffer((__gm__ float*)intervalStarts); + lGm_.setGlobalBuffer((__gm__ float*)intervalLengths); + oGm_.setGlobalBuffer((__gm__ float*)gradOut); + gGm_.setGlobalBuffer((__gm__ float*)gradFeats); + fGm_.setGlobalBuffer((__gm__ float*)geomFeats); + + pipe.InitBuffer(que_, BUFFER_NUM, AlignUp(c_, B32_BYTE_SIZE) * sizeof(float)); + pipe.InitBuffer(sQue, BUFFER_NUM, ONE_BLK_SIZE); + pipe.InitBuffer(lQue, BUFFER_NUM, ONE_BLK_SIZE); + pipe.InitBuffer(gQue, BUFFER_NUM, ONE_BLK_SIZE); + + evt1_ = pipe_.AllocEventID(); + evt2_ = pipe_.AllocEventID(); + } + + __aicore__ inline void Process() { + while (it_.hasNext()) { + int32_t idx = it_.next(); + int32_t start, length, offset; + CopyInInterval(idx); + GetInterval(idx, start, length); + CopyInGeom(start); + GetOffset(geomT, offset); + CopyInGrad(offset); + for (int32_t i = 0; i < l; ++l) { + DataCopy(gradOutGm_[i * stride0_ + offset], que_[i - start], c_); + } + } + } + + private: + class TaskIterator { + public: + __aicore__ inline TaskIterator(int32_t blkIdx, int32_t blkDim, int32_t avgTaskNum, int32_t tailTaskNum) { + blkIdx_ = blkIdx; + nextIdx_ = blkIdx * avgTaskNum + (blkIdx < tailTaskNum ? blkIdx : tailTaskNum); + endIdx_ = nextIdx_ + avgTaskNum + (blkIdx < tailTaskNum ? 1 : 0); + } + + __aicore__ inline bool HasNext() const { + return nextIdx_ < endIdx_; + } + + __aicore__ inline int32_t Next() { + return nextIdx_++; + } + + private: + int32_t blkIdx_, blkDim_; + int32_t nextIdx_, endIdx_; + }; + + private: + TPipe pipe_; + TConjungateQue que_; + TQue sQue_, lQue_, gQue_; + + GlobalTensor oGm_; + GlobalTensor fGm_; + GlobalTensor sGm_, lGm_, gGm_; + + int32_t stride0_, stride1_, stride2_, stride3_; + int32_t c_; + TaskIterator it_; + DataCopyParams cpOneParams_, cpGradParams_; + + TEventID evt1_, evt2_; + + private: + __aicore__ inline void CopyInInterval(int32_t idx) { + auto startT = sQue_.AllocTensor(); + auto lengthT = lQue_.AllocTensor(); + auto geomT = gQue_.AllocTensor(); + + DataCopy(startT, sGm_[idx], cpOneParams_); + sQue_.EnQue(startT); + DataCopy(lengthT, lGm_[idx], cpOneParams_); + lQue_.EnQue(lengthT); + DataCopy(geomT, gGm_[idx], cpOneParams_); + gQue_.EnQue(geomT); + } + + __aicore__ inline void GetInterval(int32_t idx, int32_t& start, int32_t& length) { + LocalTensor startT = sQue_.DeQue(); + LocalTensor lengthT = lQue_.DeQue(); + start = startT[0]; + length = lengthT[0]; + + SetFlag(evt1_); + } + + __aicore__ inline void CopyInGeom(int32_t start) { + WaitFlag(evt1_); + + auto geomT = gQue_.AllocTensor(); + DataCopy(geomT, gGm_[start], cpOneParams_); + gQue_.EnQue(geomT); + } + __aicore__ inline void GetOffset(int32_t idx, int32_t& offset) { + LocalTensor geomT = gQue_.DeQue(); + offset = geomT[0] * stride0_ + geomT[1] * stride1_ + geomT[2] * stride2_ + geomT[3] * stride3_; + + SetFlag(evt2_); + } + + __aicore__ inline void CopyInGrad(int32_t offset) { + WaitFlag(evt2_); + + auto gradOutT = que_.AllocTensor(); + DataCopy(gradOutT, oGm_[offset], cpGradParams_); + que_.EnQue(gradOutT); + } +}; + +extern "C" __global__ __aicore__ void bev_pool_grad(GM_ADDR gradOut, GM_ADDR geomFeats, GM_ADDR intervalStarts, + GM_ADDR intervalLengths, GM_ADDR gradFeats, GM_ADDR workspace, + GM_ADDR tiling) { + GET_TILING_DATA(bevPoolTiling, tiling); + if (!workspace) { + return; + } + SetSysWorkspace(workspace); + + int32_t blkIdx = GetBlockIdx(); + if (blkIdx >= bevPoolTiling.get_usedCoreNum()) { + return; + } + + KernelBEVPoolGrad op(blkIdx, gradOut, geomFeats, intervalStarts, intervalLengths, gradFeats, bevPoolTiling); + op.Process(); +} diff --git a/ads/include/op_log.h b/ads/include/op_log.h new file mode 100644 index 0000000000000000000000000000000000000000..ba0e9ff8fc09b0fd6b73fcde6f868b3d6c2ed7f6 --- /dev/null +++ b/ads/include/op_log.h @@ -0,0 +1,304 @@ +#ifndef OPS_COMMON_INC_OP_LOG_H_ +#define OPS_COMMON_INC_OP_LOG_H_ + +#include +#include +#include "graph/operator.h" +#include "graph/node.h" +#include "common/util/error_manager/error_manager.h" + +#if !defined( __ANDROID__) && !defined(ANDROID) +#include "toolchain/slog.h" +#else +#include +#endif + +#ifdef __GNUC__ +#include +#include +#else +#include "mmpa/mmpa_api.h" +#endif + +#define OPPROTO_SUBMOD_NAME "OP_PROTO" + +class OpLog { + public: + static uint64_t GetTid() { +#ifdef __GNUC__ + const uint64_t tid = static_cast(syscall(__NR_gettid)); +#else + const uint64_t tid = static_cast(GetCurrentThreadId()); +#endif + return tid; + } +}; + +inline const char* get_cstr(const std::string& str) { + return str.c_str(); +} + +inline const char* get_cstr(const char* str) { + return str; +} + +inline const std::string& get_op_info(const std::string& str) { + return str; +} + +inline const char* get_op_info(const char* str) { + return str; +} + +inline std::string get_op_info(const ge::NodePtr& node) { + return node != nullptr ? node->GetType() + ":" + node->GetName() : "nil"; +} + +inline std::string get_op_info(const ge::OpDescPtr& node) { + return node != nullptr ? node->GetType() + ":" + node->GetName() : "nil"; +} + +template +constexpr bool is_ge_operator_type() { + return std::is_base_of::type>::value; +} + +template +typename std::enable_if(), std::string>::type get_op_info(const T& op) { + ge::AscendString name; + ge::AscendString type; + auto get_name_ret = op.GetName(name); + auto get_type_ret = op.GetOpType(type); + std::string op_info = get_type_ret == ge::GRAPH_SUCCESS ? type.GetString() : "nil"; + op_info += ":"; + op_info += get_name_ret == ge::GRAPH_SUCCESS ? name.GetString() : "nil"; + return op_info; +} + +template +constexpr bool is_context_type() { + return !std::is_base_of::type>::value && + !std::is_same::type>::value && + !std::is_same::type>::value && + !std::is_same::type>::value && + !std::is_same::type>::value && + !std::is_same::type>::value; +} + +template +typename std::enable_if(), std::string>::type get_op_info(T context) { + if (context == nullptr) { + return "nil:nil"; + } + std::string op_info = context->GetNodeType() != nullptr ? context->GetNodeType() : "nil"; + op_info += ":"; + op_info += context->GetNodeName() != nullptr ? context->GetNodeName() : "nil"; + return op_info; +} + +template +std::string TbeGetName(const T& op) { + ge::AscendString op_ascend_name; + ge::graphStatus ret = op.GetName(op_ascend_name); + if (ret != ge::GRAPH_SUCCESS) { + std::string op_name = "None"; + return op_name; + } + return op_ascend_name.GetString(); +} + +template +std::string TbeGetOpType(const T& op) { + ge::AscendString op_ascend_name; + ge::graphStatus ret = op.GetOpType(op_ascend_name); + if (ret != ge::GRAPH_SUCCESS) { + std::string op_name = "None"; + return op_name; + } + return op_ascend_name.GetString(); +} + +#define CHECK_DIVISOR_ZERO(divisor) \ + if (divisor == 0) {\ + return;\ + } + +#define CHECK_DIVISOR_ZERO_RET(divisor, ret) \ + if (divisor == 0) { \ + return ret; \ + } + +#define OP_CHECK(cond, log_func, return_expr) \ + if (cond) { \ + log_func; \ + return_expr; \ + } + +#if !defined( __ANDROID__) && !defined(ANDROID) +#define AICPU_OP_LOGI(opname, ...) AICPU_D_OP_LOGI(get_op_info(opname), __VA_ARGS__) +#define AICPU_OP_LOGW(opname, ...) AICPU_D_OP_LOGW(get_op_info(opname), __VA_ARGS__) +#define AICPU_OP_LOGD(opname, ...) AICPU_D_OP_LOGD(get_op_info(opname), __VA_ARGS__) +#define AICPU_OP_LOGE_WITHOUT_REPORT(opname, ...) AICPU_D_OP_LOGE(get_op_info(opname), __VA_ARGS__) +#define AICPU_OP_LOGE(op_name, ...) \ + do { \ + AICPU_OP_LOGE_WITHOUT_REPORT(op_name, ##__VA_ARGS__); \ + REPORT_INNER_ERROR("EZ9999", ##__VA_ARGS__); \ + } while (0) + +#define OP_LOGI(opname, ...) D_OP_LOGI(get_op_info(opname), __VA_ARGS__) +#define OP_LOGW(opname, ...) D_OP_LOGW(get_op_info(opname), __VA_ARGS__) + +#define OP_LOGE_WITHOUT_REPORT(opname, ...) D_OP_LOGE(get_op_info(opname), __VA_ARGS__) +#define OP_LOGE(op_name, ...) \ + do { \ + OP_LOGE_WITHOUT_REPORT(op_name, ##__VA_ARGS__); \ + REPORT_INNER_ERROR("EZ9999", ##__VA_ARGS__); \ + } while (0) + +#define OP_LOGD(opname, ...) D_OP_LOGD(get_op_info(opname), __VA_ARGS__) +#define OP_EVENT(opname, ...) D_OP_EVENT(get_op_info(opname), __VA_ARGS__) +#define GE_OP_LOGI(opname, ...) GE_D_OP_LOGI(get_op_info(opname), __VA_ARGS__) +#define GE_OP_LOGW(opname, ...) GE_D_OP_LOGW(get_op_info(opname), __VA_ARGS__) +#define GE_OP_LOGE(opname, ...) GE_D_OP_LOGE(get_op_info(opname), __VA_ARGS__) +#define GE_OP_LOGD(opname, ...) GE_D_OP_LOGD(get_op_info(opname), __VA_ARGS__) +#define GE_OP_LOGE_WITH_REPORT(opname, format, ...) \ + do { \ + GE_OP_LOGE(opname, format, ##__VA_ARGS__); \ + REPORT_INNER_ERROR("E19999", "[OpName:%s]" format, get_cstr(get_op_info(opname)), ##__VA_ARGS__); \ + } while (0) + +#define FUSION_PASS_LOGI(...) D_FUSION_PASS_LOGI(__VA_ARGS__) +#define FUSION_PASS_LOGW(...) D_FUSION_PASS_LOGW(__VA_ARGS__) +#define FUSION_PASS_LOGE(...) D_FUSION_PASS_LOGE(__VA_ARGS__) +#define FUSION_PASS_LOGD(...) D_FUSION_PASS_LOGD(__VA_ARGS__) +#else +#define AICPU_OP_LOGI(opname, ...) +#define AICPU_OP_LOGW(opname, ...) +#define AICPU_OP_LOGE(opname, ...) +#define AICPU_OP_LOGD(opname, ...) +#define AICPU_OP_LOGE_WITHOUT_REPORT(opname, ...) +#define OP_LOGI(opname, ...) +#define OP_LOGW(opname, ...) +#define OP_LOGE_WITHOUT_REPORT(opname, ...) +#define OP_LOGE(opname, ...) +#define OP_LOGD(opname, ...) +#define OP_EVENT(opname, ...) +#define FUSION_PASS_LOGI(...) +#define FUSION_PASS_LOGW(...) +#define FUSION_PASS_LOGE(...) +#define FUSION_PASS_LOGD(...) +#endif + +#define OpLogSub(moduleId, level, op_info, fmt, ...) \ + DlogSub(static_cast(moduleId), OPPROTO_SUBMOD_NAME, level, "[%s][%" PRIu64 "] OpName:[%s] " #fmt, __FUNCTION__, \ + OpLog::GetTid(), get_cstr(op_info), ##__VA_ARGS__) + +#if !defined( __ANDROID__) && !defined(ANDROID) +#define AICPU_D_OP_LOGI(opname, fmt, ...) OpLogSub(AICPU, DLOG_INFO, opname, fmt, ##__VA_ARGS__) +#define AICPU_D_OP_LOGW(opname, fmt, ...) OpLogSub(AICPU, DLOG_WARN, opname, fmt, ##__VA_ARGS__) +#define AICPU_D_OP_LOGE(opname, fmt, ...) OpLogSub(AICPU, DLOG_ERROR, opname, fmt, ##__VA_ARGS__) +#define AICPU_D_OP_LOGD(opname, fmt, ...) OpLogSub(AICPU, DLOG_DEBUG, opname, fmt, ##__VA_ARGS__) +#define D_OP_LOGI(opname, fmt, ...) OpLogSub(OP, DLOG_INFO, opname, fmt, ##__VA_ARGS__) +#define D_OP_LOGW(opname, fmt, ...) OpLogSub(OP, DLOG_WARN, opname, fmt, ##__VA_ARGS__) +#define D_OP_LOGE(opname, fmt, ...) OpLogSub(OP, DLOG_ERROR, opname, fmt, ##__VA_ARGS__) +#define D_OP_LOGD(opname, fmt, ...) OpLogSub(OP, DLOG_DEBUG, opname, fmt, ##__VA_ARGS__) +#define D_OP_EVENT(opname, fmt, ...) OpLogSub(OP, DLOG_EVENT, opname, fmt, ##__VA_ARGS__) +#define GE_D_OP_LOGI(opname, fmt, ...) OpLogSub(GE, DLOG_INFO, opname, fmt, ##__VA_ARGS__) +#define GE_D_OP_LOGW(opname, fmt, ...) OpLogSub(GE, DLOG_WARN, opname, fmt, ##__VA_ARGS__) +#define GE_D_OP_LOGE(opname, fmt, ...) OpLogSub(GE, DLOG_ERROR, opname, fmt, ##__VA_ARGS__) +#define GE_D_OP_LOGD(opname, fmt, ...) OpLogSub(GE, DLOG_DEBUG, opname, fmt, ##__VA_ARGS__) +#define D_FUSION_PASS_LOGI(fmt, ...) \ + DlogSub(FE, OPPROTO_SUBMOD_NAME, DLOG_INFO, " %s:%d " #fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define D_FUSION_PASS_LOGW(fmt, ...) \ + DlogSub(FE, OPPROTO_SUBMOD_NAME, DLOG_WARN, " %s:%d " #fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define D_FUSION_PASS_LOGE(fmt, ...) \ + DlogSub(FE, OPPROTO_SUBMOD_NAME, DLOG_ERROR, " %s:%d " #fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#define D_FUSION_PASS_LOGD(fmt, ...) \ + DlogSub(FE, OPPROTO_SUBMOD_NAME, DLOG_DEBUG, " %s:%d " #fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#else +#define AICPU_D_OP_LOGI(opname, fmt, ...) +#define AICPU_D_OP_LOGW(opname, fmt, ...) +#define AICPU_D_OP_LOGE(opname, fmt, ...) +#define AICPU_D_OP_LOGD(opname, fmt, ...) +#define D_OP_LOGI(opname, fmt, ...) +#define D_OP_LOGW(opname, fmt, ...) +#define D_OP_LOGE(opname, fmt, ...) +#define D_OP_LOGD(opname, fmt, ...) +#define D_OP_EVENT(opname, fmt, ...) +#define D_FUSION_PASS_LOGI(fmt, ...) +#define D_FUSION_PASS_LOGW(fmt, ...) +#define D_FUSION_PASS_LOGE(fmt, ...) +#define D_FUSION_PASS_LOGD(fmt, ...) +#endif + +#define unlikely(x) __builtin_expect((x), 0) +#define likely(x) __builtin_expect((x), 1) + +#define OP_LOGE_IF(condition, return_value, op_name, fmt, ...) \ + static_assert(std::is_same::type>::value, "condition should be bool"); \ + do { \ + if (unlikely(condition)) { \ + OP_LOGE(op_name, fmt, ##__VA_ARGS__); \ + return return_value; \ + } \ + } while (0) + +#define OP_LOGW_IF(condition, op_name, fmt, ...) \ + static_assert(std::is_same::type>::value, "condition should be bool"); \ + do { \ + if (unlikely(condition)) { \ + OP_LOGW(op_name, fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +#define OP_LOGI_IF_RETURN(condition, return_value, op_name, fmt, ...) \ + static_assert(std::is_same::type>::value, "condition should be bool"); \ + do { \ + if (unlikely(condition)) { \ + OP_LOGI(op_name, fmt, ##__VA_ARGS__); \ + return return_value; \ + } \ + } while (0) + +constexpr const int OP_MAX_LOG_SIZE = 16000; +constexpr const int OP_MSG_HEADER_LEN = 200; +// print very long log. long line will be split to multipile lines +#define OP_LOG_FULL(level, opname, format, ...) \ +do { \ + if (0 == CheckLogLevel(OP, level)) { \ + break; \ + } \ + char msgbufxyz[OP_MAX_LOG_SIZE]; \ + size_t msgmaxlen = (MSG_LENGTH - OP_MSG_HEADER_LEN); \ + int rettmp = snprintf_s(msgbufxyz, sizeof(msgbufxyz), sizeof(msgbufxyz) - 1, format, ##__VA_ARGS__); \ + if (rettmp == -1) { \ + msgbufxyz[sizeof(msgbufxyz) - 1] = '\0'; \ + } \ + size_t msglength = std::strlen(msgbufxyz); \ + if (msglength < msgmaxlen) { \ + OpLogSub(OP, level, opname, "%s", msgbufxyz); \ + break; \ + } \ + char *msgchunkbegin = msgbufxyz; \ + char *msgchunkend = nullptr; \ + while (msgchunkbegin < msgbufxyz + msglength) { \ + if (msgchunkbegin[0] == '\n') { \ + OpLogSub(OP, level, opname, ""); \ + msgchunkbegin += 1; \ + continue; \ + } \ + msgchunkend = std::strchr(msgchunkbegin, '\n'); \ + if (msgchunkend == nullptr) { \ + msgchunkend = msgchunkbegin + std::strlen(msgchunkbegin); \ + } \ + while (msgchunkend > msgchunkbegin) { \ + std::string msgchunk(msgchunkbegin, std::min(msgmaxlen, static_cast(msgchunkend - msgchunkbegin))); \ + OpLogSub(OP, level, opname, "%s", msgchunk.c_str()); \ + msgchunkbegin += msgchunk.size(); \ + } \ + msgchunkbegin += 1; \ + } \ +} while (0) + +#define OP_LOGD_FULL(opname, ...) OP_LOG_FULL(DLOG_DEBUG, get_op_info(opname), __VA_ARGS__) +#endif // OPS_COMMON_INC_OP_LOG_H_