diff --git a/README.md b/README.md index 813f1bce23cd61d357fd962687844d7e8a92d8a0..35988550e488d2e43b7ced87511d35d07d2aaa6c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,94 @@ -## Description +### 简介 -+ This repo provides some speed-up abilites for AI-models in autonomous driving system, which can help these models running fatser on ascend device. +本项目基于昇腾NPU开发了用于自动驾驶场景的高性能算子 + +### 编译、安装ADS + +#### 发布包安装 + +暂未正式发布 + +#### 源码安装 + +**安装依赖** + +> 安装对应的版本的torch、torch_npu、cann包,具体配套关系见pytorch仓(https://gitee.com/ascend/pytorch)首页readme +> +> 并source cann包环境变量 + +##### 下载ADS + +```shell +# 下载ads仓 +git clone https://gitee.com/ascend/ads.git +``` + +##### 编译ADS + +```shell +# 编译 +bash ads/ci/build.sh --python=3.7 +``` + +| 架构 | pytorch版本 | 出包版本 | +| ------- | ------------ | -------------------------------------------------------- | +| x86 | pytorch1.11 | Python3.7(\>=3.7.5), Python3.8, Python3.9, Python3.10 | +| x86 | pytorch2.0.1 | Python3.8, Python3.9, Python3.10 | +| x86 | pytorch2.1.0 | Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch1.11 | Python3.7(\>=3.7.5), Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch2.0.1 | Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch2.1.0 | Python3.8, Python3.9, Python3.10 | + +| 参数 | 取值范围 | 说明 | 缺省值 | 备注 | +| ------ | ------------------------------------------------------------ | ------------------------------ | ------ | ---------------------------------------------- | +| python | pytorch1.11,支持3.7及以上;pytorch1.11以上版本,支持3.8及以上 | 指定编译过程中使用的python版本 | 3.7 | 仅pytorch版本为1.11时才支持指定python版本为3.7 | + +##### 安装ADS + +```shell +cd ads/dist +pip3 install ads-1.0-cp37-cp37m-linux_aarch64.whl +``` + +#### CMC取包安装 + +当前ADS包还未商发,需到https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search 搜索 FrameworkPTAdapter V100R001C01B001 取最新的包即可,注意需要根据环境的torch版本和python版本选择下载,如 ADS_v1.11.0_py37.tar.gz,其中v1.11.0表示torch版本,py37表示python版本。 + +![img](file:///C:/Users/c30030097/AppData/Roaming/eSpace_Desktop/UserData/c30030097/imagefiles/originalImgfiles/8A534D96-078D-44F1-852C-E0B8CD4F3074.png) + +后续计划发包版本 + +| 架构 | pytorch版本 | 出包版本 | +| ------- | ------------ | -------------------------------------------------------- | +| x86 | pytorch1.11 | Python3.7(\>=3.7.5), Python3.8, Python3.9, Python3.10 | +| x86 | pytorch2.0.1 | Python3.8, Python3.9, Python3.10 | +| x86 | pytorch2.1.0 | Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch1.11 | Python3.7(\>=3.7.5), Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch2.0.1 | Python3.8, Python3.9, Python3.10 | +| aarch64 | pytorch2.1.0 | Python3.8, Python3.9, Python3.10 | + +### ADS算子调用 + +##### 设置环境变量 + +注意:其中xxx表示当前环境上的python安装路径 + +```bash +export ASCEND_CUSTOM_OPP_PATH=xxx/site-packages/ads/common/ops/kernels/ads_op_kernel/packages/vendors/customize/ +export LD_LIBRARY_PATH=xxx/site-packages/ads/common/ops/kernels/ads_op_kernel/packages/vendors/customize/op_api/lib/:$LD_LIBRARY_PATH +``` + +算子调用 + +```python +import torch +import torch_npu +import numpy as np +import ads.common +device = torch.device("npu:5") +a=torch.rand([8, 2048]).half().npu() +b=torch.rand([8, 2048]).half().npu() +c = ads.common.npu_ads_add(a,b) +print(c) +``` -+ The main three parts is as follows: - + **common**, general modules, which contain some fused pytorch modules of which provide speed-up ability for models runnning on ascend device. - + **motion**, the motion prediction modules, which are used for trace-planning and motion-prediction. The modules contain some pytorch-custom-apis, the kernels of which are affinitive on ascend device. - + **perception**, the perception modules, which are used for 3D detection and segmentation. It helps recognize the components on the road. \ No newline at end of file diff --git a/ads/common/__init__.py b/ads/common/__init__.py index b88f4d130b86415b1be11840f0d18c75be1a9048..90a32a85c5470ac10249b297b78305379c57304e 100644 --- a/ads/common/__init__.py +++ b/ads/common/__init__.py @@ -10,6 +10,7 @@ from .ops.stride_add import npu_stride_add from .ops.transpose import npu_transpose from .ops.yolo_boxes_encode import npu_yolo_boxes_encode from .ops.scatter import npu_scatter +from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist from .ops.silu import npu_silu from .ops.silu import npu_silu_ from .ops.rotary_mul import npu_rotary_mul diff --git a/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp b/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fea2f9486a311d5cb4742123f1a1d11db6b33b74 --- /dev/null +++ b/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "OpApiCommon.h" +#include "functions.h" + +at::Tensor furthest_point_sampling_with_dist(const at::Tensor &points_dist, const at::Tensor &nearest_temp, const int32_t num_points) +{ + auto points_dist_size = points_dist.sizes(); + int64_t b = points_dist_size[0]; + int64_t num_points_real = num_points; + auto output_size = {b, num_points_real}; + at::Tensor result = at::empty(output_size, points_dist.options().dtype(at::kInt)); + EXEC_NPU_CMD(aclnnFurthestPointSamplingWithDist, points_dist, nearest_temp, num_points, result); + return result; +} diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h index 1384a1538dd30108b58583a791c79ad0bf2690b1..b8b930a0787578460a9a528bcf96cdff76ab9bda 100644 --- a/ads/common/ops/csrc/functions.h +++ b/ads/common/ops/csrc/functions.h @@ -114,6 +114,9 @@ at::Tensor npu_conv_transpose2d( at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups); + +at::Tensor furthest_point_sampling_with_dist(const at::Tensor &points_dist, const at::Tensor &nearest_temp, const int32_t num_points); + at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size); at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result); at::Tensor npu_moe_tutel( diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp index 5638ac6b09c986dba4e68e9fe99862475dbaa323..d9f153249d340ee6eb72ca32e763007ea64b773f 100644 --- a/ads/common/ops/csrc/pybind.cpp +++ b/ads/common/ops/csrc/pybind.cpp @@ -54,6 +54,9 @@ void init_common(pybind11::module &m) // npu_bounding_box_decode m.def("npu_bounding_box_decode", &npu_bounding_box_decode); + // furthest_points_sampling_with_dist + m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist); + // npu_bounding_box_encode m.def("npu_bounding_box_encode", &npu_bounding_box_encode); diff --git a/ads/common/ops/furthest_point_sampling_with_dist.py b/ads/common/ops/furthest_point_sampling_with_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..164ce9f6256401e95ddd887790aff7640271dcd9 --- /dev/null +++ b/ads/common/ops/furthest_point_sampling_with_dist.py @@ -0,0 +1,17 @@ +import torch +from torch.autograd import Function +from torch.nn import Module + +import torch_npu +import ads_c + + +class AdsFurthestPointSamplingWithDistFunction(Function): + @staticmethod + def forward(ctx, points_dist, num_points): + B, N = points_dist.size()[:2] + nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10) + result = ads_c.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points) + return result + +furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply diff --git a/ads/common/ops/kernels/ads_op/CMakePresets.json b/ads/common/ops/kernels/ads_op/CMakePresets.json index add05853f7195befe9689580c8aaf0e3f02ce381..a23c07b8bf823cc052ddf980a835408a9e3b918a 100644 --- a/ads/common/ops/kernels/ads_op/CMakePresets.json +++ b/ads/common/ops/kernels/ads_op/CMakePresets.json @@ -27,7 +27,7 @@ }, "ASCEND_COMPUTE_UNIT": { "type": "STRING", - "value": "ascend310p;ascend910;ascend910b" + "value": "ascend910b" }, "ENABLE_TEST": { "type": "BOOL", diff --git a/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4fa290dc5c7137099626ce0350f716c2eb66c61c --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp @@ -0,0 +1,127 @@ +#include "furthest_point_sampling_with_dist_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +static ge::graphStatus TilingForFurthestPointSamplingWithDist(gert::TilingContext* context) +{ + FurthestPointSamplingWithDistTilingData tiling; + auto platformInfo = context->GetPlatformInfo(); + auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo); + static uint32_t core_num = ascendcPlatform.GetCoreNumAiv(); + uint64_t UB_size; + ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, UB_size); + + auto dist_shape = context->GetInputShape(0)->GetStorageShape(); + auto attrs = context->GetAttrs(); + + if (core_num == 0) { + return ge::GRAPH_FAILED; + } + + uint32_t points_num = *(attrs->GetAttrPointer(0)); + uint32_t b = dist_shape.GetDim(0); + uint32_t n = dist_shape.GetDim(1); + + auto dtype_str = context->GetInputDesc(0)->GetDataType(); + + uint32_t dtype_bytes = 4; + uint32_t ele_per_block = 8; + + uint32_t task_num = (b - 1) / core_num + 1; + uint32_t used_core_num = (b - 1) / task_num + 1; + uint32_t task_num_tail = b % task_num; + if (task_num_tail == 0) { + task_num_tail = task_num; + } + + uint32_t batch_dist_offset = n * n; + uint32_t batch_idx_offset = points_num; + + uint32_t part = 5 * dtype_bytes + 1; + uint32_t part_ub = (UB_size - 20 * 1024) / part / 32 * 32; + + uint32_t move_n_times = (n - 1) / part_ub + 1; + uint32_t n_tail = n % part_ub; + if (n_tail == 0) { + n_tail = part_ub; + } + uint32_t id_move_len = 1024; + uint32_t repeat_id_times = (points_num - 1) / id_move_len + 1; + uint32_t id_tail = points_num % id_move_len; + if (id_tail == 0) { + id_tail = id_move_len; + } + uint32_t work_size = 1024 * 2; + context->SetBlockDim(used_core_num); + + tiling.set_used_core_num(used_core_num); + tiling.set_points_num(points_num); + tiling.set_task_num(task_num); + tiling.set_task_num_tail(task_num_tail); + tiling.set_n(n); + tiling.set_batch_dist_offset(batch_dist_offset); + tiling.set_batch_idx_offset(batch_idx_offset); + tiling.set_part_ub(part_ub); + tiling.set_move_n_times(move_n_times); + tiling.set_n_tail(n_tail); + tiling.set_id_move_len(id_move_len); + tiling.set_repeat_id_times(repeat_id_times); + tiling.set_id_tail(id_tail); + tiling.set_work_size(work_size); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* points_dist_shape = context->GetInputShape(0); + + auto attrs = context->GetAttrs(); + const int32_t* points_num = attrs->GetAttrPointer(0); + + gert::Shape* idx_shape = context->GetOutputShape(0); + idx_shape->AppendDim(points_dist_shape->GetDim(0)); + idx_shape->AppendDim(*points_num); + return GRAPH_SUCCESS; +} +} + + +namespace ops { +class FurthestPointSamplingWithDist : public OpDef { +public: + explicit FurthestPointSamplingWithDist(const char* name) : OpDef(name) + { + this->Input("points_dist") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("nearest_temp") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("index") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Attr("num_points").Int(); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingForFurthestPointSamplingWithDist); + this->AICore().AddConfig("ascend910b"); + } +}; + +OP_ADD(FurthestPointSamplingWithDist); +} diff --git a/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..70c8191741bc64e5230d641fb368cc81c24fa512 --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H +#define FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H + +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(FurthestPointSamplingWithDistTilingData) + + TILING_DATA_FIELD_DEF(uint32_t, used_core_num) + TILING_DATA_FIELD_DEF(uint32_t, points_num) + TILING_DATA_FIELD_DEF(uint32_t, task_num) + TILING_DATA_FIELD_DEF(uint32_t, task_num_tail) + TILING_DATA_FIELD_DEF(uint32_t, n) + TILING_DATA_FIELD_DEF(uint32_t, batch_dist_offset) + TILING_DATA_FIELD_DEF(uint32_t, batch_idx_offset) + TILING_DATA_FIELD_DEF(uint32_t, part_ub) + TILING_DATA_FIELD_DEF(uint32_t, move_n_times) + TILING_DATA_FIELD_DEF(uint32_t, n_tail) + TILING_DATA_FIELD_DEF(uint32_t, id_move_len) + TILING_DATA_FIELD_DEF(uint32_t, repeat_id_times) + TILING_DATA_FIELD_DEF(uint32_t, id_tail) + TILING_DATA_FIELD_DEF(uint32_t, work_size) + +END_TILING_DATA_DEF + +REGISTER_TILING_DATA_CLASS(FurthestPointSamplingWithDist, FurthestPointSamplingWithDistTilingData) +} // namespace optiling + +#endif // FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H diff --git a/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp b/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9cc5340a7121d7c092b4353a9f6eac27f81f4130 --- /dev/null +++ b/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. + * + */ +#include "kernel_operator.h" +#include "kernel_tiling/kernel_tiling.h" +#include "kernel_utils.h" + +using namespace AscendC; + +constexpr uint32_t BUFFER_NUM = 1u; +constexpr uint32_t BLOCK_BYTE_SIZE = 32u; + + +template +class KernelFurthestPointSamplingWithDist { +public: + __aicore__ inline KernelFurthestPointSamplingWithDist() = default; + __aicore__ inline void Init(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR index, GM_ADDR workspace, + const FurthestPointSamplingWithDistTilingData* __restrict tiling); + __aicore__ inline void Process(); +private: + __aicore__ inline void init_tiling_value(); + __aicore__ inline void ProcessEachBatch(uint32_t batch_id); + __aicore__ inline void PointSampling(uint32_t id_times, uint32_t id_len); + __aicore__ inline void CalculatePartUb(uint32_t n_times, uint32_t n_len); + __aicore__ inline void CopyIn(uint32_t n_times, uint32_t n_len); + __aicore__ inline void Compute(uint32_t n_times, uint32_t n_len); + __aicore__ inline void CopyOutDist(uint32_t n_times, uint32_t n_len); + __aicore__ inline void CopyOut(uint32_t id_times, uint32_t id_len); + __aicore__ inline uint32_t CeilDiv(uint32_t x, uint32_t y); + +private: + TPipe pipe; + TQue points_dist_queue; + TQue temp_queue; + + TQue temp_out_queue; + TQue idx_queue; + + TBuf work_Buf; + TBuf dist_Buf; + + GlobalTensor points_dist_gm; + GlobalTensor idx_gm; + GlobalTensor temp_gm; + + uint32_t used_core_num {0}; + uint32_t points_num {0}; + uint32_t task_num {0}; + uint32_t task_num_tail {0}; + uint32_t n {0}; + uint32_t batch_dist_offset {0}; + uint32_t batch_idx_offset {0}; + uint32_t part_ub {0}; + uint32_t move_n_times {0}; + uint32_t n_tail {0}; + uint32_t repeat_id_times {0}; + uint32_t id_move_len {0}; + uint32_t id_tail {0}; + uint32_t work_size {0}; + + int32_t last_idx {-1}; + uint32_t batch_size {0}; + uint32_t data_type_size {0}; + uint32_t idx_type_size {0}; + uint32_t mask_type_size {0}; + uint32_t dist_begin_offset {0}; + uint32_t temp_begin_offset {0}; + uint32_t idx_begin_offset {0}; + uint32_t now_max_dim {0}; + uint32_t block_size {32}; + uint32_t block_per_size {0}; + float now_max_dist {0}; + + const FurthestPointSamplingWithDistTilingData* __restrict tiling_device {nullptr}; +}; + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::Init(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR idx, GM_ADDR workspace, const FurthestPointSamplingWithDistTilingData* __restrict tiling) { + // init tiling + this->tiling_device = tiling; + init_tiling_value(); + + uint32_t core_id = GetBlockIdx(); + + uint32_t batch_begin_offset = core_id * task_num; + + // 判断是否为尾核 + bool is_last_core = (core_id == (used_core_num - 1)); + if (!is_last_core) { + batch_size = task_num; + } + else { + batch_size = task_num_tail; + } + + // calculate begin offset + int32_t gm_dist_begin_offset = batch_begin_offset * batch_dist_offset; + int32_t gm_temp_begin_offset = batch_begin_offset * n; + int32_t gm_idx_begin_offset = batch_begin_offset * batch_idx_offset; + + // set LocalTensor base addr + this->points_dist_gm.SetGlobalBuffer((__gm__ dataType *)points_dist + gm_dist_begin_offset, batch_size * batch_dist_offset); + this->temp_gm.SetGlobalBuffer((__gm__ dataType *)nearest_temp + gm_temp_begin_offset, batch_size * n); + this->idx_gm.SetGlobalBuffer((__gm__ idxType *)idx + gm_idx_begin_offset, batch_size * batch_idx_offset); + + data_type_size = sizeof(dataType); + idx_type_size = sizeof(idxType); + mask_type_size = sizeof(uint8_t); + block_per_size = block_size / data_type_size; + + this->pipe.InitBuffer(this->points_dist_queue, BUFFER_NUM, part_ub * data_type_size); + this->pipe.InitBuffer(this->temp_queue, BUFFER_NUM, part_ub * data_type_size); + + this->pipe.InitBuffer(this->temp_out_queue, BUFFER_NUM, part_ub * data_type_size); + this->pipe.InitBuffer(this->idx_queue, BUFFER_NUM, id_move_len * idx_type_size); + + this->pipe.InitBuffer(this->dist_Buf, block_size); + this->pipe.InitBuffer(this->work_Buf, work_size * data_type_size); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::Process() { + for (uint32_t batch_id = 0; batch_id < batch_size; ++batch_id) { + last_idx = -1; + ProcessEachBatch(batch_id); + } +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::init_tiling_value() { + used_core_num = tiling_device->used_core_num; + points_num = tiling_device->points_num; + task_num = tiling_device->task_num; + task_num_tail = tiling_device->task_num_tail; + n = tiling_device->n; + batch_dist_offset = tiling_device->batch_dist_offset; + batch_idx_offset = tiling_device->batch_idx_offset; + part_ub = tiling_device->part_ub; + move_n_times = tiling_device->move_n_times; + n_tail = tiling_device->n_tail; + id_move_len = tiling_device->id_move_len; + repeat_id_times = tiling_device->repeat_id_times; + id_tail = tiling_device->id_tail; + work_size = tiling_device->work_size; +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::ProcessEachBatch(uint32_t batch_id) { + dist_begin_offset = batch_id * batch_dist_offset; + temp_begin_offset = batch_id * n; + idx_begin_offset = batch_id * batch_idx_offset; + + for (uint32_t id_times = 0; id_times < repeat_id_times - 1; ++id_times) { + PointSampling(id_times, id_move_len); + } + PointSampling(repeat_id_times - 1, id_tail); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::PointSampling(uint32_t id_times, uint32_t id_len) { + LocalTensor idx_local = idx_queue.AllocTensor(); + + for (uint32_t i = 0; i < id_len; i++) { + now_max_dim = last_idx; + now_max_dist = 0; + for (uint32_t j = 0; j < move_n_times - 1; j++) { + CalculatePartUb(j, part_ub); + } + CalculatePartUb(move_n_times - 1, n_tail); + last_idx = now_max_dim; + idx_local.SetValue(i, now_max_dim); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + idx_queue.EnQue(idx_local); + CopyOut(id_times, id_len); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::CalculatePartUb(uint32_t n_times, uint32_t n_len) { + CopyIn(n_times, n_len); + pipe_barrier(PIPE_ALL); + Compute(n_times, n_len); + CopyOutDist(n_times, n_len); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::CopyIn(uint32_t n_times, uint32_t n_len) { + LocalTensor dist1_local = temp_queue.AllocTensor(); + LocalTensor dist2_local = points_dist_queue.AllocTensor(); + // calculate offset + uint32_t dist1_offset = temp_begin_offset + n_times * part_ub; + uint32_t dist2_offset = dist_begin_offset + last_idx * n + n_times * part_ub; + uint32_t move_len = CeilDiv(n_len, block_per_size) * block_per_size; + // data copy + DataCopy(dist1_local, temp_gm[dist1_offset], move_len); + if (last_idx == -1) { + DataCopy(dist2_local, temp_gm[dist1_offset], move_len); + } + else { + DataCopy(dist2_local, points_dist_gm[dist2_offset], move_len); + } + temp_queue.EnQue(dist1_local); + points_dist_queue.EnQue(dist2_local); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::Compute(uint32_t n_times, uint32_t n_len) { + LocalTensor dist1_local = temp_queue.DeQue(); + LocalTensor dist2_local = points_dist_queue.DeQue(); + LocalTensor dist3_local = temp_out_queue.AllocTensor(); + + LocalTensor workLocal = work_Buf.Get(); + LocalTensor dstLocal = dist_Buf.Get(); + + Min(dist3_local, dist1_local, dist2_local, n_len); + // calculate reduce_max + ReduceMax(dstLocal, dist3_local, workLocal, n_len, true); + float dist = dstLocal.GetValue(0); + LocalTensor idx_int32 = dstLocal.template ReinterpretCast(); + int32_t idx = idx_int32.GetValue(1); + + if (dist > now_max_dist) { + now_max_dist = dist; + now_max_dim = idx + n_times * part_ub; + } + + // enque + temp_out_queue.EnQue(dist3_local); + + temp_queue.FreeTensor(dist1_local); + points_dist_queue.FreeTensor(dist2_local); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::CopyOutDist(uint32_t n_times, uint32_t n_len) { + LocalTensor temp_local = temp_out_queue.DeQue(); + + uint32_t dist1_offset = temp_begin_offset + n_times * part_ub; + + DataCopyParams copyParams{1, (uint16_t)(n_len * sizeof(dataType)), 0, 0}; + DataCopyPad(temp_gm[dist1_offset], temp_local, copyParams); + + temp_out_queue.FreeTensor(temp_local); +} + +template +__aicore__ inline void KernelFurthestPointSamplingWithDist::CopyOut(uint32_t id_times, uint32_t id_len) { + LocalTensor idx_local = idx_queue.DeQue(); + + uint32_t idx_offset = idx_begin_offset + id_times * id_move_len; + + DataCopyParams copyParams{1, (uint16_t)(id_len * sizeof(idxType)), 0, 0}; + DataCopyPad(idx_gm[idx_offset], idx_local, copyParams); + idx_queue.FreeTensor(idx_local); +} + +template +__aicore__ inline uint32_t KernelFurthestPointSamplingWithDist::CeilDiv(uint32_t x, uint32_t y) { + return y == 0 ? x : (x + y - 1) / y; +} + +extern "C" __global__ __aicore__ void furthest_point_sampling_with_dist(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR index, GM_ADDR workspace, GM_ADDR tiling) { + if (workspace == nullptr) { + return; + } + SetSysWorkspace(workspace); + GM_ADDR user_ws = GetUserWorkspace(workspace); + if (user_ws == nullptr) { + return; + } + + GET_TILING_DATA(tiling_data, tiling); + const FurthestPointSamplingWithDistTilingData* __restrict tiling_device = &tiling_data; + KernelFurthestPointSamplingWithDist op; + op.Init(points_dist, nearest_temp, index, user_ws, tiling_device); + op.Process(); +} diff --git a/ads/common/ops/kernels/inc/base.h b/ads/common/ops/kernels/inc/base.h index aa3b9cee948377c5ac16455a0f43c17f56d669ea..28987cf5691fb4df7f44e725fdc40e8a630eea67 100644 --- a/ads/common/ops/kernels/inc/base.h +++ b/ads/common/ops/kernels/inc/base.h @@ -2,4 +2,10 @@ // .INPUT(x1, TensorType({DT_FLOAT})) // .INPUT(x2, TensorType({DT_FLOAT})) // .OUTPUT(y, TensorType({DT_FLOAT})) -// .OP_END_FACTORY_REG(Add) \ No newline at end of file +// .OP_END_FACTORY_REG(Add) +// REG_OP(FurthestPointSamplingWithDist) +// .INPUT(points_dist, TensorType({DT_FLOAT})) +// .INPUT(nearest_temp, TensorType({DT_FLOAT})) +// .OUTPUT(index, TensorType({DT_INT32})) +// .REQUIRED_ATTR(num_points, Int) +// .OP_END_FACTORY_REG(FurthestPointSamplingWithDist) diff --git a/tests/test_furthest_point_sample_with_dist.py b/tests/test_furthest_point_sample_with_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..be2fb2d3e2285ebdf7710e9407a7cdff778dca9d --- /dev/null +++ b/tests/test_furthest_point_sample_with_dist.py @@ -0,0 +1,89 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import torch +import numpy as np + +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +import ads.common + +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +class TestFurthestPointSampleWithDist(TestCase): + + def create_input_data(self, shape): + b, n = shape + point_xyz = np.random.uniform(0, 10, [b, n, 3]).astype(np.float32) + point_dist = np.zeros([b, n, n]).astype(np.float32) + for batch_id in range(b): + for src_id in range(n): + x1, y1, z1 = point_xyz[batch_id, src_id] + for dst_id in range(n): + x2, y2, z2 = point_xyz[batch_id, dst_id] + point_dist[batch_id, src_id, dst_id] = point_dist[batch_id, src_id, dst_id] =\ + (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2) + return point_dist + + def compare_min(self, a, b): + if a > b: + return b + else: + return a + + def supported_op_exec(self, point_dist, point_num): + b, n, _ = point_dist.shape + tmp = np.zeros([b, n]).astype(np.float32) + result_cpu = np.zeros([b, point_num]).astype(np.int32) + for batch in range(b): + for i in range(n): + tmp[batch, i] = point_dist[batch, 0, i] + for idx in range(1, point_num): + best = 0 + best_i = 0 + last_time_idx = result_cpu[batch, idx - 1] + for i in range(n): + tmp[batch, i] = self.compare_min(point_dist[batch, last_time_idx, i], tmp[batch, i]) + if(best < tmp[batch, i]): + best = tmp[batch, i] + best_i = i + result_cpu[batch, idx] = best_i + return result_cpu + + def custom_op_exec(self, point_dist, point_num, input_dtype): + point_dist_npu = torch.tensor(point_dist, dtype=input_dtype).npu() + output = ads.common.furthest_point_sample_with_dist(point_dist_npu, point_num) + return output.cpu().numpy() + + @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `FurthestPointSampleWithDist` is only supported on 910B, skip this ut!") + def test_FurthestPointSampleWithDist(self): + shape_list = [[4, 100], [30, 1000]] + point_num_list = [32, 1000] + dtype_list = [torch.float32, torch.float32] + for idx in range(2): + shape = shape_list[idx] + point_num = point_num_list[idx] + input_dtype = dtype_list[idx] + point_dist = self.create_input_data(shape) + + exoutput = self.supported_op_exec(point_dist, point_num) + + output = self.custom_op_exec(point_dist, point_num, input_dtype) + + self.assertRtolEqual(exoutput, output) + +if __name__ == "__main__": + run_tests() \ No newline at end of file