diff --git a/README.md b/README.md
index 813f1bce23cd61d357fd962687844d7e8a92d8a0..35988550e488d2e43b7ced87511d35d07d2aaa6c 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,94 @@
-## Description
+### 简介
 
-+ This repo provides some speed-up abilites for AI-models in autonomous driving system, which can help these models running fatser on ascend device.
+本项目基于昇腾NPU开发了用于自动驾驶场景的高性能算子
+
+### 编译、安装ADS
+
+#### 发布包安装
+
+暂未正式发布
+
+#### 源码安装
+
+**安装依赖**
+
+> 安装对应的版本的torch、torch_npu、cann包，具体配套关系见pytorch仓(https://gitee.com/ascend/pytorch)首页readme 
+>
+> 并source cann包环境变量
+
+##### 下载ADS
+
+```shell
+# 下载ads仓
+git clone https://gitee.com/ascend/ads.git
+```
+
+##### 编译ADS
+
+```shell
+# 编译
+bash ads/ci/build.sh --python=3.7
+```
+
+| 架构    | pytorch版本  | 出包版本                                                 |
+| ------- | ------------ | -------------------------------------------------------- |
+| x86     | pytorch1.11  | Python3.7(\>=3.7.5)， Python3.8， Python3.9， Python3.10 |
+| x86     | pytorch2.0.1 | Python3.8， Python3.9， Python3.10                       |
+| x86     | pytorch2.1.0 | Python3.8， Python3.9， Python3.10                       |
+| aarch64 | pytorch1.11  | Python3.7(\>=3.7.5)， Python3.8， Python3.9， Python3.10 |
+| aarch64 | pytorch2.0.1 | Python3.8， Python3.9， Python3.10                       |
+| aarch64 | pytorch2.1.0 | Python3.8， Python3.9， Python3.10                       |
+
+| 参数   | 取值范围                                                     | 说明                           | 缺省值 | 备注                                           |
+| ------ | ------------------------------------------------------------ | ------------------------------ | ------ | ---------------------------------------------- |
+| python | pytorch1.11，支持3.7及以上；pytorch1.11以上版本，支持3.8及以上 | 指定编译过程中使用的python版本 | 3.7    | 仅pytorch版本为1.11时才支持指定python版本为3.7 |
+
+##### 安装ADS
+
+```shell
+cd ads/dist
+pip3 install ads-1.0-cp37-cp37m-linux_aarch64.whl
+```
+
+#### CMC取包安装
+
+当前ADS包还未商发，需到https://cmc-szv.clouddragon.huawei.com/cmcversion/index/search 搜索 FrameworkPTAdapter V100R001C01B001 取最新的包即可，注意需要根据环境的torch版本和python版本选择下载，如 ADS_v1.11.0_py37.tar.gz，其中v1.11.0表示torch版本，py37表示python版本。
+
+![img](file:///C:/Users/c30030097/AppData/Roaming/eSpace_Desktop/UserData/c30030097/imagefiles/originalImgfiles/8A534D96-078D-44F1-852C-E0B8CD4F3074.png)
+
+后续计划发包版本
+
+| 架构    | pytorch版本  | 出包版本                                                 |
+| ------- | ------------ | -------------------------------------------------------- |
+| x86     | pytorch1.11  | Python3.7(\>=3.7.5)， Python3.8， Python3.9， Python3.10 |
+| x86     | pytorch2.0.1 | Python3.8， Python3.9， Python3.10                       |
+| x86     | pytorch2.1.0 | Python3.8， Python3.9， Python3.10                       |
+| aarch64 | pytorch1.11  | Python3.7(\>=3.7.5)， Python3.8， Python3.9， Python3.10 |
+| aarch64 | pytorch2.0.1 | Python3.8， Python3.9， Python3.10                       |
+| aarch64 | pytorch2.1.0 | Python3.8， Python3.9， Python3.10                       |
+
+### ADS算子调用
+
+##### 设置环境变量
+
+注意：其中xxx表示当前环境上的python安装路径
+
+```bash
+export ASCEND_CUSTOM_OPP_PATH=xxx/site-packages/ads/common/ops/kernels/ads_op_kernel/packages/vendors/customize/
+export LD_LIBRARY_PATH=xxx/site-packages/ads/common/ops/kernels/ads_op_kernel/packages/vendors/customize/op_api/lib/:$LD_LIBRARY_PATH
+```
+
+算子调用
+
+```python
+import torch
+import torch_npu
+import numpy as np
+import ads.common
+device = torch.device("npu:5")
+a=torch.rand([8, 2048]).half().npu()
+b=torch.rand([8, 2048]).half().npu()
+c = ads.common.npu_ads_add(a,b)
+print(c)
+```
 
-+ The main three parts is as follows:
-    + **common**, general modules, which contain some fused pytorch modules of which provide speed-up ability for models runnning on ascend device. 
-    + **motion**, the motion prediction modules, which are used for trace-planning and motion-prediction. The modules contain some pytorch-custom-apis, the kernels of which are affinitive on ascend device.
-    + **perception**, the perception modules, which are used for 3D detection and segmentation. It helps recognize the components on the road.
\ No newline at end of file
diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index b88f4d130b86415b1be11840f0d18c75be1a9048..90a32a85c5470ac10249b297b78305379c57304e 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -10,6 +10,7 @@ from .ops.stride_add import npu_stride_add
 from .ops.transpose import npu_transpose
 from .ops.yolo_boxes_encode import npu_yolo_boxes_encode
 from .ops.scatter import npu_scatter
+from .ops.furthest_point_sampling_with_dist import furthest_point_sample_with_dist
 from .ops.silu import npu_silu
 from .ops.silu import npu_silu_
 from .ops.rotary_mul import npu_rotary_mul
diff --git a/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp b/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fea2f9486a311d5cb4742123f1a1d11db6b33b74
--- /dev/null
+++ b/ads/common/ops/csrc/FurthestPointSamplingWithDistKernelNpu.cpp
@@ -0,0 +1,30 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/ATen.h>
+#include "OpApiCommon.h"
+#include "functions.h"
+
+at::Tensor furthest_point_sampling_with_dist(const at::Tensor &points_dist, const at::Tensor &nearest_temp, const int32_t num_points)
+{
+    auto points_dist_size = points_dist.sizes();
+    int64_t b = points_dist_size[0];
+    int64_t num_points_real = num_points;
+    auto output_size = {b, num_points_real};
+    at::Tensor result = at::empty(output_size, points_dist.options().dtype(at::kInt));
+    EXEC_NPU_CMD(aclnnFurthestPointSamplingWithDist, points_dist, nearest_temp, num_points, result);
+    return result;
+}
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index 1384a1538dd30108b58583a791c79ad0bf2690b1..b8b930a0787578460a9a528bcf96cdff76ab9bda 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -114,6 +114,9 @@ at::Tensor npu_conv_transpose2d(
     at::IntArrayRef stride,
     at::IntArrayRef dilation,
     int64_t groups);
+
+at::Tensor furthest_point_sampling_with_dist(const at::Tensor &points_dist, const at::Tensor &nearest_temp, const int32_t num_points);
+
 at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size);
 at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result);
 at::Tensor npu_moe_tutel(
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index 5638ac6b09c986dba4e68e9fe99862475dbaa323..d9f153249d340ee6eb72ca32e763007ea64b773f 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -54,6 +54,9 @@ void init_common(pybind11::module &m)
     // npu_bounding_box_decode
     m.def("npu_bounding_box_decode", &npu_bounding_box_decode);
 
+    // furthest_points_sampling_with_dist
+    m.def("furthest_point_sampling_with_dist", &furthest_point_sampling_with_dist);
+    
     // npu_bounding_box_encode
     m.def("npu_bounding_box_encode", &npu_bounding_box_encode);
 
diff --git a/ads/common/ops/furthest_point_sampling_with_dist.py b/ads/common/ops/furthest_point_sampling_with_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..164ce9f6256401e95ddd887790aff7640271dcd9
--- /dev/null
+++ b/ads/common/ops/furthest_point_sampling_with_dist.py
@@ -0,0 +1,17 @@
+import torch
+from torch.autograd import Function
+from torch.nn import Module
+
+import torch_npu
+import ads_c
+
+
+class AdsFurthestPointSamplingWithDistFunction(Function):
+    @staticmethod
+    def forward(ctx, points_dist, num_points): 
+        B, N = points_dist.size()[:2]
+        nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10)
+        result = ads_c.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points)
+        return result
+
+furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply
diff --git a/ads/common/ops/kernels/ads_op/CMakePresets.json b/ads/common/ops/kernels/ads_op/CMakePresets.json
index add05853f7195befe9689580c8aaf0e3f02ce381..a23c07b8bf823cc052ddf980a835408a9e3b918a 100644
--- a/ads/common/ops/kernels/ads_op/CMakePresets.json
+++ b/ads/common/ops/kernels/ads_op/CMakePresets.json
@@ -27,7 +27,7 @@
                 },
                 "ASCEND_COMPUTE_UNIT": {
                     "type": "STRING",
-                    "value": "ascend310p;ascend910;ascend910b"
+                    "value": "ascend910b"
                 },
                 "ENABLE_TEST": {
                     "type": "BOOL",
diff --git a/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fa290dc5c7137099626ce0350f716c2eb66c61c
--- /dev/null
+++ b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist.cpp
@@ -0,0 +1,127 @@
+#include "furthest_point_sampling_with_dist_tiling.h"
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace optiling {
+static ge::graphStatus TilingForFurthestPointSamplingWithDist(gert::TilingContext* context)
+{
+    FurthestPointSamplingWithDistTilingData tiling;
+    auto platformInfo = context->GetPlatformInfo();
+    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
+    static uint32_t core_num = ascendcPlatform.GetCoreNumAiv();
+    uint64_t UB_size;
+    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, UB_size);
+    
+    auto dist_shape = context->GetInputShape(0)->GetStorageShape();
+    auto attrs = context->GetAttrs();
+    
+    if (core_num == 0) {
+        return ge::GRAPH_FAILED;
+    }
+    
+    uint32_t points_num = *(attrs->GetAttrPointer<int32_t>(0));
+    uint32_t b = dist_shape.GetDim(0);
+    uint32_t n = dist_shape.GetDim(1);
+    
+    auto dtype_str = context->GetInputDesc(0)->GetDataType();
+
+    uint32_t dtype_bytes = 4;
+    uint32_t ele_per_block = 8;
+
+    uint32_t task_num = (b - 1) / core_num + 1;
+    uint32_t used_core_num = (b - 1) / task_num + 1;
+    uint32_t task_num_tail = b % task_num;
+    if (task_num_tail == 0) {
+        task_num_tail = task_num;
+    }
+
+    uint32_t batch_dist_offset = n * n;
+    uint32_t batch_idx_offset = points_num;
+
+    uint32_t part = 5 * dtype_bytes + 1;
+    uint32_t part_ub = (UB_size - 20 * 1024) / part / 32 * 32;
+
+    uint32_t move_n_times = (n - 1) / part_ub + 1;
+    uint32_t n_tail = n % part_ub;
+    if (n_tail == 0) {
+        n_tail = part_ub;
+    }
+    uint32_t id_move_len = 1024;
+    uint32_t repeat_id_times = (points_num - 1) / id_move_len + 1;
+    uint32_t id_tail = points_num % id_move_len;
+    if (id_tail == 0) {
+        id_tail = id_move_len;
+    }
+    uint32_t work_size = 1024 * 2;
+    context->SetBlockDim(used_core_num);
+
+    tiling.set_used_core_num(used_core_num);
+    tiling.set_points_num(points_num);
+    tiling.set_task_num(task_num);
+    tiling.set_task_num_tail(task_num_tail);
+    tiling.set_n(n);
+    tiling.set_batch_dist_offset(batch_dist_offset);
+    tiling.set_batch_idx_offset(batch_idx_offset);
+    tiling.set_part_ub(part_ub);
+    tiling.set_move_n_times(move_n_times);
+    tiling.set_n_tail(n_tail);
+    tiling.set_id_move_len(id_move_len);
+    tiling.set_repeat_id_times(repeat_id_times);
+    tiling.set_id_tail(id_tail);
+    tiling.set_work_size(work_size);
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+
+    return ge::GRAPH_SUCCESS;
+}
+}
+
+
+namespace ge {
+static ge::graphStatus InferShape(gert::InferShapeContext* context)
+{
+    const gert::Shape* points_dist_shape = context->GetInputShape(0);
+
+    auto attrs = context->GetAttrs();
+    const int32_t* points_num = attrs->GetAttrPointer<int32_t>(0);
+
+    gert::Shape* idx_shape = context->GetOutputShape(0);
+    idx_shape->AppendDim(points_dist_shape->GetDim(0));
+    idx_shape->AppendDim(*points_num);
+    return GRAPH_SUCCESS;
+}
+}
+
+
+namespace ops {
+class FurthestPointSamplingWithDist : public OpDef {
+public:
+    explicit FurthestPointSamplingWithDist(const char* name) : OpDef(name)
+    {
+        this->Input("points_dist")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Input("nearest_temp")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Output("index")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_INT32})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Attr("num_points").Int();
+
+        this->SetInferShape(ge::InferShape);
+
+        this->AICore()
+            .SetTiling(optiling::TilingForFurthestPointSamplingWithDist);
+        this->AICore().AddConfig("ascend910b");
+    }
+};
+
+OP_ADD(FurthestPointSamplingWithDist);
+}
diff --git a/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..70c8191741bc64e5230d641fb368cc81c24fa512
--- /dev/null
+++ b/ads/common/ops/kernels/ads_op/op_host/furthest_point_sampling_with_dist_tiling.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */
+#ifndef FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H
+#define FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H
+
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(FurthestPointSamplingWithDistTilingData)
+
+    TILING_DATA_FIELD_DEF(uint32_t, used_core_num)
+    TILING_DATA_FIELD_DEF(uint32_t, points_num)
+    TILING_DATA_FIELD_DEF(uint32_t, task_num)
+    TILING_DATA_FIELD_DEF(uint32_t, task_num_tail)
+    TILING_DATA_FIELD_DEF(uint32_t, n)
+    TILING_DATA_FIELD_DEF(uint32_t, batch_dist_offset)
+    TILING_DATA_FIELD_DEF(uint32_t, batch_idx_offset)
+    TILING_DATA_FIELD_DEF(uint32_t, part_ub)
+    TILING_DATA_FIELD_DEF(uint32_t, move_n_times)
+    TILING_DATA_FIELD_DEF(uint32_t, n_tail)
+    TILING_DATA_FIELD_DEF(uint32_t, id_move_len)
+    TILING_DATA_FIELD_DEF(uint32_t, repeat_id_times)
+    TILING_DATA_FIELD_DEF(uint32_t, id_tail)
+    TILING_DATA_FIELD_DEF(uint32_t, work_size)
+
+END_TILING_DATA_DEF
+
+REGISTER_TILING_DATA_CLASS(FurthestPointSamplingWithDist, FurthestPointSamplingWithDistTilingData)
+} // namespace optiling
+
+#endif // FURTHEST_POINT_SAMPLING_WITH_DIST_TILING_H
diff --git a/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp b/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9cc5340a7121d7c092b4353a9f6eac27f81f4130
--- /dev/null
+++ b/ads/common/ops/kernels/ads_op/op_kernel/furthest_point_sampling_with_dist.cpp
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
+ *
+ */
+#include "kernel_operator.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "kernel_utils.h"
+
+using namespace AscendC;
+
+constexpr uint32_t BUFFER_NUM = 1u;
+constexpr uint32_t BLOCK_BYTE_SIZE = 32u;
+
+
+template <typename dataType, typename idxType>
+class KernelFurthestPointSamplingWithDist {
+public:
+    __aicore__ inline KernelFurthestPointSamplingWithDist() = default;
+    __aicore__ inline void Init(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR index, GM_ADDR workspace,
+                            const FurthestPointSamplingWithDistTilingData* __restrict tiling);
+    __aicore__ inline void Process();
+private:
+    __aicore__ inline void init_tiling_value();
+    __aicore__ inline void ProcessEachBatch(uint32_t batch_id);
+    __aicore__ inline void PointSampling(uint32_t id_times, uint32_t id_len);
+    __aicore__ inline void CalculatePartUb(uint32_t n_times, uint32_t n_len);
+    __aicore__ inline void CopyIn(uint32_t n_times, uint32_t n_len);
+    __aicore__ inline void Compute(uint32_t n_times, uint32_t n_len);
+    __aicore__ inline void CopyOutDist(uint32_t n_times, uint32_t n_len);
+    __aicore__ inline void CopyOut(uint32_t id_times, uint32_t id_len);
+    __aicore__ inline uint32_t CeilDiv(uint32_t x, uint32_t y);
+    
+private:
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> points_dist_queue;
+    TQue<QuePosition::VECIN, BUFFER_NUM> temp_queue;
+
+    TQue<QuePosition::VECOUT, BUFFER_NUM> temp_out_queue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> idx_queue;
+
+    TBuf<TPosition::VECCALC> work_Buf;
+    TBuf<TPosition::VECCALC> dist_Buf;
+
+    GlobalTensor<dataType> points_dist_gm;
+    GlobalTensor<idxType> idx_gm;
+    GlobalTensor<dataType> temp_gm;
+
+    uint32_t used_core_num {0};
+    uint32_t points_num {0};
+    uint32_t task_num {0};
+    uint32_t task_num_tail {0};
+    uint32_t n {0};
+    uint32_t batch_dist_offset {0};
+    uint32_t batch_idx_offset {0};
+    uint32_t part_ub {0};
+    uint32_t move_n_times {0};
+    uint32_t n_tail {0};
+    uint32_t repeat_id_times {0};
+    uint32_t id_move_len {0};
+    uint32_t id_tail {0};
+    uint32_t work_size {0};
+
+    int32_t last_idx {-1};
+    uint32_t batch_size {0};
+    uint32_t data_type_size {0};
+    uint32_t idx_type_size {0};
+    uint32_t mask_type_size {0};
+    uint32_t dist_begin_offset {0};
+    uint32_t temp_begin_offset {0};
+    uint32_t idx_begin_offset {0};
+    uint32_t now_max_dim {0};
+    uint32_t block_size {32};
+    uint32_t block_per_size {0};
+    float now_max_dist {0};
+
+    const FurthestPointSamplingWithDistTilingData* __restrict tiling_device {nullptr};
+};
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::Init(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR idx, GM_ADDR workspace, const FurthestPointSamplingWithDistTilingData* __restrict tiling) {
+    // init tiling
+    this->tiling_device = tiling;
+    init_tiling_value();
+
+    uint32_t core_id = GetBlockIdx();
+
+    uint32_t batch_begin_offset = core_id * task_num;
+
+    // 判断是否为尾核
+    bool is_last_core = (core_id == (used_core_num - 1));
+    if (!is_last_core) {
+        batch_size = task_num;
+    }
+    else {
+        batch_size = task_num_tail;
+    }
+
+    // calculate begin offset
+    int32_t gm_dist_begin_offset = batch_begin_offset * batch_dist_offset;
+    int32_t gm_temp_begin_offset = batch_begin_offset * n;
+    int32_t gm_idx_begin_offset = batch_begin_offset * batch_idx_offset;
+    
+    // set LocalTensor base addr
+    this->points_dist_gm.SetGlobalBuffer((__gm__ dataType *)points_dist + gm_dist_begin_offset, batch_size * batch_dist_offset);
+    this->temp_gm.SetGlobalBuffer((__gm__ dataType *)nearest_temp + gm_temp_begin_offset, batch_size * n);
+    this->idx_gm.SetGlobalBuffer((__gm__ idxType *)idx + gm_idx_begin_offset, batch_size * batch_idx_offset);
+
+    data_type_size = sizeof(dataType);
+    idx_type_size = sizeof(idxType);
+    mask_type_size = sizeof(uint8_t);
+    block_per_size = block_size / data_type_size;
+
+    this->pipe.InitBuffer(this->points_dist_queue, BUFFER_NUM, part_ub * data_type_size);
+    this->pipe.InitBuffer(this->temp_queue, BUFFER_NUM, part_ub * data_type_size);
+    
+    this->pipe.InitBuffer(this->temp_out_queue, BUFFER_NUM, part_ub * data_type_size);
+    this->pipe.InitBuffer(this->idx_queue, BUFFER_NUM, id_move_len * idx_type_size);
+
+    this->pipe.InitBuffer(this->dist_Buf, block_size);
+    this->pipe.InitBuffer(this->work_Buf, work_size * data_type_size);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::Process() {
+    for (uint32_t batch_id = 0; batch_id < batch_size; ++batch_id) {
+        last_idx = -1;
+        ProcessEachBatch(batch_id);
+    }
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::init_tiling_value() {
+    used_core_num = tiling_device->used_core_num;
+    points_num = tiling_device->points_num;
+    task_num = tiling_device->task_num;
+    task_num_tail = tiling_device->task_num_tail;
+    n = tiling_device->n;
+    batch_dist_offset = tiling_device->batch_dist_offset;
+    batch_idx_offset = tiling_device->batch_idx_offset;
+    part_ub = tiling_device->part_ub;
+    move_n_times = tiling_device->move_n_times;
+    n_tail = tiling_device->n_tail;
+    id_move_len = tiling_device->id_move_len;
+    repeat_id_times = tiling_device->repeat_id_times;
+    id_tail = tiling_device->id_tail;
+    work_size = tiling_device->work_size;
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::ProcessEachBatch(uint32_t batch_id) {
+    dist_begin_offset = batch_id * batch_dist_offset;
+    temp_begin_offset = batch_id * n;
+    idx_begin_offset = batch_id * batch_idx_offset;
+    
+    for (uint32_t id_times = 0; id_times < repeat_id_times - 1; ++id_times) {
+        PointSampling(id_times, id_move_len);
+    }
+    PointSampling(repeat_id_times - 1, id_tail);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::PointSampling(uint32_t id_times, uint32_t id_len) {
+    LocalTensor<idxType> idx_local = idx_queue.AllocTensor<idxType>();
+
+    for (uint32_t i = 0; i < id_len; i++) {
+        now_max_dim = last_idx;
+        now_max_dist = 0;
+        for (uint32_t j = 0; j < move_n_times - 1; j++) {
+            CalculatePartUb(j, part_ub);
+        }
+        CalculatePartUb(move_n_times - 1, n_tail);
+        last_idx = now_max_dim;
+        idx_local.SetValue(i, now_max_dim);
+        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
+    }
+    idx_queue.EnQue<idxType>(idx_local);
+    CopyOut(id_times, id_len);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::CalculatePartUb(uint32_t n_times, uint32_t n_len) {
+    CopyIn(n_times, n_len);
+    pipe_barrier(PIPE_ALL);
+    Compute(n_times, n_len);
+    CopyOutDist(n_times, n_len);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::CopyIn(uint32_t n_times, uint32_t n_len) {
+    LocalTensor<dataType> dist1_local = temp_queue.AllocTensor<dataType>();
+    LocalTensor<dataType> dist2_local = points_dist_queue.AllocTensor<dataType>();
+    // calculate offset
+    uint32_t dist1_offset = temp_begin_offset + n_times * part_ub;
+    uint32_t dist2_offset = dist_begin_offset + last_idx * n + n_times * part_ub;
+    uint32_t move_len = CeilDiv(n_len, block_per_size) * block_per_size;
+    // data copy
+    DataCopy(dist1_local, temp_gm[dist1_offset], move_len);
+    if (last_idx == -1) {
+        DataCopy(dist2_local, temp_gm[dist1_offset], move_len);
+    }
+    else {
+        DataCopy(dist2_local, points_dist_gm[dist2_offset], move_len);
+    }
+    temp_queue.EnQue<dataType>(dist1_local);
+    points_dist_queue.EnQue<dataType>(dist2_local);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::Compute(uint32_t n_times, uint32_t n_len) {
+    LocalTensor<dataType> dist1_local = temp_queue.DeQue<dataType>();
+    LocalTensor<dataType> dist2_local = points_dist_queue.DeQue<dataType>();
+    LocalTensor<dataType> dist3_local = temp_out_queue.AllocTensor<dataType>();
+
+    LocalTensor<dataType> workLocal = work_Buf.Get<dataType>();
+    LocalTensor<dataType> dstLocal = dist_Buf.Get<dataType>();
+
+    Min(dist3_local, dist1_local, dist2_local, n_len);
+    // calculate reduce_max
+    ReduceMax<dataType>(dstLocal, dist3_local, workLocal, n_len, true);
+    float dist = dstLocal.GetValue(0);
+    LocalTensor<uint32_t> idx_int32 = dstLocal.template ReinterpretCast<uint32_t>();
+    int32_t idx = idx_int32.GetValue(1);
+    
+    if (dist > now_max_dist) {
+        now_max_dist = dist;
+        now_max_dim = idx + n_times * part_ub;
+    }
+    
+    // enque
+    temp_out_queue.EnQue<dataType>(dist3_local);
+
+    temp_queue.FreeTensor(dist1_local);
+    points_dist_queue.FreeTensor(dist2_local);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::CopyOutDist(uint32_t n_times, uint32_t n_len) {
+    LocalTensor<dataType> temp_local = temp_out_queue.DeQue<dataType>();
+
+    uint32_t dist1_offset = temp_begin_offset + n_times * part_ub;
+
+    DataCopyParams copyParams{1, (uint16_t)(n_len * sizeof(dataType)), 0, 0};
+    DataCopyPad(temp_gm[dist1_offset], temp_local, copyParams);
+    
+    temp_out_queue.FreeTensor(temp_local);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline void KernelFurthestPointSamplingWithDist<dataType, idxType>::CopyOut(uint32_t id_times, uint32_t id_len) {
+    LocalTensor<idxType> idx_local = idx_queue.DeQue<idxType>();
+
+    uint32_t idx_offset = idx_begin_offset + id_times * id_move_len;
+
+    DataCopyParams copyParams{1, (uint16_t)(id_len * sizeof(idxType)), 0, 0};
+    DataCopyPad(idx_gm[idx_offset], idx_local, copyParams);
+    idx_queue.FreeTensor(idx_local);
+}
+
+template<typename dataType, typename idxType>
+__aicore__ inline uint32_t KernelFurthestPointSamplingWithDist<dataType, idxType>::CeilDiv(uint32_t x, uint32_t y) {
+    return y == 0 ? x : (x + y - 1) / y;
+}
+
+extern "C" __global__ __aicore__ void furthest_point_sampling_with_dist(GM_ADDR points_dist, GM_ADDR nearest_temp, GM_ADDR index, GM_ADDR workspace, GM_ADDR tiling) {
+    if (workspace == nullptr) {
+        return;
+    }
+    SetSysWorkspace(workspace);
+    GM_ADDR user_ws = GetUserWorkspace(workspace);
+    if (user_ws == nullptr) {
+        return;
+    }
+    
+    GET_TILING_DATA(tiling_data, tiling);
+    const FurthestPointSamplingWithDistTilingData* __restrict tiling_device = &tiling_data;
+    KernelFurthestPointSamplingWithDist<float, int32_t> op;
+    op.Init(points_dist, nearest_temp, index, user_ws, tiling_device);
+    op.Process();
+}
diff --git a/ads/common/ops/kernels/inc/base.h b/ads/common/ops/kernels/inc/base.h
index aa3b9cee948377c5ac16455a0f43c17f56d669ea..28987cf5691fb4df7f44e725fdc40e8a630eea67 100644
--- a/ads/common/ops/kernels/inc/base.h
+++ b/ads/common/ops/kernels/inc/base.h
@@ -2,4 +2,10 @@
 //     .INPUT(x1, TensorType({DT_FLOAT}))
 //     .INPUT(x2, TensorType({DT_FLOAT}))
 //     .OUTPUT(y, TensorType({DT_FLOAT}))
-//     .OP_END_FACTORY_REG(Add)
\ No newline at end of file
+//     .OP_END_FACTORY_REG(Add)
+// REG_OP(FurthestPointSamplingWithDist)
+//     .INPUT(points_dist, TensorType({DT_FLOAT}))
+//     .INPUT(nearest_temp, TensorType({DT_FLOAT}))
+//     .OUTPUT(index, TensorType({DT_INT32}))
+//     .REQUIRED_ATTR(num_points, Int)
+//     .OP_END_FACTORY_REG(FurthestPointSamplingWithDist)
diff --git a/tests/test_furthest_point_sample_with_dist.py b/tests/test_furthest_point_sample_with_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..be2fb2d3e2285ebdf7710e9407a7cdff778dca9d
--- /dev/null
+++ b/tests/test_furthest_point_sample_with_dist.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import torch
+import numpy as np
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
+
+
+class TestFurthestPointSampleWithDist(TestCase):
+
+    def create_input_data(self, shape):
+        b, n = shape
+        point_xyz = np.random.uniform(0, 10, [b, n, 3]).astype(np.float32)
+        point_dist = np.zeros([b, n, n]).astype(np.float32)
+        for batch_id in range(b):
+            for src_id in range(n):
+                x1, y1, z1 = point_xyz[batch_id, src_id]
+                for dst_id in range(n):
+                    x2, y2, z2 = point_xyz[batch_id, dst_id]
+                    point_dist[batch_id, src_id, dst_id] = point_dist[batch_id, src_id, dst_id] =\
+                    (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2)
+        return point_dist
+    
+    def compare_min(self, a, b):
+        if a > b:
+            return b
+        else:
+            return a
+    
+    def supported_op_exec(self, point_dist, point_num):
+        b, n, _ = point_dist.shape
+        tmp = np.zeros([b, n]).astype(np.float32)
+        result_cpu = np.zeros([b, point_num]).astype(np.int32)
+        for batch in range(b):
+            for i in range(n):
+                tmp[batch, i] = point_dist[batch, 0, i]
+            for idx in range(1, point_num):
+                best = 0
+                best_i = 0
+                last_time_idx = result_cpu[batch, idx - 1]
+                for i in range(n):
+                    tmp[batch, i] = self.compare_min(point_dist[batch, last_time_idx, i], tmp[batch, i])
+                    if(best < tmp[batch, i]):
+                        best = tmp[batch, i]
+                        best_i = i
+                result_cpu[batch, idx] = best_i
+        return result_cpu
+
+    def custom_op_exec(self, point_dist, point_num, input_dtype):
+        point_dist_npu = torch.tensor(point_dist, dtype=input_dtype).npu()
+        output = ads.common.furthest_point_sample_with_dist(point_dist_npu, point_num)
+        return output.cpu().numpy()
+    
+    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `FurthestPointSampleWithDist` is only supported on 910B, skip this ut!")   
+    def test_FurthestPointSampleWithDist(self):
+        shape_list = [[4, 100], [30, 1000]]
+        point_num_list = [32, 1000]
+        dtype_list = [torch.float32, torch.float32]
+        for idx in range(2):
+            shape = shape_list[idx]
+            point_num = point_num_list[idx]
+            input_dtype = dtype_list[idx]
+            point_dist = self.create_input_data(shape)
+
+            exoutput = self.supported_op_exec(point_dist, point_num)
+
+            output = self.custom_op_exec(point_dist, point_num, input_dtype)
+
+            self.assertRtolEqual(exoutput, output)
+
+if __name__ == "__main__":
+    run_tests()
\ No newline at end of file