diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index c4dcc62e68706731525794c75d57c26786658613..8e59829b5732c061d84fec33f3401d83ca9257a3 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -14,3 +14,10 @@ from .ops.silu import npu_silu
 from .ops.silu import npu_silu_
 from .ops.rotary_mul import npu_rotary_mul
 from .ops.npu_abs import npu_abs
+from .ops.fast_gelu import fast_gelu
+from .ops.npu_anchor_response_flags import npu_anchor_response_flags
+from .ops.npu_bounding_box_decode import npu_bounding_box_decode
+from .ops.npu_bounding_box_encode import npu_bounding_box_encode
+from .ops.npu_batch_nms import npu_batch_nms
+from .ops.npu_confusion_transpose import npu_confusion_transpose
+from .ops.npu_broadcast import npu_broadcast
diff --git a/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp b/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f414633cc6ba78e6a9fdb8cdb80dceb64f23c8e4
--- /dev/null
+++ b/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "common.h"
+
+namespace {
+// Output size of AnchorResponseFlags: a single dimension holding
+// featmap_size[0] * featmap_size[1] * num_base_anchors elements.
+c10::SmallVector<int64_t, SIZE> infersize_npu_anchor_response_flags(
+    at::IntArrayRef featmap_size, int64_t num_base_anchors)
+{
+    const int64_t flag_count = featmap_size[0] * featmap_size[1] * num_base_anchors;
+    return c10::SmallVector<int64_t, SIZE>{flag_count};
+}
+
+// Argument validation for npu_anchor_response_flags; every failure raises via TORCH_CHECK.
+// NOTE(review): `stride` is accepted but never inspected here.
+inline void anchor_response_flags_check(
+    const at::Tensor& self, at::IntArrayRef featmap_size, at::IntArrayRef stride)
+{
+    const auto featmap_dims = featmap_size.size();
+    TORCH_CHECK(featmap_dims == 2, "expected feat_map_size equals to 2, but got size ", featmap_dims);
+
+    // && short-circuits, so self.size(1) is only read once the tensor is known to be 2-D.
+    const bool is_box_list = self.dim() == 2 && self.size(1) == 4;
+    TORCH_CHECK(is_box_list,
+        "Non-empty 2D gt_bboxes tensor expected but got a tensor with sizes ", self.sizes());
+
+    // Only half and single precision inputs are accepted.
+    const auto dtype = self.scalar_type();
+    TORCH_CHECK(dtype == at::kHalf || dtype == at::kFloat,
+        "float16 or float32 tensor expected but got a tensor with dtype: ", dtype);
+}
+} // namespace
+
+// Validates the arguments, then dispatches the AnchorResponseFlags operator and returns its
+// uint8 output of featmap_size[0] * featmap_size[1] * num_base_anchors elements.
+at::Tensor npu_anchor_response_flags(
+    const at::Tensor& self,
+    at::IntArrayRef featmap_size,
+    at::IntArrayRef stride,
+    int64_t num_base_anchors)
+{
+    anchor_response_flags_check(self, featmap_size, stride);
+    // The output reuses the input's tensor options with the dtype switched to byte.
+    const auto flag_shape = infersize_npu_anchor_response_flags(featmap_size, num_base_anchors);
+    at::Tensor result = at::empty(flag_shape, self.options().dtype(at::kByte));
+
+    // The operator is always handed a float32 tensor, whatever the dtype of `self`.
+    const at::Tensor boxes_fp32 = self.to(at::kFloat);
+    at_npu::native::OpCommand cmd;
+    cmd.Name("AnchorResponseFlags").Input(boxes_fp32).Output(result)
+        .Attr("featmap_size", featmap_size)
+        .Attr("strides", stride)
+        .Attr("num_base_anchors", num_base_anchors)
+        .Run();
+    return result;
+}
diff --git a/ads/common/ops/csrc/BatchNms.cpp b/ads/common/ops/csrc/BatchNms.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7051437f9fedc0279108ff783e9637499ee6e74
--- /dev/null
+++ b/ads/common/ops/csrc/BatchNms.cpp
@@ -0,0 +1,49 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "common.h"
+
+// Dispatches BatchMultiClassNonMaxSuppression on `self` and `scores`. With B = self.size(0), returns
+// (boxes [B, max_total_size, 4], scores [B, max_total_size], classes [B, max_total_size], num [B] int32).
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_batch_nms(
+    const at::Tensor& self,
+    const at::Tensor& scores,
+    double score_threshold,
+    double iou_threshold,
+    int64_t max_size_per_class,
+    int64_t max_total_size,
+    bool change_coordinate_frame,
+    bool transpose_box)
+{
+    const int64_t batch = self.size(0);
+    const auto value_options = self.options();
+    at::Tensor nmsed_boxes = at::empty({batch, max_total_size, 4}, value_options);
+    at::Tensor nmsed_scores = at::empty({batch, max_total_size}, value_options);
+    at::Tensor nmsed_classes = at::empty({batch, max_total_size}, value_options);
+    at::Tensor nmsed_num = at::empty({batch}, value_options.dtype(at::kInt));
+    // Both thresholds are narrowed to single precision before being set as attributes.
+    const float score_thr = static_cast<float>(score_threshold);
+    const float iou_thr = static_cast<float>(iou_threshold);
+    at_npu::native::OpCommand cmd;
+    cmd.Name("BatchMultiClassNonMaxSuppression")
+        .Input(self).Input(scores)
+        .Output(nmsed_boxes).Output(nmsed_scores).Output(nmsed_classes).Output(nmsed_num)
+        .Attr("score_threshold", score_thr).Attr("iou_threshold", iou_thr)
+        .Attr("max_size_per_class", max_size_per_class).Attr("max_total_size", max_total_size)
+        .Attr("change_coordinate_frame", change_coordinate_frame).Attr("transpose_box", transpose_box)
+        .Run();
+    return std::make_tuple(nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num);
+}
diff --git a/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp b/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..85fc07643d9059a23d6a549f08451f5cd5cbdaac
--- /dev/null
+++ b/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "common.h"
+
+// Dispatches the BoundingBoxDecode operator on `rois` and `deltas`.
+// The result is a new [rois.size(0), 4] tensor created with rois' options; the eight
+// mean/std scalars and wh_ratio_clip are narrowed to float before being set as attributes.
+at::Tensor npu_bounding_box_decode(
+    const at::Tensor& rois,
+    const at::Tensor& deltas,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3,
+    at::IntArrayRef max_shape,
+    double wh_ratio_clip)
+{
+    // One narrowing helper for every scalar attribute.
+    const auto narrow = [](double value) { return static_cast<float>(value); };
+    c10::SmallVector<float, SIZE> means = {
+        narrow(means0), narrow(means1), narrow(means2), narrow(means3)};
+    c10::SmallVector<float, SIZE> stds = {
+        narrow(stds0), narrow(stds1), narrow(stds2), narrow(stds3)};
+
+    // One row of four values per row of `rois`.
+    at::Tensor result = at::empty({rois.size(0), 4}, rois.options());
+    at_npu::native::OpCommand cmd;
+    cmd.Name("BoundingBoxDecode")
+        .Input(rois)
+        .Input(deltas)
+        .Output(result)
+        .Attr("means", means)
+        .Attr("stds", stds)
+        .Attr("max_shape", max_shape)
+        .Attr("wh_ratio_clip", narrow(wh_ratio_clip))
+        .Run();
+    return result;
+}
diff --git a/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp b/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa5bad77d2a6b5f8cc3ffc63b917a952b1f97eec
--- /dev/null
+++ b/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp
@@ -0,0 +1,51 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "common.h"
+
+// Dispatches the BoundingBoxEncode operator on `anchor_box` and `ground_truth_box`.
+// The result is a new [anchor_box.size(0), 4] tensor created with anchor_box's options;
+// the eight mean/std scalars are narrowed to float before being set as attributes.
+at::Tensor npu_bounding_box_encode(
+    const at::Tensor& anchor_box,
+    const at::Tensor& ground_truth_box,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3)
+{
+    // One narrowing helper for every scalar attribute.
+    const auto narrow = [](double value) { return static_cast<float>(value); };
+    c10::SmallVector<float, SIZE> means = {
+        narrow(means0), narrow(means1), narrow(means2), narrow(means3)};
+    c10::SmallVector<float, SIZE> stds = {
+        narrow(stds0), narrow(stds1), narrow(stds2), narrow(stds3)};
+
+    at::Tensor result = at::empty({anchor_box.size(0), 4}, anchor_box.options());
+    at_npu::native::OpCommand cmd;
+    cmd.Name("BoundingBoxEncode")
+        .Input(anchor_box)
+        .Input(ground_truth_box)
+        .Output(result)
+        .Attr("means", means)
+        .Attr("stds", stds)
+        .Run();
+    return result;
+}
diff --git a/ads/common/ops/csrc/BroadCastKernelNpu.cpp b/ads/common/ops/csrc/BroadCastKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b48580342553a5c1b2711dff770b24ee38fe13e1
--- /dev/null
+++ b/ads/common/ops/csrc/BroadCastKernelNpu.cpp
@@ -0,0 +1,49 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+
+namespace {
+// Issues the BroadcastTo operator with `self` and the target `size` as inputs and
+// `result` as the output. Nothing is validated or allocated here.
+// Returns the same `result` reference that was passed in.
+at::Tensor& npu_broadcast_out_nocheck(at::Tensor& result, const at::Tensor& self, at::IntArrayRef size)
+{
+    at_npu::native::OpCommand broadcast_cmd;
+    broadcast_cmd.Name("BroadcastTo").Input(self).Input(size)
+        .Output(result).Run();
+    return result;
+}
+} // namespace
+
+// Out-variant: broadcasts `self` to `size` into the caller-provided `result` and returns it.
+// NOTE(review): unlike npu_broadcast, no bool-to-int conversion is applied on this path.
+at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result)
+{
+    return npu_broadcast_out_nocheck(result, self, size);
+}
+
+// Broadcasts `self` to `size` into a new tensor. A bool input is run through the operator
+// as int32 and the result is converted back to bool; other dtypes are passed through unchanged.
+at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size)
+{
+    const bool is_bool = self.dtype() == at::kBool;
+    at::Tensor op_input = is_bool ? self.to(at::kInt) : self;
+    at::Tensor result = at::empty(size, op_input.options());
+    npu_broadcast_out_nocheck(result, op_input, size);
+
+    return is_bool ? result.to(at::kBool) : result;
+}
diff --git a/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp b/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a12d1d7c87d2be696e4e89adbea4dca5362836d9
--- /dev/null
+++ b/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/OpCommand.h"
+#include "common.h"
+
+// Runs the ConfusionTransposeD operator on `self` and returns a new tensor with self's options.
+// Result size: `shape` when transpose_first is true, otherwise shape[perm[0]], shape[perm[1]], ...
+// Raises (TORCH_CHECK) when transpose_first is false and a perm entry is not a valid index into shape.
+at::Tensor npu_confusion_transpose(
+    const at::Tensor& self, at::IntArrayRef perm, at::IntArrayRef shape, bool transpose_first)
+{
+    c10::SmallVector<int64_t, SIZE> output_size;
+    if (transpose_first) {
+        output_size = array_to_small_vector(shape);
+    } else {
+        // Signed comparison: a negative entry is rejected explicitly, not via unsigned wrap-around.
+        const int64_t shape_size = static_cast<int64_t>(shape.size());
+        for (size_t i = 0; i < perm.size(); i++) {
+            TORCH_CHECK(perm[i] >= 0 && perm[i] < shape_size, "npu_confusion_transpose input invalid, "
+                                            "shape has size ",
+                        shape_size, " but perm[i] is ", perm[i]);
+            output_size.emplace_back(shape[perm[i]]);
+        }
+    }
+
+    at::Tensor result = at::empty(output_size, self.options());
+    at_npu::native::OpCommand cmd;
+    cmd.Name("ConfusionTransposeD")
+        .Input(self)
+        .Output(result)
+        .Attr("perm", perm)
+        .Attr("shape", shape)
+        .Attr("transpose_first", transpose_first)
+        .Run();
+    return result;
+}
+
+void check_confusion_transpose_perm(at::IntArrayRef perm, at::IntArrayRef shape)
+{   // Validates perm against shape: same length, and no dim repeated after at::maybe_wrap_dim.
+    const auto rank = shape.size();
+    TORCH_CHECK(perm.size() == rank, "The length of perm should be the same as shape.");
+    std::vector<bool> visited(rank);
+    for (size_t pos = 0; pos < rank; ++pos) {
+        const auto wrapped = at::maybe_wrap_dim(perm[pos], rank);
+        TORCH_CHECK(!visited[wrapped], "Repeated dim in perm");
+        visited[wrapped] = true;
+    }
+}
+
+// Backward of npu_confusion_transpose: runs ConfusionTransposeD on `grad` with the inverse of `perm`.
+// `shape` is the size of the returned gradient; `transpose_first` is forwarded to the operator as-is.
+// Raises (TORCH_CHECK) when `perm` is not a permutation of [0, perm.size()).
+at::Tensor npu_confusion_transpose_backward(
+    const at::Tensor& grad, at::IntArrayRef perm, at::IntArrayRef shape, bool transpose_first)
+{
+    // Build the inverse permutation in a heap-backed vector (the previous variable-length stack
+    // array is not standard C++). -1 marks an unassigned slot, so the check below rejects
+    // out-of-range and repeated entries before they are ever used as an index.
+    const int64_t perm_len = static_cast<int64_t>(perm.size());
+    std::vector<int64_t> inverse_perm(perm_len, -1);
+    for (int64_t i = 0; i < perm_len; i++) {
+        const int64_t dim = perm[i];
+        TORCH_CHECK(dim >= 0 && dim < perm_len && inverse_perm[dim] < 0,
+            "npu_confusion_transpose_backward input invalid, perm must be a permutation of [0, ",
+            perm_len, "), but perm[", i, "] is ", dim);
+        inverse_perm[dim] = i;
+    }
+
+    c10::SmallVector<int64_t, SIZE> svec_shape;
+    if (transpose_first) {
+        svec_shape = array_to_small_vector(shape);
+    } else {
+        check_confusion_transpose_perm(perm, shape); // also enforces perm.size() == shape.size()
+        for (int64_t i = 0; i < perm_len; i++) {
+            svec_shape.emplace_back(shape[perm[i]]);
+        }
+    }
+    at::Tensor result = at::empty(shape, grad.options());
+    at_npu::native::OpCommand cmd;
+    cmd.Name("ConfusionTransposeD").Input(grad).Output(result)
+        .Attr("perm", at::IntArrayRef(inverse_perm)).Attr("shape", svec_shape)
+        .Attr("transpose_first", transpose_first).Run();
+    return result;
+}
diff --git a/ads/common/ops/csrc/FastGeluKernelNpu.cpp b/ads/common/ops/csrc/FastGeluKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a56dcb7a0674cf7c33f84bd261675f04d340db60
--- /dev/null
+++ b/ads/common/ops/csrc/FastGeluKernelNpu.cpp
@@ -0,0 +1,52 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+namespace {
+// Issues the FastGeluGrad operator with inputs (grad, self) and `grad_input` as the output.
+// Nothing is validated or allocated here. Returns the same `grad_input` reference.
+at::Tensor& fast_gelu_backward_npu_nocheck(
+    at::Tensor& grad_input, const at::Tensor& grad, const at::Tensor& self)
+{
+    at_npu::native::OpCommand grad_cmd;
+    grad_cmd.Name("FastGeluGrad")
+        .Input(grad)
+        .Input(self)
+        .Output(grad_input)
+        .Run();
+    return grad_input;
+}
+} // namespace
+
+// Runs the FastGelu operator on `self` and returns the output in a freshly allocated
+// tensor that has self's sizes and options.
+at::Tensor npu_fast_gelu(const at::Tensor& self)
+{
+    const auto out_sizes = self.sizes();
+    at::Tensor result = at::empty(out_sizes, self.options());
+    at_npu::native::OpCommand gelu_cmd;
+    gelu_cmd.Name("FastGelu").Input(self).Output(result).Run();
+    return result;
+}
+
+// Backward counterpart of npu_fast_gelu: allocates a tensor with self's sizes and options
+// and has FastGeluGrad write into it from (grad, self).
+// The freshly allocated tensor is returned by value.
+at::Tensor npu_fast_gelu_backward(const at::Tensor& grad, const at::Tensor& self)
+{
+    at::Tensor grad_input = at::empty(self.sizes(), self.options());
+    return fast_gelu_backward_npu_nocheck(grad_input, grad, self);
+}
diff --git a/ads/common/ops/csrc/NpuSilu.cpp b/ads/common/ops/csrc/NpuSilu.cpp
index 0b81c0f426f6b0d0a7954666ad141ae9eac4e502..4f62cad5862989d2a3259b2a6852b749767383a8 100644
--- a/ads/common/ops/csrc/NpuSilu.cpp
+++ b/ads/common/ops/csrc/NpuSilu.cpp
@@ -1,18 +1,7 @@
-#include <torch/csrc/autograd/custom_function.h>
-
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using npu_preparation = at_npu::native::OpPreparation;
-using npu_utils = at_npu::native::NpuUtils;
-using tensor_list = std::vector<at::Tensor>;
-
 at::Tensor &silu_out_npu_nocheck(at::Tensor &result, const at::Tensor &self)
 {
     at_npu::native::OpCommand cmd;
@@ -80,4 +69,4 @@ at::Tensor &npu_silu_(at::Tensor &self)
 {
     silu_out_npu(self, self);
     return self;
-}
\ No newline at end of file
+}
diff --git a/ads/common/ops/csrc/RotaryMulKernelNpu.cpp b/ads/common/ops/csrc/RotaryMulKernelNpu.cpp
index 3e814e4e96956c28c1f4ef29c949db742da1e919..055693091b9e88d8c1b66cfe078cee143b8e43fc 100644
--- a/ads/common/ops/csrc/RotaryMulKernelNpu.cpp
+++ b/ads/common/ops/csrc/RotaryMulKernelNpu.cpp
@@ -14,18 +14,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <torch/csrc/autograd/custom_function.h>
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include "torch_npu/csrc/aten/CustomFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-using torch::autograd::Function;
-using torch::autograd::AutogradContext;
 using tensor_tuple = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 
 namespace {
@@ -42,7 +34,7 @@ at::Tensor &rotary_mul_nocheck(at::Tensor &y, const at::Tensor &x, const at::Ten
     return y;
 }
 
-tensor_tuple rotary_mul_backward_nocheck(at::Tensor &dx, at::Tensor &dr1, at::Tensor &dr2, const at::Tensor &x, 
+tensor_tuple rotary_mul_backward_nocheck(at::Tensor &dx, at::Tensor &dr1, at::Tensor &dr2, const at::Tensor &x,
     const at::Tensor &r1, const at::Tensor &r2, const at::Tensor &dy)
 {
     TORCH_CHECK(x.dim() == 4, "The dim of input tensor [x] shoule equal to four.");
diff --git a/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp b/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp
index db949fc984bb204bd3fd7ff8d3700f429b7b5bdd..0e8aa592f4f7285c6b8f78cc89125e5722615b20 100644
--- a/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp
+++ b/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp
@@ -15,13 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-using npu_utils = at_npu::native::NpuUtils;
 
 at::Tensor npu_rotated_box_decode(const at::Tensor &self, const at::Tensor &deltas, const at::Tensor &weight)
 {
diff --git a/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp b/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp
index cfe515bac88e0d668483c1a2e88e1172c34d3db3..865b994b622f123e71875831977ef48ffeb2df54 100644
--- a/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp
+++ b/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp
@@ -15,13 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-
 at::Tensor npu_rotated_box_encode(
     const at::Tensor &self,
     const at::Tensor &gtBox,
diff --git a/ads/common/ops/csrc/RotatedIouKernelNpu.cpp b/ads/common/ops/csrc/RotatedIouKernelNpu.cpp
index 7c8c334a8ec52fd01d7ff5c5a71b85bdd1f383e6..dc94943dcda69ae9cdcb27d330bc1c8ae53212e9 100644
--- a/ads/common/ops/csrc/RotatedIouKernelNpu.cpp
+++ b/ads/common/ops/csrc/RotatedIouKernelNpu.cpp
@@ -15,14 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include "torch_npu/csrc/aten/CustomFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-
 namespace {
 at::Tensor &rotated_iou_npu_nocheck(
     at::Tensor &iou,
diff --git a/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp b/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp
index ac476c6299673125071391f1549796457f0db397..0a957ca9b8986df6e4c4b3c9cbd8330896e3bd50 100644
--- a/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp
+++ b/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp
@@ -15,14 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include "torch_npu/csrc/aten/CustomFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-
 namespace {
 at::Tensor &rotated_overlaps_npu_nocheck(
     at::Tensor &overlaps,
diff --git a/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp b/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp
index c4e943842d775364a5ba471deae2589945cf1f48..f3b11664cc4fb99fd03dbaa7b7dd4f3b8be6e8c7 100644
--- a/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp
+++ b/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp
@@ -1,16 +1,8 @@
-#include <torch/extension.h>
-
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
+#include "common.h"
 
-using namespace at;
 using namespace std;
 
-using torch::autograd::Function;
-using torch::autograd::AutogradContext;
-using tensor_list = std::vector<at::Tensor>;
-
 std::tuple<at::Tensor, at::Tensor> npu_scatter_max(
     const at::Tensor& updates,
     const at::Tensor& indices,
@@ -21,7 +13,7 @@ std::tuple<at::Tensor, at::Tensor> npu_scatter_max(
     sizes[0] = indices.max().item().toLong() + 1;
 
     at::Tensor result = out.value_or(at::zeros(sizes, updates.options().dtype(at::kFloat)));
-    at::Tensor argmax = at_npu::native::OpPreparation::ApplyTensor(result, result.options().dtype(at::kInt));
+    at::Tensor argmax = at::empty(result.sizes(), result.options().dtype(at::kInt));
 
     at_npu::native::OpCommand cmd;
     cmd.Name("ScatterMaxWithArgmax")
@@ -37,7 +29,7 @@ std::tuple<at::Tensor, at::Tensor> npu_scatter_max(
 
 at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments)
 {
-    c10::SmallVector<int64_t, at_npu::native::N> output_size;
+    c10::SmallVector<int64_t, N> output_size;
 
     auto num_segments_value = num_segments.item().toLong();
     output_size.push_back(num_segments_value);
@@ -47,7 +39,7 @@ at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segme
 
     copy(x_sizes.begin() + segment_ids_dims, x_sizes.end(), std::back_inserter(output_size));
 
-    at::Tensor out = at_npu::native::OpPreparation::ApplyTensor(x, output_size);
+    at::Tensor out = at::empty(output_size, x.options());
     at_npu::native::OpCommand cmd;
     cmd.Name("UnsortedSegmentSum")
         .Input(x)
diff --git a/ads/common/ops/csrc/ScatterV1KernelNpu.cpp b/ads/common/ops/csrc/ScatterV1KernelNpu.cpp
index 155ea3839ebca5494976551861a731ac0465e7cc..f96d460819aecdbf0acee953604fe5766d300c88 100644
--- a/ads/common/ops/csrc/ScatterV1KernelNpu.cpp
+++ b/ads/common/ops/csrc/ScatterV1KernelNpu.cpp
@@ -15,12 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
 
 at::Tensor npu_scatter(const at::Tensor &self, const at::Tensor &indices, const at::Tensor &updates, int64_t dim)
 {
diff --git a/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp b/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp
index 5fb1139c5ff1d398c63bc6470e4dbf543f062f19..95f4c3ffaad06f6b3db554cd73ff4777d1216300 100644
--- a/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp
+++ b/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp
@@ -15,12 +15,8 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
 
 at::Tensor npu_sign_bits_pack(const at::Tensor &self, int64_t size)
 {
diff --git a/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp b/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp
index e7a35680b65194544d646cb6a584c35d62b510f7..27ae440bed26bf974991218bc8294ac126d2fea1 100644
--- a/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp
+++ b/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp
@@ -15,13 +15,9 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
 
 at::Tensor npu_sign_bits_unpack_compute(
     const at::Tensor &input,
diff --git a/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp b/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp
index e936c81945a5fc3268ec6a47dd715b493017d2e2..cc8f95dff0b0072324b875630059a7fbfa6fbf5c 100644
--- a/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp
+++ b/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp
@@ -14,19 +14,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <torch/csrc/autograd/custom_function.h>
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
-using torch::autograd::AutogradContext;
-using torch::autograd::Function;
-using tensor_list = std::vector<at::Tensor>;
-
 namespace {
 std::tuple<at::Tensor &, at::Tensor &> softmax_cross_entropy_with_logits_out_nocheck(
     at::Tensor &result,
diff --git a/ads/common/ops/csrc/StrideAddKernelNpu.cpp b/ads/common/ops/csrc/StrideAddKernelNpu.cpp
index ebcfbfda634f8341c0f17989c136ee7b78a1f326..47922f62715d095724e7721528a7f085dcd498fd 100644
--- a/ads/common/ops/csrc/StrideAddKernelNpu.cpp
+++ b/ads/common/ops/csrc/StrideAddKernelNpu.cpp
@@ -14,15 +14,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <torch/csrc/autograd/custom_function.h>
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
 
 namespace {
 at::Tensor &stride_add_out_npu_nocheck(
diff --git a/ads/common/ops/csrc/TransposeKernelNpu.cpp b/ads/common/ops/csrc/TransposeKernelNpu.cpp
index ad9d2e972b8d5b789b06a08ec774ef8e10b29c5e..2e8705c2a79c10e15c1cfed71cfdaf2c1ce4d754 100644
--- a/ads/common/ops/csrc/TransposeKernelNpu.cpp
+++ b/ads/common/ops/csrc/TransposeKernelNpu.cpp
@@ -15,14 +15,9 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_utils = at_npu::native::NpuUtils;
-
 namespace {
 at::Tensor &npu_transpose_out_nocheck(
     at::Tensor &result,
diff --git a/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp b/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp
index f3cd4201329c5624d7f4098f3f505c29375c67e2..df02a325f4cb3fda9ab8bb2b8ad807629be6d7f5 100644
--- a/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp
+++ b/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp
@@ -15,14 +15,9 @@
 // limitations under the License.
 
 #include "torch_npu/csrc/framework/OpCommand.h"
-#include "torch_npu/csrc/framework/utils/OpPreparation.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
-#include "torch_npu/csrc/aten/CustomFunctions.h"
 #include "functions.h"
 #include "common.h"
 
-using npu_preparation = at_npu::native::OpPreparation;
 
 namespace {
 inline void yolo_boxes_encode_check(
diff --git a/ads/common/ops/csrc/common.cpp b/ads/common/ops/csrc/common.cpp
index 8e4b30373a39880632f0526d762f34e493a50896..f6f9cc495143311ed326cf79f3391bd827f5c1fa 100644
--- a/ads/common/ops/csrc/common.cpp
+++ b/ads/common/ops/csrc/common.cpp
@@ -1,12 +1,9 @@
 #include <unordered_map>
 #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h"
-#include "torch_npu/csrc/framework/utils/NpuUtils.h"
 #include "torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h"
-#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
 #include "third_party/acl/inc/acl/acl_base.h"
 #include "common.h"
 
-using npu_utils = at_npu::native::NpuUtils;
 using CalcuOpUtil = at_npu::native::CalcuOpUtil;
 
 #define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_)  \
@@ -192,4 +189,4 @@ bool check_match(const at::Tensor &self)
 void format_fresh_view(at::Tensor &x, const at::Tensor &y)
 {
     x.copy_(y);
-}
\ No newline at end of file
+}
diff --git a/ads/common/ops/csrc/common.h b/ads/common/ops/csrc/common.h
index 49a7565312288dafc64ac4103909da514b2ddb69..95c2b5a194029b37893ce7b45651ea3cb9d315d2 100644
--- a/ads/common/ops/csrc/common.h
+++ b/ads/common/ops/csrc/common.h
@@ -1,3 +1,5 @@
+#ifndef __COMMON_H__
+#define __COMMON_H__
 #include <ATen/ATen.h>
 #include <string>
 #include <tuple>
@@ -26,4 +28,6 @@ c10::SmallVector<int64_t, N> convert_array_to_vector(c10::IntArrayRef intArray);
 c10::SmallVector<int64_t, SIZE> infersize_stride_add(c10::IntArrayRef shape1_, c10::IntArrayRef shape2_);
 c10::SmallVector<int64_t, SIZE> transpose_npu_output_size(const at::Tensor &self, c10::IntArrayRef perm);
 bool check_match(const at::Tensor &self);
-void format_fresh_view(at::Tensor &x, const at::Tensor &y); 
\ No newline at end of file
+void format_fresh_view(at::Tensor &x, const at::Tensor &y);
+
+#endif // __COMMON_H__
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index 243774ab2a4fd992c4bb3676d7f9c30f4d1cc7b1..0afa2edce5b9de5e41969fbd8b7d91855377f509 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -1,3 +1,19 @@
+// Copyright (c) 2023, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef __FUNCTIONS_H__
+#define __FUNCTIONS_H__
+
 #include <ATen/Tensor.h>
 #include <ATen/ATen.h>
 #include <ATen/core/Scalar.h>
@@ -43,3 +59,62 @@ at::Tensor npu_rotary_mul(const at::Tensor &self, const at::Tensor &r1, const at
 at::Tensor npu_silu(const at::Tensor& self);
 at::Tensor& npu_silu_(at::Tensor& self);
 at::Tensor npu_abs(const at::Tensor& self);
+at::Tensor npu_fast_gelu_backward(const at::Tensor& grad, const at::Tensor& self);
+at::Tensor npu_fast_gelu(const at::Tensor& self);
+at::Tensor npu_anchor_response_flags(const at::Tensor& self, at::IntArrayRef featmap_size, at::IntArrayRef stride, int64_t num_base_anchors);
+at::Tensor npu_bounding_box_decode(
+    const at::Tensor& rois,
+    const at::Tensor& deltas,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3,
+    at::IntArrayRef max_shape,
+    double wh_ratio_clip);
+at::Tensor npu_bounding_box_encode(
+    const at::Tensor& anchor_box,
+    const at::Tensor& ground_truth_box,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3);
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> npu_batch_nms(
+    const at::Tensor& self,
+    const at::Tensor& scores,
+    double score_threshold,
+    double iou_threshold,
+    int64_t max_size_per_class,
+    int64_t max_total_size,
+    bool change_coordinate_frame,
+    bool transpose_box);
+at::Tensor npu_confusion_transpose(
+    const at::Tensor& self,
+    at::IntArrayRef perm,
+    at::IntArrayRef shape,
+    bool transpose_first);
+at::Tensor npu_confusion_transpose_backward(
+    const at::Tensor& grad,
+    at::IntArrayRef perm,
+    at::IntArrayRef shape,
+    bool transpose_first);
+at::Tensor npu_conv_transpose2d(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const c10::optional<at::Tensor>& bias_opt,
+    at::IntArrayRef padding,
+    at::IntArrayRef output_padding,
+    at::IntArrayRef stride,
+    at::IntArrayRef dilation,
+    int64_t groups);
+at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size);
+at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result);
+
+#endif // __FUNCTIONS_H__
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index 4eb1cf6fcc8093268c3074b93e675921f0027e17..b8ebe3f5250e9add59301ffab14216dcf2b18539 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -43,4 +43,28 @@ void init_common(pybind11::module &m)
     m.def("npu_rotary_mul", &npu_rotary_mul);
 
     m.def("npu_abs", &npu_abs);
+
+    // npu_fast_gelu
+    m.def("npu_fast_gelu", &npu_fast_gelu);
+    m.def("npu_fast_gelu_backward", &npu_fast_gelu_backward);
+
+    // npu_anchor_response_flags
+    m.def("npu_anchor_response_flags", &npu_anchor_response_flags);
+
+    // npu_bounding_box_decode
+    m.def("npu_bounding_box_decode", &npu_bounding_box_decode);
+
+    // npu_bounding_box_encode
+    m.def("npu_bounding_box_encode", &npu_bounding_box_encode);
+
+    // npu_batch_nms
+    m.def("npu_batch_nms", &npu_batch_nms);
+
+    // npu_confusion_transpose
+    m.def("npu_confusion_transpose", &npu_confusion_transpose);
+    m.def("npu_confusion_transpose_backward", &npu_confusion_transpose_backward);
+
+    // npu_broadcast (the Python wrapper calls the _out variant when `out` is given)
+    m.def("npu_broadcast", &npu_broadcast);
+    m.def("npu_broadcast_out", &npu_broadcast_out);
 }
diff --git a/ads/common/ops/fast_gelu.py b/ads/common/ops/fast_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..45557513e9ca889004a63c0822065630a8723334
--- /dev/null
+++ b/ads/common/ops/fast_gelu.py
@@ -0,0 +1,24 @@
+import torch
+from torch.autograd import Function
+
+import torch_npu
+import ads_c
+
+
+class FastGeluFunction(Function):
+    """Autograd wrapper around the ``ads_c`` fast-GELU forward/backward kernels."""
+
+    @staticmethod
+    def forward(ctx, self):
+        """Run ``ads_c.npu_fast_gelu`` on ``self`` and keep ``self`` for backward."""
+        ctx.save_for_backward(self)
+        return ads_c.npu_fast_gelu(self)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradient for the forward input from the backward kernel."""
+        (saved_input,) = ctx.saved_tensors
+        return ads_c.npu_fast_gelu_backward(grad_output, saved_input)
+
+
+fast_gelu = FastGeluFunction.apply
diff --git a/ads/common/ops/npu_anchor_response_flags.py b/ads/common/ops/npu_anchor_response_flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..b75fd77c218def485efdc8cbace1fd1767c917bc
--- /dev/null
+++ b/ads/common/ops/npu_anchor_response_flags.py
@@ -0,0 +1,17 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuAnchorResponseFlagsFunction(Function):
+    """Forward-only wrapper for ``ads_c.npu_anchor_response_flags``."""
+
+    @staticmethod
+    def forward(ctx, self, featmap_size, stride, num_base_anchors):
+        """Pass all arguments to the kernel unchanged and return its result."""
+        return ads_c.npu_anchor_response_flags(
+            self, featmap_size, stride, num_base_anchors)
+
+
+npu_anchor_response_flags = NpuAnchorResponseFlagsFunction.apply
diff --git a/ads/common/ops/npu_batch_nms.py b/ads/common/ops/npu_batch_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5b2ef97a154db5d1969757cc28d08d49c07d28
--- /dev/null
+++ b/ads/common/ops/npu_batch_nms.py
@@ -0,0 +1,35 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBatchNmsFunction(Function):
+    """Forward-only wrapper for ``ads_c.npu_batch_nms``."""
+
+    @staticmethod
+    def forward(
+            ctx,
+            self,
+            scores,
+            score_threshold,
+            iou_threshold,
+            max_size_per_class,
+            max_total_size,
+            change_coordinate_frame=False,
+            transpose_box=False):
+        """Forward every argument, in order, to the kernel and return its output."""
+        kernel_args = (
+            self,
+            scores,
+            score_threshold,
+            iou_threshold,
+            max_size_per_class,
+            max_total_size,
+            change_coordinate_frame,
+            transpose_box,
+        )
+        return ads_c.npu_batch_nms(*kernel_args)
+
+
+npu_batch_nms = NpuBatchNmsFunction.apply
diff --git a/ads/common/ops/npu_bounding_box_decode.py b/ads/common/ops/npu_bounding_box_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f16d8e4629f5ead83b4b259b593f7fc1015711c
--- /dev/null
+++ b/ads/common/ops/npu_bounding_box_decode.py
@@ -0,0 +1,23 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBoundingBoxDecodeFunction(Function):
+    """Forward-only wrapper for ``ads_c.npu_bounding_box_decode``."""
+
+    @staticmethod
+    def forward(ctx, rois, deltas,
+                means0, means1, means2, means3,
+                stds0, stds1, stds2, stds3,
+                max_shape, wh_ratio_clip):
+        """Forward every argument, in order, to the kernel and return its output."""
+        return ads_c.npu_bounding_box_decode(
+            rois, deltas,
+            means0, means1, means2, means3,
+            stds0, stds1, stds2, stds3,
+            max_shape, wh_ratio_clip)
+
+
+npu_bounding_box_decode = NpuBoundingBoxDecodeFunction.apply
diff --git a/ads/common/ops/npu_bounding_box_encode.py b/ads/common/ops/npu_bounding_box_encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..756e1fe0b933f955ecc308ac2881faaaeb6f8bb1
--- /dev/null
+++ b/ads/common/ops/npu_bounding_box_encode.py
@@ -0,0 +1,21 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBoundingBoxEncodeFunction(Function):
+    """Forward-only wrapper for ``ads_c.npu_bounding_box_encode``."""
+
+    @staticmethod
+    def forward(ctx, anchor_box, ground_truth_box,
+                means0, means1, means2, means3,
+                stds0, stds1, stds2, stds3):
+        """Forward every argument, in order, to the kernel and return its output."""
+        return ads_c.npu_bounding_box_encode(
+            anchor_box, ground_truth_box,
+            means0, means1, means2, means3,
+            stds0, stds1, stds2, stds3)
+
+
+npu_bounding_box_encode = NpuBoundingBoxEncodeFunction.apply
diff --git a/ads/common/ops/npu_broadcast.py b/ads/common/ops/npu_broadcast.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3371b283a10ca9b87fd6bd2bbe7cc85fdb40fe3
--- /dev/null
+++ b/ads/common/ops/npu_broadcast.py
@@ -0,0 +1,22 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class BroadCastFunction(Function):
+    """Forward-only wrapper for the ``ads_c`` broadcast kernels."""
+
+    @staticmethod
+    def forward(ctx, self, size, out=None):
+        """Call ``npu_broadcast(self, size)``, or the ``_out`` variant when ``out`` is given.
+
+        NOTE(review): the ``out`` path needs ``npu_broadcast_out`` to be exported
+        by the ``ads_c`` extension -- confirm the binding exists.
+        """
+        if out is None:
+            return ads_c.npu_broadcast(self, size)
+        return ads_c.npu_broadcast_out(self, size, out)
+
+
+npu_broadcast = BroadCastFunction.apply
diff --git a/ads/common/ops/npu_confusion_transpose.py b/ads/common/ops/npu_confusion_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..566d19f1538e9e7fb229a549aeb2278f9d140d15
--- /dev/null
+++ b/ads/common/ops/npu_confusion_transpose.py
@@ -0,0 +1,35 @@
+import torch
+from torch.autograd import Function
+from torch.nn import Module
+
+import torch_npu
+import ads_c
+
+
+class NpuConfusionTransposeFunction(Function):
+    """Autograd wrapper around the ``ads_c`` confusion-transpose kernels."""
+
+    @staticmethod
+    def forward(ctx, self, perm, shape, transpose_first):
+        """Run the forward kernel and stash what backward needs on ``ctx``."""
+        out = ads_c.npu_confusion_transpose(self, perm, shape, transpose_first)
+        # ``save_for_backward`` only accepts tensors, so the non-tensor
+        # arguments are kept as plain attributes on ``ctx`` instead.
+        ctx.perm = perm
+        ctx.self_sizes = tuple(self.size())
+        ctx.transpose_first = transpose_first
+
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradient for ``self``; the other inputs get ``None``."""
+        # The backward kernel receives the input's original sizes and the
+        # inverted ``transpose_first`` flag.
+        grad_self = ads_c.npu_confusion_transpose_backward(
+            grad_output, ctx.perm, ctx.self_sizes, not ctx.transpose_first)
+
+        return grad_self, None, None, None
+
+
+npu_confusion_transpose = NpuConfusionTransposeFunction.apply
diff --git a/ads/common/ops/rotary_mul.py b/ads/common/ops/rotary_mul.py
index bf9c2a9b05288e9e15125585feb97ecbaf5cadd9..5079961bb1c4014740917a7e803b02230bcff962 100644
--- a/ads/common/ops/rotary_mul.py
+++ b/ads/common/ops/rotary_mul.py
@@ -19,4 +19,4 @@ class RotaryMulFunction(Function):
         result = ads_c.npu_rotary_mul_backward(grad_output, input, r1, r2)
         return result
 
-npu_rotary_mul = RotaryMulFunction.apply
\ No newline at end of file
+npu_rotary_mul = RotaryMulFunction.apply
diff --git a/ads/common/ops/rotated_iou.py b/ads/common/ops/rotated_iou.py
index d88d3e9b5c03470a26d0463e39a451b630429c2a..896f001cfb36b75c8bde3da42e567654f7680b69 100644
--- a/ads/common/ops/rotated_iou.py
+++ b/ads/common/ops/rotated_iou.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_rotated_iou = ads_c.npu_rotated_iou
\ No newline at end of file
+npu_rotated_iou = ads_c.npu_rotated_iou
diff --git a/ads/common/ops/rotated_overlaps.py b/ads/common/ops/rotated_overlaps.py
index 407532351c90419c3f2fec71dda686a015232019..c481fe6fdfed168f130aadd268437bba2d5bd3b9 100644
--- a/ads/common/ops/rotated_overlaps.py
+++ b/ads/common/ops/rotated_overlaps.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_rotated_overlaps = ads_c.npu_rotated_overlaps
\ No newline at end of file
+npu_rotated_overlaps = ads_c.npu_rotated_overlaps
diff --git a/ads/common/ops/scatter.py b/ads/common/ops/scatter.py
index 7d89109cb8505c46fcb21b1fe9d65beb4a992208..d9e6de8ae0d69bc583e3545aeee27d13c9b39e57 100644
--- a/ads/common/ops/scatter.py
+++ b/ads/common/ops/scatter.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_scatter = ads_c.npu_scatter
\ No newline at end of file
+npu_scatter = ads_c.npu_scatter
diff --git a/ads/common/ops/sign_bits_pack.py b/ads/common/ops/sign_bits_pack.py
index c09d486ae571edf6498514e28bdcf9b1db543877..7b1e0040dffb3bcd6ccaa7f9a00ab05231a5cec9 100644
--- a/ads/common/ops/sign_bits_pack.py
+++ b/ads/common/ops/sign_bits_pack.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_sign_bits_pack = ads_c.npu_sign_bits_pack
\ No newline at end of file
+npu_sign_bits_pack = ads_c.npu_sign_bits_pack
diff --git a/ads/common/ops/sign_bits_unpack.py b/ads/common/ops/sign_bits_unpack.py
index efa1a2dd409f168ce850bf79746f2b8986469401..ed374e17f3b22b513ebc88ed2cf141f36940671b 100644
--- a/ads/common/ops/sign_bits_unpack.py
+++ b/ads/common/ops/sign_bits_unpack.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_sign_bits_unpack = ads_c.npu_sign_bits_unpack
\ No newline at end of file
+npu_sign_bits_unpack = ads_c.npu_sign_bits_unpack
diff --git a/ads/common/ops/silu.py b/ads/common/ops/silu.py
index 8ca866dbc07e36c05f0fab507935947c95fe4304..bd4251b0079fe993e8cb3e9554971fac0f5d72a8 100644
--- a/ads/common/ops/silu.py
+++ b/ads/common/ops/silu.py
@@ -13,7 +13,7 @@ class SiluFunction(Function):
         result = func(input)
         ctx.save_for_backward(input, result)
         return result
-    
+
     @staticmethod
     def backward(ctx, grad_outputs):
         x0, x1 = ctx.saved_tensors
@@ -22,4 +22,4 @@ class SiluFunction(Function):
 
 npu_silu = SiluFunction.apply
 
-npu_silu_ = ads_c.npu_silu_
\ No newline at end of file
+npu_silu_ = ads_c.npu_silu_
diff --git a/ads/common/ops/softmax_cross_entropy_with_logits.py b/ads/common/ops/softmax_cross_entropy_with_logits.py
index f09d2a3e74b77d500d5ad579de32e0996893127a..cd12c5dd58582adce1960c0a0a353a1483b0c706 100644
--- a/ads/common/ops/softmax_cross_entropy_with_logits.py
+++ b/ads/common/ops/softmax_cross_entropy_with_logits.py
@@ -20,4 +20,4 @@ class SoftMaxFunction(Function):
         result = ads_c.npu_softmax_cross_entropy_with_logits_backward(grad_output, feature, labels)
         return result
 
-npu_softmax_cross_entropy_with_logits = SoftMaxFunction.apply
\ No newline at end of file
+npu_softmax_cross_entropy_with_logits = SoftMaxFunction.apply
diff --git a/ads/common/ops/stride_add.py b/ads/common/ops/stride_add.py
index 24a3946b2741cd6e47b3ee4c834e248f54fe5fc9..586a83c35ae6c915826f953b81cdad5da543f62d 100644
--- a/ads/common/ops/stride_add.py
+++ b/ads/common/ops/stride_add.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_stride_add = ads_c.npu_stride_add
\ No newline at end of file
+npu_stride_add = ads_c.npu_stride_add
diff --git a/ads/common/ops/transpose.py b/ads/common/ops/transpose.py
index 149722993e3ca9946f1ae43d8f766715418b077d..a27dca7dd62c026c4725954074e1511d5ee75f52 100644
--- a/ads/common/ops/transpose.py
+++ b/ads/common/ops/transpose.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_transpose = ads_c.npu_transpose
\ No newline at end of file
+npu_transpose = ads_c.npu_transpose
diff --git a/ads/common/ops/yolo_boxes_encode.py b/ads/common/ops/yolo_boxes_encode.py
index 585adb586d2d5dd63dbe2734d7995d50a7c6387f..cb915a0f586b86dbde0bccfe48715ef625b0fb5c 100644
--- a/ads/common/ops/yolo_boxes_encode.py
+++ b/ads/common/ops/yolo_boxes_encode.py
@@ -2,4 +2,4 @@ import torch
 import torch_npu
 import ads_c
 
-npu_yolo_boxes_encode = ads_c.npu_yolo_boxes_encode
\ No newline at end of file
+npu_yolo_boxes_encode = ads_c.npu_yolo_boxes_encode
diff --git a/tests/test_batch_nms.py b/tests/test_batch_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c3245c510a1be5d72ccbbbd6391cdde795d30f
--- /dev/null
+++ b/tests/test_batch_nms.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch_npu
+
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestBatchNms(TestCase):
+    """Checks the class output of ``ads.common.npu_batch_nms`` for fp32 and fp16 inputs."""
+
+    def test_batch_nms_shape_format(self):
+        """The returned class tensor must be all zeros with shape (8, 4)."""
+        boxes = torch.randn(8, 4, 1, 4).npu()
+        scores = torch.randn(8, 4, 1).npu()
+        nms_args = (0.3, 0.5, 4, 4)
+        _, _, nmsed_classes, _ = ads.common.npu_batch_nms(boxes, scores, *nms_args)
+        _, _, nmsed_classes_fp16, _ = ads.common.npu_batch_nms(
+            boxes.half(), scores.half(), *nms_args)
+        expected_nmsed_classes = torch.zeros(8, 4, dtype=torch.float32)
+        self.assertRtolEqual(expected_nmsed_classes, nmsed_classes.cpu())
+        self.assertRtolEqual(expected_nmsed_classes.half(), nmsed_classes_fp16.cpu())
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_fast_gelu.py b/tests/test_fast_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81b5e92ae5fc72463ef2e2aeda41879ed9c1055
--- /dev/null
+++ b/tests/test_fast_gelu.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2023 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestFastGelu(TestCase):
+    """Compares ``ads.common.fast_gelu`` with a reference built from torch ops."""
+
+    def supported_op_exec(self, input1):
+        """Evaluate the reference fast-GELU expression and return it on the CPU."""
+        attr = 1.702
+        abs_input1 = torch.abs(input1)
+        exponent = ((attr / 2) * input1) * (input1 - abs_input1)
+        numerator = input1 * torch.exp(exponent)
+        denominator = 1.0 + torch.exp(- attr * abs_input1)
+        return (numerator / denominator).cpu().detach()
+
+    def custom_op_exec(self, input1):
+        """Run the custom op and return its output on the CPU."""
+        return ads.common.fast_gelu(input1).cpu().detach()
+
+    def test_fast_gelu(self, device="npu"):
+        """The custom op must match the reference on a float32 [3, 16, 32] input."""
+        item = [np.float32, 0, [3, 16, 32]]
+        _, npu_input = create_common_tensor(item, 0, 100)
+
+        self.assertRtolEqual(
+            self.supported_op_exec(npu_input),
+            self.custom_op_exec(npu_input))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_fast_gelu_backward.py b/tests/test_fast_gelu_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c7920ef1bc3d7b4aaf9f1004db348e6f66980bf
--- /dev/null
+++ b/tests/test_fast_gelu_backward.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestFastGelu(TestCase):
+    """Checks forward output and input gradient of ``ads.common.fast_gelu``."""
+
+    def npu_op_exec(self, input1):
+        """Run forward plus backward with an all-ones upstream gradient.
+
+        Returns ``(input_gradient, output)`` as numpy arrays.
+        """
+        input1.requires_grad = True
+        output = ads.common.fast_gelu(input1)
+        output.backward(torch.ones_like(output))
+        grad_np = input1.grad.to("cpu").detach().numpy()
+        output_np = output.cpu().detach().numpy()
+        return grad_np, output_np
+
+    def test_fastgelu(self, device="npu"):
+        """Compare against hard-coded expected values for inputs 1..4."""
+        input1 = torch.tensor([1., 2., 3., 4.]).npu()
+        expected_grad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
+        expected_output = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        outputgrad, output = self.npu_op_exec(input1)
+        self.assertRtolEqual(expected_grad.numpy(), outputgrad)
+        self.assertRtolEqual(expected_output.numpy(), output)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_anchor_response_flags.py b/tests/test_npu_anchor_response_flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..b656ee94af4f88995a9aaf7b6a454a9439ed6ec0
--- /dev/null
+++ b/tests/test_npu_anchor_response_flags.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestNpuAnchorResponseFlags(TestCase):
+    """Compares ``ads.common.npu_anchor_response_flags`` with a torch-op reference."""
+
+    def custom_op_exec(self, gt_bboxes, featmap_size, strides, num_base_anchors):
+        """Build the reference flags from plain torch ops and return them as numpy."""
+        if gt_bboxes.dtype == torch.float16:
+            gt_bboxes = gt_bboxes.to(torch.float32)
+        feat_h, feat_w = featmap_size
+        # Box centres, then the feature-map cell each centre falls into.
+        center_x = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5
+        center_y = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5
+        grid_x = torch.floor(center_x / strides[0]).int()
+        grid_y = torch.floor(center_y / strides[1]).int()
+        grid_idx = (grid_y * feat_w + grid_x).long()
+        flags = torch.zeros(feat_h * feat_w, dtype=torch.uint8).npu()
+        flags[grid_idx] = 1
+        # Repeat each cell's flag once per base anchor and flatten.
+        flags = flags[:, None].expand(flags.size(0), num_base_anchors).contiguous().view(-1)
+        return flags.cpu().numpy()
+
+    def npu_op_exec(self, input_npu, featmap_size, strides, num_base_anchors):
+        """Run the custom op and return its output as numpy."""
+        out = ads.common.npu_anchor_response_flags(input_npu, featmap_size, strides, num_base_anchors)
+        return out.cpu().numpy()
+
+    def test_npu_anchor_response_flags(self):
+        """Custom op and reference must agree for each configured case."""
+        shape_format = [
+            [[np.float32, -1, [100, 4]], [60, 60], [2, 2], 9],
+            [[np.float16, -1, [200, 4]], [10, 10], [32, 32], 3],
+            [[np.float16, -1, [500, 4]], [32, 32], [16, 16], 5]
+        ]
+        for tensor_spec, *op_args in shape_format:
+            _, npu_input = create_common_tensor(tensor_spec, 0, 100)
+            custom_output = self.custom_op_exec(npu_input, *op_args)
+            npu_output = self.npu_op_exec(npu_input, *op_args)
+            self.assertRtolEqual(custom_output, npu_output)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_bounding_box_decode.py b/tests/test_npu_bounding_box_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..248fe36c8b756faa91610bbb3e0190c99a1d19f4
--- /dev/null
+++ b/tests/test_npu_bounding_box_decode.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestBoundingBoxDecode(TestCase):
+    """Compares ``ads.common.npu_bounding_box_decode`` with a torch-op reference."""
+
+    def npu_bounding_box_decode(self, rois, deltas, means0, means1, means2, means3,
+                                stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Reference decode built from plain torch ops; returns boxes shaped like ``deltas``."""
+        means = [means0, means1, means2, means3]
+        stds = [stds0, stds1, stds2, stds3]
+        means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
+        stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
+        denorm_deltas = deltas * stds + means
+
+        dx = denorm_deltas[:, 0::4]
+        dy = denorm_deltas[:, 1::4]
+        dw = denorm_deltas[:, 2::4]
+        dh = denorm_deltas[:, 3::4]
+        max_ratio = torch.abs(torch.log(torch.tensor(wh_ratio_clip)))
+
+        dw = torch.clamp(dw, min=-max_ratio, max=max_ratio)
+        dh = torch.clamp(dh, min=-max_ratio, max=max_ratio)
+
+        ax = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
+        ay = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
+        # NOTE(review): only the second operand is halved here (x2 - x1 * 0.5);
+        # confirm this matches the kernel's definition of width/height.
+        aw = (rois[:, 2] - rois[:, 0] * 0.5).unsqueeze(1).expand_as(dw)
+        ah = (rois[:, 3] - rois[:, 1] * 0.5).unsqueeze(1).expand_as(dh)
+
+        pw = aw * dw.exp()
+        ph = ah * dh.exp()
+        # Keyword ``value`` replaces the deprecated positional-scalar overload.
+        px = torch.addcmul(ax, aw, dx, value=1)
+        py = torch.addcmul(ay, ah, dy, value=1)
+
+        x1 = px - pw * 0.5 + 0.5
+        y1 = py - ph * 0.5 + 0.5
+        x2 = px + pw * 0.5 - 0.5
+        y2 = py + ph * 0.5 - 0.5
+
+        if max_shape is not None:
+            x1 = torch.clamp(x1, min=0, max=(max_shape[1] - 1))
+            y1 = torch.clamp(y1, min=0, max=(max_shape[0] - 1))
+            x2 = torch.clamp(x2, min=0, max=(max_shape[1] - 1))
+            y2 = torch.clamp(y2, min=0, max=(max_shape[0] - 1))
+        boxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
+        return boxes
+
+    def custom_op_exec(self, rois, deltas, means0, means1, means2, means3,
+                       stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Run the torch-op reference and return its output as numpy."""
+        output = self.npu_bounding_box_decode(rois, deltas, means0, means1,
+                                              means2, means3, stds0, stds1,
+                                              stds2, stds3, max_shape, wh_ratio_clip)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec(self, rois, deltas, means0, means1, means2, means3,
+                    stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Run the custom op and return its output as numpy."""
+        output = ads.common.npu_bounding_box_decode(rois, deltas, means0, means1,
+                                                   means2, means3, stds0, stds1,
+                                                   stds2, stds3, max_shape, wh_ratio_clip)
+        return output.to("cpu").numpy()
+
+    def test_decode_shape_format_fp32(self):
+        """Custom op and reference must agree on a float32 input."""
+        input1 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                              dtype=torch.float32).to("npu")
+        input2 = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                              dtype=torch.float32).to("npu")
+
+        npu_output = self.npu_op_exec(input1, input2, 0, 0, 0, 0,
+                                      1, 1, 1, 1, (10, 10), 0.1)
+        custom_output = self.custom_op_exec(input1, input2, 0, 0, 0, 0,
+                                            1, 1, 1, 1, (10, 10), 0.1)
+        self.assertRtolEqual(npu_output, custom_output)
+
+    def test_decode_shape_format_fp16(self):
+        """Custom op and reference must agree on a float16 input."""
+        input1_fp16 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                                   dtype=torch.float16).to("npu")
+        input2_fp16 = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                                   dtype=torch.float16).to("npu")
+
+        npu_output = self.npu_op_exec(input1_fp16, input2_fp16, 0, 0, 0, 0,
+                                      1, 1, 1, 1, (10, 10), 0.1)
+        custom_output = self.custom_op_exec(input1_fp16, input2_fp16, 0, 0, 0, 0,
+                                            1, 1, 1, 1, (10, 10), 0.1)
+        self.assertRtolEqual(npu_output, custom_output)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_bounding_box_encode.py b/tests/test_npu_bounding_box_encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6a406b68c18c2d414717402c75833542901117b
--- /dev/null
+++ b/tests/test_npu_bounding_box_encode.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestBoundingBoxEncode(TestCase):
+    """Compare ads.common.npu_bounding_box_encode with a plain torch encoder."""
+
+    def npu_bounding_box_encode(self, anchor_box, ground_truth_box, means0, means1,
+                                means2, means3, stds0, stds1, stds2, stds3):
+        """Reference encoder: normalized [dx, dy, dw, dh] deltas built from torch ops."""
+        eps = 1e-7
+        ax1, ay1, ax2, ay2 = (anchor_box[..., i] for i in range(4))
+        gx1, gy1, gx2, gy2 = (ground_truth_box[..., i] for i in range(4))
+
+        # Box centres and sizes; widths and heights carry a +1.0 offset.
+        anchor_cx, anchor_cy = (ax1 + ax2) * 0.5, (ay1 + ay2) * 0.5
+        anchor_w, anchor_h = ax2 - ax1 + 1.0, ay2 - ay1 + 1.0
+        truth_cx, truth_cy = (gx1 + gx2) * 0.5, (gy1 + gy2) * 0.5
+        truth_w, truth_h = gx2 - gx1 + 1.0, gy2 - gy1 + 1.0
+
+        # eps is added to the anchor width/height before every division.
+        components = [
+            (truth_cx - anchor_cx) / (anchor_w + eps),
+            (truth_cy - anchor_cy) / (anchor_h + eps),
+            torch.log(torch.abs(truth_w) / torch.abs(anchor_w + eps)),
+            torch.log(torch.abs(truth_h) / torch.abs(anchor_h + eps)),
+        ]
+        deltas = torch.stack(components, dim=-1)
+
+        mean_row = deltas.new_tensor([means0, means1, means2, means3]).unsqueeze(0)
+        std_row = deltas.new_tensor([stds0, stds1, stds2, stds3]).unsqueeze(0)
+        # In-place normalization; the (1, 4) rows broadcast against deltas.
+        deltas.sub_(mean_row)
+        deltas.div_(std_row)
+        return deltas
+
+    def custom_op_exec(self, anchor_box, ground_truth_box, means0, means1,
+                       means2, means3, stds0, stds1, stds2, stds3):
+        """Run the torch reference encoder and return its result as a numpy array."""
+        encoded = self.npu_bounding_box_encode(
+            anchor_box, ground_truth_box, means0, means1, means2, means3, stds0, stds1, stds2, stds3)
+        return encoded.to("cpu").numpy()
+
+    def npu_op_exec(self, anchor_box, ground_truth_box, means0, means1,
+                    means2, means3, stds0, stds1, stds2, stds3):
+        """Run ads.common.npu_bounding_box_encode and return its result as a numpy array."""
+        encoded = ads.common.npu_bounding_box_encode(
+            anchor_box, ground_truth_box, means0, means1, means2, means3, stds0, stds1, stds2, stds3)
+        return encoded.to("cpu").numpy()
+
+    def test_encode_shape_format_fp32(self):
+        # float32 case: op output vs. reference output, assertRtolEqual with 1e-3.
+        anchors = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float32)
+        truths = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype=torch.float32)
+        anchors, truths = anchors.to("npu"), truths.to("npu")
+        norm_args = (0, 0, 0, 0, 0.1, 0.1, 0.2, 0.2)  # means0..3 then stds0..3
+
+        from_op = self.npu_op_exec(anchors, truths, *norm_args)
+        from_reference = self.custom_op_exec(anchors, truths, *norm_args)
+        self.assertRtolEqual(from_op, from_reference, 1e-3)
+
+    def test_encode_shape_format_fp16(self):
+        # float16 case: op output vs. reference output, assertRtolEqual with 1e-3.
+        anchors = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float16)
+        truths = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype=torch.float16)
+        anchors, truths = anchors.to("npu"), truths.to("npu")
+        norm_args = (0, 0, 0, 0, 0.1, 0.1, 0.2, 0.2)  # means0..3 then stds0..3
+
+        from_op = self.npu_op_exec(anchors, truths, *norm_args)
+        from_reference = self.custom_op_exec(anchors, truths, *norm_args)
+        self.assertRtolEqual(from_op, from_reference, 1e-3)
+
+
+if __name__ == "__main__":  # only when this file is executed directly, not on import
+    run_tests()
diff --git a/tests/test_npu_broadcast.py b/tests/test_npu_broadcast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44badc1f522c195cdf505c166e9600c55f79dc4
--- /dev/null
+++ b/tests/test_npu_broadcast.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+import torch_npu
+import ads.common
+from torch_npu.testing.testcase import TestCase, run_tests
+
+
+class TestNpuBroadcast(TestCase):
+    """Compare ads.common.npu_broadcast with torch.broadcast_to."""
+
+    def custom_op_exec(self, input1, shape):
+        """Broadcast input1 to shape with torch.broadcast_to; return a numpy array."""
+        expanded = torch.broadcast_to(input1, shape)
+        return expanded.to("cpu").numpy()
+
+    def npu_op_exec(self, input1, size):
+        """Broadcast input1 to size with ads.common.npu_broadcast; return a numpy array."""
+        expanded = ads.common.npu_broadcast(input1, size)
+        return expanded.to("cpu").numpy()
+
+    def test_npu_broadcast(self):
+        # A shape-(3,) and a shape-(3, 1) tensor are each broadcast to (3, 3).
+        target = (3, 3)
+        samples = [torch.tensor(v).npu() for v in ([1, 2, 3], [[1], [2], [3]])]
+
+        for sample in samples:
+            reference = self.custom_op_exec(sample, target)
+            npu_result = self.npu_op_exec(sample, target)
+            self.assertRtolEqual(reference, npu_result)
+
+
+if __name__ == "__main__":  # only when this file is executed directly, not on import
+    run_tests()