diff --git a/ads/common/__init__.py b/ads/common/__init__.py
index 021418f12d15c708b799729a39fd351db043dd69..d13daf5c50776ae33ae2045fb0fc64e1f0ac9bd0 100644
--- a/ads/common/__init__.py
+++ b/ads/common/__init__.py
@@ -22,3 +22,4 @@ from .ops.npu_batch_nms import npu_batch_nms
 from .ops.npu_confusion_transpose import npu_confusion_transpose
 from .ops.npu_broadcast import npu_broadcast
 from .ops.npu_moe_tutel import npu_moe_tutel
+from .ops.iou import npu_iou
diff --git a/ads/common/ops/csrc/IouKernelNpu.cpp b/ads/common/ops/csrc/IouKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ff33d4f54dc6788e6530d3f3c071e896872b385
--- /dev/null
+++ b/ads/common/ops/csrc/IouKernelNpu.cpp
@@ -0,0 +1,59 @@
+// Copyright (c) 2023 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/core/npu/NPUFormat.h"
+#include "functions.h"
+
+at::Tensor npu_iou(
+    const at::Tensor& bboxes,
+    const at::Tensor& gtboxes,
+    int64_t mode)
+{
+    at::Tensor bboxes_fp16 = bboxes;
+    if (bboxes.scalar_type() != at::ScalarType::Half) {
+        bboxes_fp16 = bboxes.to(at::kHalf);
+    }
+    at::Tensor gtboxes_fp16 = gtboxes;
+    if (gtboxes.scalar_type() != at::ScalarType::Half) {
+        gtboxes_fp16 = gtboxes.to(at::kHalf);
+    }
+
+    auto output_size = {gtboxes.size(0), bboxes.size(0)};
+    at::Tensor overlap = at_npu::native::empty_with_format(
+        output_size,
+        bboxes_fp16.options(),
+        at_npu::native::get_npu_format(bboxes));
+    string mode_str = "iou";
+    if (mode == 1) {
+        mode_str = "iof";
+    }
+
+    at_npu::native::OpCommand cmd;
+    cmd.Name("Iou")
+        .Input(bboxes_fp16)
+        .Input(gtboxes_fp16)
+        .Output(overlap)
+        .Attr("mode", mode_str)
+        .Attr("eps", static_cast<float>(0.01))
+        .Run();
+
+    if (overlap.scalar_type() != bboxes.scalar_type()) {
+        overlap = overlap.to(bboxes.scalar_type());
+    }
+    return overlap;
+}
diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h
index f713b98400c6d2168727ff675f61b2c73a086b74..b18b6c5d70e65af6bfff043cfe7b753890759390 100644
--- a/ads/common/ops/csrc/functions.h
+++ b/ads/common/ops/csrc/functions.h
@@ -132,5 +132,6 @@ at::Tensor npu_moe_tutel_gate_backward(
     const at::Tensor &y_grad,
     const at::Tensor &indices,
     const at::Tensor &locations);
+at::Tensor npu_iou(const at::Tensor& bboxes, const at::Tensor& gtboxes, int64_t mode);
 
 #endif // __FUNCTIONS_H__
diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp
index c91e4093ea2c93a19f213e43557b3cdd6d25cb1e..1a5670cd8ee1c7570ae1bcaef2e64d30cd1e1e9c 100644
--- a/ads/common/ops/csrc/pybind.cpp
+++ b/ads/common/ops/csrc/pybind.cpp
@@ -71,4 +71,7 @@ void init_common(pybind11::module &m)
     m.def("npu_moe_tutel", &npu_moe_tutel, "npu_moe_tutel NPU version");
     m.def("npu_moe_tutel_data_backward", &npu_moe_tutel_data_backward, "npu_moe_tutel_data_backward NPU version");
     m.def("npu_moe_tutel_gate_backward", &npu_moe_tutel_gate_backward, "npu_moe_tutel_gate_backward NPU version");
+
+    // iou
+    m.def("npu_iou", &npu_iou);
 }
diff --git a/ads/common/ops/iou.py b/ads/common/ops/iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e95086f7507e4ed69c2ad00078b74e7c0e5ab14
--- /dev/null
+++ b/ads/common/ops/iou.py
@@ -0,0 +1,5 @@
+import torch
+import torch_npu
+import ads_c
+
+npu_iou = ads_c.npu_iou
diff --git a/tests/test_npu_iou.py b/tests/test_npu_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..150d8a35d37b9aca5e2c6dfcc9f1aae23ca74c07
--- /dev/null
+++ b/tests/test_npu_iou.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020, Huawei Technologies. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestNpuIou(TestCase):
+    def ads_op_exec(self, bboxes, gtboxes, mode=0):
+        output = ads.common.npu_iou(bboxes, gtboxes, mode)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, bboxes, gtboxes, mode=0):
+        output = torch_npu.npu_iou(bboxes, gtboxes, mode)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_iou_fp16(self):
+        bboxes = torch.tensor([[0, 0, 10, 10],
+                               [10, 10, 20, 20],
+                               [32, 32, 38, 42]], dtype=torch.float16).to("npu")
+        gtboxes = torch.tensor([[0, 0, 10, 20],
+                                [0, 10, 10, 10],
+                                [10, 10, 20, 20]], dtype=torch.float16).to("npu")
+
+        output_npu = self.npu_op_exec(bboxes, gtboxes, 1)
+        output_ads = self.ads_op_exec(bboxes, gtboxes, 1)
+        self.assertRtolEqual(output_npu, output_ads)
+
+        output_npu = self.npu_op_exec(bboxes, gtboxes)
+        output_ads = self.ads_op_exec(bboxes, gtboxes)
+        self.assertRtolEqual(output_npu, output_ads)
+
+    def test_iou_fp16_pt(self):
+        bboxes = torch.tensor([[1, 2, 3, 4],
+                               [5, 6, 7, 8],
+                               [9, 10, 11, 12],
+                               [13, 14, 15, 16]], dtype=torch.float16).npu()
+        gtboxes = torch.tensor([[1, 2, 3, 4],
+                                [5, 6, 7, 8]], dtype=torch.float16).npu()
+
+        output_npu = self.npu_op_exec(bboxes, gtboxes, 1)
+        output_ads = self.ads_op_exec(bboxes, gtboxes, 1)
+        self.assertRtolEqual(output_npu, output_ads)
+
+        output_npu = self.npu_op_exec(bboxes, gtboxes)
+        output_ads = self.ads_op_exec(bboxes, gtboxes)
+        self.assertRtolEqual(output_npu, output_ads)
+
+
+if __name__ == "__main__":
+    run_tests()