From 60eefa506a1baf24887039decd986e0699ed7c5c Mon Sep 17 00:00:00 2001 From: dingizyuan Date: Thu, 28 Aug 2025 19:19:24 +0800 Subject: [PATCH] =?UTF-8?q?SubmSparseConv3d=E7=AE=97=E5=AD=901x1x1?= =?UTF-8?q?=E5=8D=B7=E7=A7=AF=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- kernels/op_kernel/subm_sparse_conv3d_v2.cpp | 9 ++++-- mx_driving/csrc/SubmSparseCov3d.cpp | 6 ++-- tests/torch/test_subm_sparse_conv3d.py | 35 +++++++++++++++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/kernels/op_kernel/subm_sparse_conv3d_v2.cpp b/kernels/op_kernel/subm_sparse_conv3d_v2.cpp index 1b1ddc99..596cd403 100644 --- a/kernels/op_kernel/subm_sparse_conv3d_v2.cpp +++ b/kernels/op_kernel/subm_sparse_conv3d_v2.cpp @@ -19,6 +19,7 @@ constexpr uint8_t SRC_PARTTEN_1 = 4; constexpr uint8_t SRC_PARTTEN_2 = 5; constexpr uint8_t SRC_PARTTEN_3 = 6; constexpr uint8_t MAP_VAL_FLOAT_BUF_LENGTH = 3; +constexpr uint8_t K2_SIZE_1 = 1; constexpr uint8_t K2_SIZE_3 = 3; constexpr uint8_t K2_SIZE_5 = 5; constexpr int8_t K2_IDX_0 = 0; @@ -254,7 +255,9 @@ public: for (int8_t k0Idx = 0; k0Idx < k0_; k0Idx++) { innerKernelOffset = k0Idx * k1_ * k2Aligned_; for (int8_t k1Idx = 0; k1Idx < k1_; k1Idx++) { - if (k2_ == K2_SIZE_3) { + if (k2_ == K2_SIZE_1) { + ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_0, mapValLocal_.GetValue(innerKernelOffset)); + } else if (k2_ == K2_SIZE_3) { ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_0, mapValLocal_.GetValue(innerKernelOffset)); ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_1, mapValLocal_.GetValue(innerKernelOffset + MAP2_OFFSET_1)); ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_2, mapValLocal_.GetValue(innerKernelOffset + MAP2_OFFSET_2)); @@ -302,7 +305,9 @@ public: int8_t k1Idx = mapIdx % k1Aligned_; int32_t map2Offset = map1Val * spatialShape2_ + spatial2BaseIdx; - if (k2_ == K2_SIZE_3) { + if (k2_ == K2_SIZE_1) { + ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_0, map2GM_.GetValue(map2Offset)); + } else if (k2_ == K2_SIZE_3) { ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_0, map2GM_.GetValue(map2Offset)); ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_1, map2GM_.GetValue(map2Offset + MAP2_OFFSET_1)); ProcessOnePoint(i, k0Idx, k1Idx, K2_IDX_2, map2GM_.GetValue(map2Offset + MAP2_OFFSET_2)); diff --git a/mx_driving/csrc/SubmSparseCov3d.cpp b/mx_driving/csrc/SubmSparseCov3d.cpp index 2940075b..779058ad 100644 --- a/mx_driving/csrc/SubmSparseCov3d.cpp +++ b/mx_driving/csrc/SubmSparseCov3d.cpp @@ -19,6 +19,7 @@ namespace { constexpr size_t TOTAL_CAPACITY = 8; + constexpr uint8_t KERNEL_SIZE_1 = 1; constexpr uint8_t KERNEL_SIZE_3 = 3; constexpr uint8_t KERNEL_SIZE_5 = 5; constexpr uint32_t KERNEL_SIZE_IDX_0 = 0; @@ -62,9 +63,10 @@ std::tuple npu_subm_sparse_conv3d_v2(const at::Tensor& f { TORCH_CHECK_NPU(feature); TORCH_CHECK_NPU(indices); - TORCH_CHECK((kernel_size[KERNEL_SIZE_IDX_0] == KERNEL_SIZE_3 && kernel_size[KERNEL_SIZE_IDX_1] == KERNEL_SIZE_3 && kernel_size[KERNEL_SIZE_IDX_2] == KERNEL_SIZE_3) || + TORCH_CHECK((kernel_size[KERNEL_SIZE_IDX_0] == KERNEL_SIZE_1 && kernel_size[KERNEL_SIZE_IDX_1] == KERNEL_SIZE_1 && kernel_size[KERNEL_SIZE_IDX_2] == KERNEL_SIZE_1) || + (kernel_size[KERNEL_SIZE_IDX_0] == KERNEL_SIZE_3 && kernel_size[KERNEL_SIZE_IDX_1] == KERNEL_SIZE_3 && kernel_size[KERNEL_SIZE_IDX_2] == KERNEL_SIZE_3) || (kernel_size[KERNEL_SIZE_IDX_0] == KERNEL_SIZE_5 && kernel_size[KERNEL_SIZE_IDX_1] == KERNEL_SIZE_5 && kernel_size[KERNEL_SIZE_IDX_2] == KERNEL_SIZE_5), - "kernel size current only support (3, 3, 3) and (5, 5, 5) but got: (", + "kernel size current only support (1, 1, 1), (3, 3, 3) and (5, 5, 5) but got: (", kernel_size[KERNEL_SIZE_IDX_0], ", ", kernel_size[KERNEL_SIZE_IDX_1], ", ", kernel_size[KERNEL_SIZE_IDX_2], ")"); auto indices_size = indices.sizes(); diff --git a/tests/torch/test_subm_sparse_conv3d.py b/tests/torch/test_subm_sparse_conv3d.py index 741f7ecd..4cf778c1 100644 --- a/tests/torch/test_subm_sparse_conv3d.py +++ b/tests/torch/test_subm_sparse_conv3d.py @@ -254,6 +254,41 @@ class TestSubmSparseConv3d(TestCase): res, golden = get_output(num_points, batch_size, in_channels, out_channels, kernel_size, out_spatial_shape, dtype) self.assertRtolEqual(golden, res, 1e-3, 1e-3) + def test_1x1_small_spatial_shape(self): + num_points = [20000] + out_spatial_shape = [1180, 180, 5] + in_channels = 32 + out_channels = 64 + kernel_size = 1 + batch_size = len(num_points) + + res, golden = get_output(num_points, batch_size, in_channels, out_channels, kernel_size, out_spatial_shape) + self.assertRtolEqual(golden, res, 1e-3, 1e-3) + + def test_1x1_small_spatial_shape_fp16(self): + num_points = [20000] + out_spatial_shape = [1180, 180, 5] + in_channels = 32 + out_channels = 64 + kernel_size = 1 + batch_size = len(num_points) + dtype = torch.float16 + + res, golden = get_output(num_points, batch_size, in_channels, out_channels, kernel_size, out_spatial_shape, dtype) + self.assertRtolEqual(golden, res, 1e-3, 1e-3) + + def test_1x1_large_spatial_shape_fp16(self): + num_points = [10000] + out_spatial_shape = [3280, 2480, 500] + in_channels = 128 + out_channels = 256 + kernel_size = 1 + batch_size = len(num_points) + dtype = torch.float16 + + res, golden = get_output(num_points, batch_size, in_channels, out_channels, kernel_size, out_spatial_shape, dtype) + self.assertRtolEqual(golden, res, 1e-3, 1e-3) + if __name__ == "__main__": np.random.seed(100) run_tests() \ No newline at end of file -- Gitee