diff --git a/ads/common/__init__.py b/ads/common/__init__.py index c4dcc62e68706731525794c75d57c26786658613..8e59829b5732c061d84fec33f3401d83ca9257a3 100644 --- a/ads/common/__init__.py +++ b/ads/common/__init__.py @@ -14,3 +14,10 @@ from .ops.silu import npu_silu from .ops.silu import npu_silu_ from .ops.rotary_mul import npu_rotary_mul from .ops.npu_abs import npu_abs +from .ops.fast_gelu import fast_gelu +from .ops.npu_anchor_response_flags import npu_anchor_response_flags +from .ops.npu_bounding_box_decode import npu_bounding_box_decode +from .ops.npu_bounding_box_encode import npu_bounding_box_encode +from .ops.npu_batch_nms import npu_batch_nms +from .ops.npu_confusion_transpose import npu_confusion_transpose +from .ops.npu_broadcast import npu_broadcast diff --git a/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp b/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f414633cc6ba78e6a9fdb8cdb80dceb64f23c8e4 --- /dev/null +++ b/ads/common/ops/csrc/AnchorResponseFlagsKernelNpu.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "torch_npu/csrc/framework/OpCommand.h" +#include "common.h" + +namespace { +c10::SmallVector infersize_npu_anchor_response_flags( + at::IntArrayRef featmap_size, + int64_t num_base_anchors) +{ + int64_t output_value = featmap_size[0] * featmap_size[1] * num_base_anchors; + c10::SmallVector output_size = {output_value}; + return output_size; +} + +inline void anchor_response_flags_check( + const at::Tensor& self, + at::IntArrayRef featmap_size, + at::IntArrayRef stride) +{ + TORCH_CHECK( + featmap_size.size() == 2, + "expected feat_map_size equals to 2, but got size ", + featmap_size.size()); + TORCH_CHECK( + self.dim() == 2 && self.size(1) == 4, + "Non-empty 2D gt_bboxes tensor expected but got a tensor with sizes ", + self.sizes()); + TORCH_CHECK( + self.scalar_type() == at::kHalf || self.scalar_type() == at::kFloat, + "float16 or float32 tensor expected but got a tensor with dtype: ", + self.scalar_type()); +} +} // namespace + +at::Tensor npu_anchor_response_flags( + const at::Tensor& self, + at::IntArrayRef featmap_size, + at::IntArrayRef stride, + int64_t num_base_anchors) +{ + anchor_response_flags_check(self, featmap_size, stride); + auto output_size = infersize_npu_anchor_response_flags(featmap_size, num_base_anchors); + auto options = self.options().dtype(at::kByte); + at::Tensor result = at::empty(output_size, options); + + at::Tensor self_cp = self.to(at::kFloat); + + at_npu::native::OpCommand cmd; + cmd.Name("AnchorResponseFlags") + .Input(self_cp) + .Output(result) + .Attr("featmap_size", featmap_size) + .Attr("strides", stride) + .Attr("num_base_anchors", num_base_anchors) + .Run(); + return result; +} diff --git a/ads/common/ops/csrc/BatchNms.cpp b/ads/common/ops/csrc/BatchNms.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7051437f9fedc0279108ff783e9637499ee6e74 --- /dev/null +++ b/ads/common/ops/csrc/BatchNms.cpp @@ -0,0 +1,49 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, 
Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/OpCommand.h" +#include "common.h" + +std::tuple npu_batch_nms( + const at::Tensor& self, + const at::Tensor& scores, + double score_threshold, + double iou_threshold, + int64_t max_size_per_class, + int64_t max_total_size, + bool change_coordinate_frame, + bool transpose_box) +{ + at::Tensor nmsed_boxes = at::empty({self.size(0), max_total_size, 4}, self.options()); + at::Tensor nmsed_scores = at::empty({self.size(0), max_total_size}, self.options()); + at::Tensor nmsed_classes = at::empty({self.size(0), max_total_size}, self.options()); + at::Tensor nmsed_num = at::empty({self.size(0)}, self.options().dtype(at::kInt)); + at_npu::native::OpCommand cmd; + cmd.Name("BatchMultiClassNonMaxSuppression") + .Input(self) + .Input(scores) + .Output(nmsed_boxes) + .Output(nmsed_scores) + .Output(nmsed_classes) + .Output(nmsed_num) + .Attr("score_threshold", static_cast(score_threshold)) + .Attr("iou_threshold", static_cast(iou_threshold)) + .Attr("max_size_per_class", max_size_per_class) + .Attr("max_total_size", max_total_size) + .Attr("change_coordinate_frame", change_coordinate_frame) + .Attr("transpose_box", transpose_box) + .Run(); + return std::tie(nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num); +} diff --git a/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp b/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp new file mode 
100644 index 0000000000000000000000000000000000000000..85fc07643d9059a23d6a549f08451f5cd5cbdaac --- /dev/null +++ b/ads/common/ops/csrc/BoundingBoxDecodeKernelNpu.cpp @@ -0,0 +1,57 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "torch_npu/csrc/framework/OpCommand.h" +#include "common.h" + +at::Tensor npu_bounding_box_decode( + const at::Tensor& rois, + const at::Tensor& deltas, + double means0, + double means1, + double means2, + double means3, + double stds0, + double stds1, + double stds2, + double stds3, + at::IntArrayRef max_shape, + double wh_ratio_clip) +{ + c10::SmallVector output_size = {rois.size(0), 4}; + at::Tensor result = at::empty(output_size, rois.options()); + c10::SmallVector means = { + static_cast(means0), + static_cast(means1), + static_cast(means2), + static_cast(means3)}; + c10::SmallVector stds = { + static_cast(stds0), + static_cast(stds1), + static_cast(stds2), + static_cast(stds3)}; + at_npu::native::OpCommand cmd; + cmd.Name("BoundingBoxDecode") + .Input(rois) + .Input(deltas) + .Output(result) + .Attr("means", means) + .Attr("stds", stds) + .Attr("max_shape", max_shape) + .Attr("wh_ratio_clip", static_cast(wh_ratio_clip)) + .Run(); + return result; +} diff --git a/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp b/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..aa5bad77d2a6b5f8cc3ffc63b917a952b1f97eec --- /dev/null +++ b/ads/common/ops/csrc/BoundingBoxEncodeKernelNpu.cpp @@ -0,0 +1,51 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/OpCommand.h" +#include "common.h" + +at::Tensor npu_bounding_box_encode( + const at::Tensor& anchor_box, + const at::Tensor& ground_truth_box, + double means0, + double means1, + double means2, + double means3, + double stds0, + double stds1, + double stds2, + double stds3) +{ + at::Tensor result = at::empty({anchor_box.size(0), 4}, anchor_box.options()); + c10::SmallVector means = { + static_cast(means0), + static_cast(means1), + static_cast(means2), + static_cast(means3)}; + c10::SmallVector stds = { + static_cast(stds0), + static_cast(stds1), + static_cast(stds2), + static_cast(stds3)}; + at_npu::native::OpCommand cmd; + cmd.Name("BoundingBoxEncode") + .Input(anchor_box) + .Input(ground_truth_box) + .Output(result) + .Attr("means", means) + .Attr("stds", stds) + .Run(); + return result; +} diff --git a/ads/common/ops/csrc/BroadCastKernelNpu.cpp b/ads/common/ops/csrc/BroadCastKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b48580342553a5c1b2711dff770b24ee38fe13e1 --- /dev/null +++ b/ads/common/ops/csrc/BroadCastKernelNpu.cpp @@ -0,0 +1,49 @@ +// Copyright 
(c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/OpCommand.h" + + +namespace { +at::Tensor& npu_broadcast_out_nocheck(at::Tensor& result, const at::Tensor& self, at::IntArrayRef size) +{ + at_npu::native::OpCommand cmd; + cmd.Name("BroadcastTo") + .Input(self) + .Input(size) + .Output(result) + .Run(); + return result; +} +} // namespace + +at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result) +{ + npu_broadcast_out_nocheck(result, self, size); + + return result; +} + +at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size) +{ + at::Tensor self_cp = self.dtype() == at::kBool ? self.to(at::kInt) : self; + at::Tensor result = at::empty(size, self_cp.options()); + npu_broadcast_out_nocheck(result, self_cp, size); + + if (self.dtype() == at::kBool) { + result = result.to(at::kBool); + } + return result; +} diff --git a/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp b/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a12d1d7c87d2be696e4e89adbea4dca5362836d9 --- /dev/null +++ b/ads/common/ops/csrc/ConfusionTransposeKernelNpu.cpp @@ -0,0 +1,97 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "torch_npu/csrc/framework/OpCommand.h" +#include "common.h" + +at::Tensor npu_confusion_transpose( + const at::Tensor& self, + at::IntArrayRef perm, + at::IntArrayRef shape, + bool transpose_first) +{ + c10::SmallVector output_size; + if (transpose_first) { + output_size = array_to_small_vector(shape); + } else { + auto shape_size = shape.size(); + for (uint i = 0; i < perm.size(); i++) { + TORCH_CHECK(shape_size > perm[i], "npu_confusion_transpose input invalid, " + "shape has size ", + shape_size, " but perm[i] is, ", perm[i]); + output_size.emplace_back(shape[perm[i]]); + } + } + + at::Tensor result = at::empty(output_size, self.options()); + at_npu::native::OpCommand cmd; + cmd.Name("ConfusionTransposeD") + .Input(self) + .Output(result) + .Attr("perm", perm) + .Attr("shape", shape) + .Attr("transpose_first", transpose_first) + .Run(); + + return result; +} + +void check_confusion_transpose_perm(at::IntArrayRef perm, at::IntArrayRef shape) +{ + auto input_dim = shape.size(); + TORCH_CHECK(perm.size() == input_dim, "The length of perm should be the same as shape."); + std::vector seen(input_dim); + for (const auto i : c10::irange(input_dim)) { + auto dim = at::maybe_wrap_dim(perm[i], input_dim); + TORCH_CHECK(!seen[dim], "Repeated dim in perm"); + seen[dim] = true; + } +} + +at::Tensor npu_confusion_transpose_backward( + const at::Tensor& grad, + at::IntArrayRef perm, + at::IntArrayRef shape, + bool 
transpose_first) +{ + c10::SmallVector svec_shape; + if (transpose_first) { + svec_shape = array_to_small_vector(shape); + } else { + check_confusion_transpose_perm(perm, shape); + for (int i = 0; i < perm.size(); i++) { + svec_shape.emplace_back(shape[perm[i]]); + } + } + std::vector vec_perm; + int64_t perm_len = perm.size(); + int64_t temp_perm[perm_len] = {0}; + for (int64_t i = 0; i < perm_len; i++) { + temp_perm[perm[i]] = i; + } + vec_perm = std::vector(temp_perm, temp_perm+perm_len); + perm = at::IntArrayRef(vec_perm); + at::Tensor result = at::empty(shape, grad.options()); + + at_npu::native::OpCommand cmd; + cmd.Name("ConfusionTransposeD") + .Input(grad) + .Output(result) + .Attr("perm", perm) + .Attr("shape", svec_shape) + .Attr("transpose_first", transpose_first) + .Run(); + return result; +} diff --git a/ads/common/ops/csrc/FastGeluKernelNpu.cpp b/ads/common/ops/csrc/FastGeluKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a56dcb7a0674cf7c33f84bd261675f04d340db60 --- /dev/null +++ b/ads/common/ops/csrc/FastGeluKernelNpu.cpp @@ -0,0 +1,52 @@ +// Copyright (c) 2023 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "torch_npu/csrc/framework/OpCommand.h" + +namespace { +at::Tensor& fast_gelu_backward_npu_nocheck( + at::Tensor& grad_input, + const at::Tensor& grad, + const at::Tensor& self) +{ + at_npu::native::OpCommand cmd; + cmd.Name("FastGeluGrad") + .Input(grad) + .Input(self) + .Output(grad_input) + .Run(); + return grad_input; +} +} // namespace + +at::Tensor npu_fast_gelu(const at::Tensor& self) +{ + at::Tensor result = at::empty(self.sizes(), self.options()); + at_npu::native::OpCommand cmd; + cmd.Name("FastGelu") + .Input(self) + .Output(result) + .Run(); + return result; +} + +at::Tensor npu_fast_gelu_backward( + const at::Tensor& grad, + const at::Tensor& self) +{ + at::Tensor grad_input = at::empty(self.sizes(), self.options()); + fast_gelu_backward_npu_nocheck(grad_input, grad, self); + return grad_input; +} diff --git a/ads/common/ops/csrc/NpuSilu.cpp b/ads/common/ops/csrc/NpuSilu.cpp index 0b81c0f426f6b0d0a7954666ad141ae9eac4e502..4f62cad5862989d2a3259b2a6852b749767383a8 100644 --- a/ads/common/ops/csrc/NpuSilu.cpp +++ b/ads/common/ops/csrc/NpuSilu.cpp @@ -1,18 +1,7 @@ -#include - #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" #include "common.h" -using torch::autograd::AutogradContext; -using torch::autograd::Function; -using npu_preparation = at_npu::native::OpPreparation; -using npu_utils = at_npu::native::NpuUtils; -using tensor_list = std::vector; - at::Tensor &silu_out_npu_nocheck(at::Tensor &result, const at::Tensor &self) { at_npu::native::OpCommand cmd; @@ -80,4 +69,4 @@ at::Tensor &npu_silu_(at::Tensor &self) { silu_out_npu(self, self); return self; -} \ No newline at end of file +} diff --git a/ads/common/ops/csrc/RotaryMulKernelNpu.cpp b/ads/common/ops/csrc/RotaryMulKernelNpu.cpp index 
3e814e4e96956c28c1f4ef29c949db742da1e919..055693091b9e88d8c1b66cfe078cee143b8e43fc 100644 --- a/ads/common/ops/csrc/RotaryMulKernelNpu.cpp +++ b/ads/common/ops/csrc/RotaryMulKernelNpu.cpp @@ -14,18 +14,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/aten/CustomFunctions.h" #include "functions.h" #include "common.h" -using npu_preparation = at_npu::native::OpPreparation; -using torch::autograd::Function; -using torch::autograd::AutogradContext; using tensor_tuple = std::tuple; namespace { @@ -42,7 +34,7 @@ at::Tensor &rotary_mul_nocheck(at::Tensor &y, const at::Tensor &x, const at::Ten return y; } -tensor_tuple rotary_mul_backward_nocheck(at::Tensor &dx, at::Tensor &dr1, at::Tensor &dr2, const at::Tensor &x, +tensor_tuple rotary_mul_backward_nocheck(at::Tensor &dx, at::Tensor &dr1, at::Tensor &dr2, const at::Tensor &x, const at::Tensor &r1, const at::Tensor &r2, const at::Tensor &dy) { TORCH_CHECK(x.dim() == 4, "The dim of input tensor [x] shoule equal to four."); diff --git a/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp b/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp index db949fc984bb204bd3fd7ff8d3700f429b7b5bdd..0e8aa592f4f7285c6b8f78cc89125e5722615b20 100644 --- a/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp +++ b/ads/common/ops/csrc/RotatedBoxDecodeKernelNpu.cpp @@ -15,13 +15,8 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; -using npu_utils = at_npu::native::NpuUtils; at::Tensor npu_rotated_box_decode(const at::Tensor &self, const at::Tensor &deltas, const at::Tensor &weight) { diff --git a/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp b/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp index cfe515bac88e0d668483c1a2e88e1172c34d3db3..865b994b622f123e71875831977ef48ffeb2df54 100644 --- a/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp +++ b/ads/common/ops/csrc/RotatedBoxEncodeKernelNpu.cpp @@ -15,13 +15,8 @@ // limitations under the License. #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; - at::Tensor npu_rotated_box_encode( const at::Tensor &self, const at::Tensor >Box, diff --git a/ads/common/ops/csrc/RotatedIouKernelNpu.cpp b/ads/common/ops/csrc/RotatedIouKernelNpu.cpp index 7c8c334a8ec52fd01d7ff5c5a71b85bdd1f383e6..dc94943dcda69ae9cdcb27d330bc1c8ae53212e9 100644 --- a/ads/common/ops/csrc/RotatedIouKernelNpu.cpp +++ b/ads/common/ops/csrc/RotatedIouKernelNpu.cpp @@ -15,14 +15,8 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/aten/CustomFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; - namespace { at::Tensor &rotated_iou_npu_nocheck( at::Tensor &iou, diff --git a/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp b/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp index ac476c6299673125071391f1549796457f0db397..0a957ca9b8986df6e4c4b3c9cbd8330896e3bd50 100644 --- a/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp +++ b/ads/common/ops/csrc/RotatedOverlapsKernelNpu.cpp @@ -15,14 +15,8 @@ // limitations under the License. #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/aten/CustomFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; - namespace { at::Tensor &rotated_overlaps_npu_nocheck( at::Tensor &overlaps, diff --git a/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp b/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp index c4e943842d775364a5ba471deae2589945cf1f48..f3b11664cc4fb99fd03dbaa7b7dd4f3b8be6e8c7 100644 --- a/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp +++ b/ads/common/ops/csrc/ScatterMaxKernelNpu.cpp @@ -1,16 +1,8 @@ -#include - #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" +#include "common.h" -using namespace at; using namespace std; -using torch::autograd::Function; -using torch::autograd::AutogradContext; -using tensor_list = std::vector; - std::tuple npu_scatter_max( const at::Tensor& updates, const at::Tensor& indices, @@ -21,7 +13,7 @@ std::tuple npu_scatter_max( 
sizes[0] = indices.max().item().toLong() + 1; at::Tensor result = out.value_or(at::zeros(sizes, updates.options().dtype(at::kFloat))); - at::Tensor argmax = at_npu::native::OpPreparation::ApplyTensor(result, result.options().dtype(at::kInt)); + at::Tensor argmax = at::empty(result.sizes(), result.options().dtype(at::kInt)); at_npu::native::OpCommand cmd; cmd.Name("ScatterMaxWithArgmax") @@ -37,7 +29,7 @@ std::tuple npu_scatter_max( at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segment_ids, const at::Tensor& num_segments) { - c10::SmallVector output_size; + c10::SmallVector output_size; auto num_segments_value = num_segments.item().toLong(); output_size.push_back(num_segments_value); @@ -47,7 +39,7 @@ at::Tensor npu_scatter_max_backward(const at::Tensor& x, const at::Tensor& segme copy(x_sizes.begin() + segment_ids_dims, x_sizes.end(), std::back_inserter(output_size)); - at::Tensor out = at_npu::native::OpPreparation::ApplyTensor(x, output_size); + at::Tensor out = at::empty(output_size, x.options()); at_npu::native::OpCommand cmd; cmd.Name("UnsortedSegmentSum") .Input(x) diff --git a/ads/common/ops/csrc/ScatterV1KernelNpu.cpp b/ads/common/ops/csrc/ScatterV1KernelNpu.cpp index 155ea3839ebca5494976551861a731ac0465e7cc..f96d460819aecdbf0acee953604fe5766d300c88 100644 --- a/ads/common/ops/csrc/ScatterV1KernelNpu.cpp +++ b/ads/common/ops/csrc/ScatterV1KernelNpu.cpp @@ -15,12 +15,8 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; at::Tensor npu_scatter(const at::Tensor &self, const at::Tensor &indices, const at::Tensor &updates, int64_t dim) { diff --git a/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp b/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp index 5fb1139c5ff1d398c63bc6470e4dbf543f062f19..95f4c3ffaad06f6b3db554cd73ff4777d1216300 100644 --- a/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp +++ b/ads/common/ops/csrc/SignBitsPackKernelNpu.cpp @@ -15,12 +15,8 @@ // limitations under the License. #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" -using npu_preparation = at_npu::native::OpPreparation; at::Tensor npu_sign_bits_pack(const at::Tensor &self, int64_t size) { diff --git a/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp b/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp index e7a35680b65194544d646cb6a584c35d62b510f7..27ae440bed26bf974991218bc8294ac126d2fea1 100644 --- a/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp +++ b/ads/common/ops/csrc/SignBitsUnpackKernelNpu.cpp @@ -15,13 +15,9 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" #include "common.h" -using npu_preparation = at_npu::native::OpPreparation; at::Tensor npu_sign_bits_unpack_compute( const at::Tensor &input, diff --git a/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp b/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp index e936c81945a5fc3268ec6a47dd715b493017d2e2..cc8f95dff0b0072324b875630059a7fbfa6fbf5c 100644 --- a/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp +++ b/ads/common/ops/csrc/SoftmaxCrossEntropyWithLogitsKernelNpu.cpp @@ -14,19 +14,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" #include "common.h" -using npu_preparation = at_npu::native::OpPreparation; -using torch::autograd::AutogradContext; -using torch::autograd::Function; -using tensor_list = std::vector; - namespace { std::tuple softmax_cross_entropy_with_logits_out_nocheck( at::Tensor &result, diff --git a/ads/common/ops/csrc/StrideAddKernelNpu.cpp b/ads/common/ops/csrc/StrideAddKernelNpu.cpp index ebcfbfda634f8341c0f17989c136ee7b78a1f326..47922f62715d095724e7721528a7f085dcd498fd 100644 --- a/ads/common/ops/csrc/StrideAddKernelNpu.cpp +++ b/ads/common/ops/csrc/StrideAddKernelNpu.cpp @@ -14,15 +14,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" #include "common.h" -using npu_preparation = at_npu::native::OpPreparation; namespace { at::Tensor &stride_add_out_npu_nocheck( diff --git a/ads/common/ops/csrc/TransposeKernelNpu.cpp b/ads/common/ops/csrc/TransposeKernelNpu.cpp index ad9d2e972b8d5b789b06a08ec774ef8e10b29c5e..2e8705c2a79c10e15c1cfed71cfdaf2c1ce4d754 100644 --- a/ads/common/ops/csrc/TransposeKernelNpu.cpp +++ b/ads/common/ops/csrc/TransposeKernelNpu.cpp @@ -15,14 +15,9 @@ // limitations under the License. #include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "functions.h" #include "common.h" -using npu_utils = at_npu::native::NpuUtils; - namespace { at::Tensor &npu_transpose_out_nocheck( at::Tensor &result, diff --git a/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp b/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp index f3cd4201329c5624d7f4098f3f505c29375c67e2..df02a325f4cb3fda9ab8bb2b8ad807629be6d7f5 100644 --- a/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp +++ b/ads/common/ops/csrc/YoloBoxesEncodeKernelNpu.cpp @@ -15,14 +15,9 @@ // limitations under the License. 
#include "torch_npu/csrc/framework/OpCommand.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/aten/CustomFunctions.h" #include "functions.h" #include "common.h" -using npu_preparation = at_npu::native::OpPreparation; namespace { inline void yolo_boxes_encode_check( diff --git a/ads/common/ops/csrc/common.cpp b/ads/common/ops/csrc/common.cpp index 8e4b30373a39880632f0526d762f34e493a50896..f6f9cc495143311ed326cf79f3391bd827f5c1fa 100644 --- a/ads/common/ops/csrc/common.cpp +++ b/ads/common/ops/csrc/common.cpp @@ -1,12 +1,9 @@ #include #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "third_party/acl/inc/acl/acl_base.h" #include "common.h" -using npu_utils = at_npu::native::NpuUtils; using CalcuOpUtil = at_npu::native::CalcuOpUtil; #define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \ @@ -192,4 +189,4 @@ bool check_match(const at::Tensor &self) void format_fresh_view(at::Tensor &x, const at::Tensor &y) { x.copy_(y); -} \ No newline at end of file +} diff --git a/ads/common/ops/csrc/common.h b/ads/common/ops/csrc/common.h index 49a7565312288dafc64ac4103909da514b2ddb69..95c2b5a194029b37893ce7b45651ea3cb9d315d2 100644 --- a/ads/common/ops/csrc/common.h +++ b/ads/common/ops/csrc/common.h @@ -1,3 +1,5 @@ +#ifndef __COMMON_H__ +#define __COMMON_H__ #include #include #include @@ -26,4 +28,6 @@ c10::SmallVector convert_array_to_vector(c10::IntArrayRef intArray); c10::SmallVector infersize_stride_add(c10::IntArrayRef shape1_, c10::IntArrayRef shape2_); c10::SmallVector transpose_npu_output_size(const at::Tensor &self, c10::IntArrayRef perm); bool check_match(const at::Tensor &self); -void format_fresh_view(at::Tensor &x, const at::Tensor &y); \ No newline 
at end of file +void format_fresh_view(at::Tensor &x, const at::Tensor &y); + +#endif // __COMMON_H__ diff --git a/ads/common/ops/csrc/functions.h b/ads/common/ops/csrc/functions.h index 243774ab2a4fd992c4bb3676d7f9c30f4d1cc7b1..0afa2edce5b9de5e41969fbd8b7d91855377f509 100644 --- a/ads/common/ops/csrc/functions.h +++ b/ads/common/ops/csrc/functions.h @@ -1,3 +1,19 @@ +// Copyright (c) 2023, Huawei Technologies.All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef __FUNCTIONS_H__ +#define __FUNCTIONS_H__ + #include #include #include @@ -43,3 +59,62 @@ at::Tensor npu_rotary_mul(const at::Tensor &self, const at::Tensor &r1, const at at::Tensor npu_silu(const at::Tensor& self); at::Tensor& npu_silu_(at::Tensor& self); at::Tensor npu_abs(const at::Tensor& self); +at::Tensor npu_fast_gelu_backward(const at::Tensor& grad, const at::Tensor& self); +at::Tensor npu_fast_gelu(const at::Tensor& self); +at::Tensor npu_anchor_response_flags(const at::Tensor& self, at::IntArrayRef featmap_size, at::IntArrayRef stride, int64_t num_base_anchors); +at::Tensor npu_bounding_box_decode( + const at::Tensor& rois, + const at::Tensor& deltas, + double means0, + double means1, + double means2, + double means3, + double stds0, + double stds1, + double stds2, + double stds3, + at::IntArrayRef max_shape, + double wh_ratio_clip); +at::Tensor npu_bounding_box_encode( + const at::Tensor& anchor_box, + const at::Tensor& ground_truth_box, + double means0, + double 
means1, + double means2, + double means3, + double stds0, + double stds1, + double stds2, + double stds3); +std::tuple npu_batch_nms( + const at::Tensor& self, + const at::Tensor& scores, + double score_threshold, + double iou_threshold, + int64_t max_size_per_class, + int64_t max_total_size, + bool change_coordinate_frame, + bool transpose_box); +at::Tensor npu_confusion_transpose( + const at::Tensor& self, + at::IntArrayRef perm, + at::IntArrayRef shape, + bool transpose_first); +at::Tensor npu_confusion_transpose_backward( + const at::Tensor& grad, + at::IntArrayRef perm, + at::IntArrayRef shape, + bool transpose_first); +at::Tensor npu_conv_transpose2d( + const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt, + at::IntArrayRef padding, + at::IntArrayRef output_padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups); +at::Tensor npu_broadcast(const at::Tensor& self, at::IntArrayRef size); +at::Tensor& npu_broadcast_out(const at::Tensor& self, at::IntArrayRef size, at::Tensor& result); + +#endif // __FUNCTIONS_H__ diff --git a/ads/common/ops/csrc/pybind.cpp b/ads/common/ops/csrc/pybind.cpp index 4eb1cf6fcc8093268c3074b93e675921f0027e17..b8ebe3f5250e9add59301ffab14216dcf2b18539 100644 --- a/ads/common/ops/csrc/pybind.cpp +++ b/ads/common/ops/csrc/pybind.cpp @@ -43,4 +43,27 @@ void init_common(pybind11::module &m) m.def("npu_rotary_mul", &npu_rotary_mul); m.def("npu_abs", &npu_abs); + + // npu_fast_gelu + m.def("npu_fast_gelu", &npu_fast_gelu); + m.def("npu_fast_gelu_backward", &npu_fast_gelu_backward); + + // npu_anchor_response_flags + m.def("npu_anchor_response_flags", &npu_anchor_response_flags); + + // npu_bounding_box_decode + m.def("npu_bounding_box_decode", &npu_bounding_box_decode); + + // npu_bounding_box_encode + m.def("npu_bounding_box_encode", &npu_bounding_box_encode); + + // npu_batch_nms + m.def("npu_batch_nms", &npu_batch_nms); + + // npu_confusion_transpose + m.def("npu_confusion_transpose", 
&npu_confusion_transpose);
+    m.def("npu_confusion_transpose_backward", &npu_confusion_transpose_backward);
+
+    // npu_broadcast
+    m.def("npu_broadcast", &npu_broadcast);
 }
diff --git a/ads/common/ops/fast_gelu.py b/ads/common/ops/fast_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..45557513e9ca889004a63c0822065630a8723334
--- /dev/null
+++ b/ads/common/ops/fast_gelu.py
@@ -0,0 +1,23 @@
+import torch
+from torch.autograd import Function
+
+import torch_npu
+import ads_c
+
+
+class FastGeluFunction(Function):
+    """Autograd wrapper pairing ads_c.npu_fast_gelu with its backward kernel."""
+
+    @staticmethod
+    def forward(ctx, self):
+        activated = ads_c.npu_fast_gelu(self)
+        # The backward kernel takes the forward input, so keep it on ctx.
+        ctx.save_for_backward(self)
+        return activated
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (saved_input,) = ctx.saved_tensors
+        return ads_c.npu_fast_gelu_backward(grad_output, saved_input)
+
+fast_gelu = FastGeluFunction.apply
diff --git a/ads/common/ops/npu_anchor_response_flags.py b/ads/common/ops/npu_anchor_response_flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..b75fd77c218def485efdc8cbace1fd1767c917bc
--- /dev/null
+++ b/ads/common/ops/npu_anchor_response_flags.py
@@ -0,0 +1,14 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuAnchorResponseFlagsFunction(Function):
+    """Forward-only autograd wrapper around ads_c.npu_anchor_response_flags."""
+
+    @staticmethod
+    def forward(ctx, self, featmap_size, stride, num_base_anchors):
+        return ads_c.npu_anchor_response_flags(self, featmap_size, stride, num_base_anchors)
+
+npu_anchor_response_flags = NpuAnchorResponseFlagsFunction.apply
diff --git a/ads/common/ops/npu_batch_nms.py b/ads/common/ops/npu_batch_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5b2ef97a154db5d1969757cc28d08d49c07d28
--- /dev/null
+++ b/ads/common/ops/npu_batch_nms.py
@@ -0,0 +1,27 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBatchNmsFunction(Function):
+    """Forward-only autograd wrapper around ads_c.npu_batch_nms."""
+
+    @staticmethod
+    def forward(
+        ctx,
+        self,
+        scores,
+        score_threshold,
+        iou_threshold,
+        max_size_per_class,
+        max_total_size,
+        change_coordinate_frame=False,
+        transpose_box=False):
+        return ads_c.npu_batch_nms(
+            self, scores,
+            score_threshold, iou_threshold,
+            max_size_per_class, max_total_size,
+            change_coordinate_frame, transpose_box)
+
+npu_batch_nms = NpuBatchNmsFunction.apply
diff --git a/ads/common/ops/npu_bounding_box_decode.py b/ads/common/ops/npu_bounding_box_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f16d8e4629f5ead83b4b259b593f7fc1015711c
--- /dev/null
+++ b/ads/common/ops/npu_bounding_box_decode.py
@@ -0,0 +1,20 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBoundingBodDecodeFunction(Function):
+    """Forward-only autograd wrapper around ads_c.npu_bounding_box_decode."""
+
+    @staticmethod
+    def forward(ctx, rois, deltas,
+                means0, means1, means2, means3,
+                stds0, stds1, stds2, stds3,
+                max_shape, wh_ratio_clip):
+        means = (means0, means1, means2, means3)
+        stds = (stds0, stds1, stds2, stds3)
+        return ads_c.npu_bounding_box_decode(
+            rois, deltas, *means, *stds, max_shape, wh_ratio_clip)
+
+npu_bounding_box_decode = NpuBoundingBodDecodeFunction.apply
diff --git a/ads/common/ops/npu_bounding_box_encode.py b/ads/common/ops/npu_bounding_box_encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..756e1fe0b933f955ecc308ac2881faaaeb6f8bb1
--- /dev/null
+++ b/ads/common/ops/npu_bounding_box_encode.py
@@ -0,0 +1,18 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class NpuBoundingBodEncodeFunction(Function):
+    """Forward-only autograd wrapper around ads_c.npu_bounding_box_encode."""
+
+    @staticmethod
+    def forward(ctx, anchor_box, ground_truth_box,
+                means0, means1, means2, means3,
+                stds0, stds1, stds2, stds3):
+        means = (means0, means1, means2, means3)
+        stds = (stds0, stds1, stds2, stds3)
+        return ads_c.npu_bounding_box_encode(anchor_box, ground_truth_box, *means, *stds)
+
+npu_bounding_box_encode = NpuBoundingBodEncodeFunction.apply
\ No newline at end of file
diff --git a/ads/common/ops/npu_broadcast.py
b/ads/common/ops/npu_broadcast.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3371b283a10ca9b87fd6bd2bbe7cc85fdb40fe3
--- /dev/null
+++ b/ads/common/ops/npu_broadcast.py
@@ -0,0 +1,18 @@
+import torch
+from torch.autograd import Function
+import torch_npu
+import ads_c
+
+
+class BroadCastlFunction(Function):
+    """Forward-only autograd wrapper around the ads_c broadcast kernels."""
+
+    @staticmethod
+    def forward(ctx, self, size, out=None):
+        if out is None:
+            return ads_c.npu_broadcast(self, size)
+        # NOTE(review): the pybind.cpp hunk of this patch registers only
+        # "npu_broadcast"; confirm "npu_broadcast_out" is bound before using out.
+        return ads_c.npu_broadcast_out(self, size, out)
+
+npu_broadcast = BroadCastlFunction.apply
\ No newline at end of file
diff --git a/ads/common/ops/npu_confusion_transpose.py b/ads/common/ops/npu_confusion_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..566d19f1538e9e7fb229a549aeb2278f9d140d15
--- /dev/null
+++ b/ads/common/ops/npu_confusion_transpose.py
@@ -0,0 +1,33 @@
+import torch
+from torch.autograd import Function
+from torch.nn import Module
+
+import torch_npu
+import ads_c
+
+
+class NpuConfusionTransposeFunction(Function):
+    """Autograd wrapper pairing ads_c.npu_confusion_transpose with its backward kernel."""
+
+    @staticmethod
+    def forward(ctx, self, perm, shape, transpose_first):
+        out = ads_c.npu_confusion_transpose(self, perm, shape, transpose_first)
+        # save_for_backward() only accepts tensors; perm, the input sizes and
+        # the flag are plain Python values, so they are stored on ctx directly.
+        ctx.perm = perm
+        ctx.self_sizes = tuple(self.shape)
+        ctx.transpose_first = transpose_first
+
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # The backward kernel receives the forward input's sizes and the
+        # negated transpose_first flag.
+        grad_input = ads_c.npu_confusion_transpose_backward(
+            grad_output, ctx.perm, ctx.self_sizes, not ctx.transpose_first)
+
+        # One gradient slot per forward argument; only the tensor input has one.
+        return grad_input, None, None, None
+
+npu_confusion_transpose = NpuConfusionTransposeFunction.apply
diff --git a/ads/common/ops/rotary_mul.py b/ads/common/ops/rotary_mul.py
index bf9c2a9b05288e9e15125585feb97ecbaf5cadd9..5079961bb1c4014740917a7e803b02230bcff962 100644
--- a/ads/common/ops/rotary_mul.py
+++ b/ads/common/ops/rotary_mul.py
@@ -19,4 +19,4 @@ class RotaryMulFunction(Function):
         result = ads_c.npu_rotary_mul_backward(grad_output, input, r1, r2)
         return result
 
-npu_rotary_mul = RotaryMulFunction.apply
\ No newline at end of file +npu_rotary_mul = RotaryMulFunction.apply diff --git a/ads/common/ops/rotated_iou.py b/ads/common/ops/rotated_iou.py index d88d3e9b5c03470a26d0463e39a451b630429c2a..896f001cfb36b75c8bde3da42e567654f7680b69 100644 --- a/ads/common/ops/rotated_iou.py +++ b/ads/common/ops/rotated_iou.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_rotated_iou = ads_c.npu_rotated_iou \ No newline at end of file +npu_rotated_iou = ads_c.npu_rotated_iou diff --git a/ads/common/ops/rotated_overlaps.py b/ads/common/ops/rotated_overlaps.py index 407532351c90419c3f2fec71dda686a015232019..c481fe6fdfed168f130aadd268437bba2d5bd3b9 100644 --- a/ads/common/ops/rotated_overlaps.py +++ b/ads/common/ops/rotated_overlaps.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_rotated_overlaps = ads_c.npu_rotated_overlaps \ No newline at end of file +npu_rotated_overlaps = ads_c.npu_rotated_overlaps diff --git a/ads/common/ops/scatter.py b/ads/common/ops/scatter.py index 7d89109cb8505c46fcb21b1fe9d65beb4a992208..d9e6de8ae0d69bc583e3545aeee27d13c9b39e57 100644 --- a/ads/common/ops/scatter.py +++ b/ads/common/ops/scatter.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_scatter = ads_c.npu_scatter \ No newline at end of file +npu_scatter = ads_c.npu_scatter diff --git a/ads/common/ops/sign_bits_pack.py b/ads/common/ops/sign_bits_pack.py index c09d486ae571edf6498514e28bdcf9b1db543877..7b1e0040dffb3bcd6ccaa7f9a00ab05231a5cec9 100644 --- a/ads/common/ops/sign_bits_pack.py +++ b/ads/common/ops/sign_bits_pack.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_sign_bits_pack = ads_c.npu_sign_bits_pack \ No newline at end of file +npu_sign_bits_pack = ads_c.npu_sign_bits_pack diff --git a/ads/common/ops/sign_bits_unpack.py b/ads/common/ops/sign_bits_unpack.py index efa1a2dd409f168ce850bf79746f2b8986469401..ed374e17f3b22b513ebc88ed2cf141f36940671b 100644 --- a/ads/common/ops/sign_bits_unpack.py +++ 
b/ads/common/ops/sign_bits_unpack.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_sign_bits_unpack = ads_c.npu_sign_bits_unpack \ No newline at end of file +npu_sign_bits_unpack = ads_c.npu_sign_bits_unpack diff --git a/ads/common/ops/silu.py b/ads/common/ops/silu.py index 8ca866dbc07e36c05f0fab507935947c95fe4304..bd4251b0079fe993e8cb3e9554971fac0f5d72a8 100644 --- a/ads/common/ops/silu.py +++ b/ads/common/ops/silu.py @@ -13,7 +13,7 @@ class SiluFunction(Function): result = func(input) ctx.save_for_backward(input, result) return result - + @staticmethod def backward(ctx, grad_outputs): x0, x1 = ctx.saved_tensors @@ -22,4 +22,4 @@ class SiluFunction(Function): npu_silu = SiluFunction.apply -npu_silu_ = ads_c.npu_silu_ \ No newline at end of file +npu_silu_ = ads_c.npu_silu_ diff --git a/ads/common/ops/softmax_cross_entropy_with_logits.py b/ads/common/ops/softmax_cross_entropy_with_logits.py index f09d2a3e74b77d500d5ad579de32e0996893127a..cd12c5dd58582adce1960c0a0a353a1483b0c706 100644 --- a/ads/common/ops/softmax_cross_entropy_with_logits.py +++ b/ads/common/ops/softmax_cross_entropy_with_logits.py @@ -20,4 +20,4 @@ class SoftMaxFunction(Function): result = ads_c.npu_softmax_cross_entropy_with_logits_backward(grad_output, feature, labels) return result -npu_softmax_cross_entropy_with_logits = SoftMaxFunction.apply \ No newline at end of file +npu_softmax_cross_entropy_with_logits = SoftMaxFunction.apply diff --git a/ads/common/ops/stride_add.py b/ads/common/ops/stride_add.py index 24a3946b2741cd6e47b3ee4c834e248f54fe5fc9..586a83c35ae6c915826f953b81cdad5da543f62d 100644 --- a/ads/common/ops/stride_add.py +++ b/ads/common/ops/stride_add.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_stride_add = ads_c.npu_stride_add \ No newline at end of file +npu_stride_add = ads_c.npu_stride_add diff --git a/ads/common/ops/transpose.py b/ads/common/ops/transpose.py index 
149722993e3ca9946f1ae43d8f766715418b077d..a27dca7dd62c026c4725954074e1511d5ee75f52 100644 --- a/ads/common/ops/transpose.py +++ b/ads/common/ops/transpose.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_transpose = ads_c.npu_transpose \ No newline at end of file +npu_transpose = ads_c.npu_transpose diff --git a/ads/common/ops/yolo_boxes_encode.py b/ads/common/ops/yolo_boxes_encode.py index 585adb586d2d5dd63dbe2734d7995d50a7c6387f..cb915a0f586b86dbde0bccfe48715ef625b0fb5c 100644 --- a/ads/common/ops/yolo_boxes_encode.py +++ b/ads/common/ops/yolo_boxes_encode.py @@ -2,4 +2,4 @@ import torch import torch_npu import ads_c -npu_yolo_boxes_encode = ads_c.npu_yolo_boxes_encode \ No newline at end of file +npu_yolo_boxes_encode = ads_c.npu_yolo_boxes_encode diff --git a/tests/test_batch_nms.py b/tests/test_batch_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..11c3245c510a1be5d72ccbbbd6391cdde795d30f --- /dev/null +++ b/tests/test_batch_nms.py @@ -0,0 +1,44 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch
+import torch_npu
+
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TesBatchNms(TestCase):
+    """Checks the class ids returned by ads.common.npu_batch_nms."""
+
+    def test_batch_nms_shape_format(self):
+        """Run the op on float32 and float16 inputs; every class id must be zero."""
+        # Thresholds and size limits shared by both calls.
+        nms_args = (0.3, 0.5, 4, 4)
+        boxes_fp32 = torch.randn(8, 4, 1, 4).npu()
+        scores_fp32 = torch.randn(8, 4, 1).npu()
+        boxes_half = boxes_fp32.half()
+        scores_half = scores_fp32.half()
+        _, _, classes_fp32, _ = ads.common.npu_batch_nms(boxes_fp32, scores_fp32, *nms_args)
+        _, _, classes_half, _ = ads.common.npu_batch_nms(boxes_half, scores_half, *nms_args)
+
+        # Expected class ids: an 8 x 4 float32 tensor of zeros.
+        expected_classes = torch.zeros(8, 4, dtype=torch.float32)
+        self.assertRtolEqual(expected_classes, classes_fp32.cpu())
+        self.assertRtolEqual(expected_classes.half(), classes_half.cpu())
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_fast_gelu.py b/tests/test_fast_gelu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81b5e92ae5fc72463ef2e2aeda41879ed9c1055
--- /dev/null
+++ b/tests/test_fast_gelu.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2023 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestFastGelu(TestCase):
+    """Compares ads.common.fast_gelu with an expression built from torch ops."""
+
+    def supported_op_exec(self, input1):
+        """Reference value assembled from elementwise torch operations."""
+        coeff = 1.702
+        magnitude = torch.abs(input1)
+        gate = torch.exp((coeff / 2 * input1) * (input1 - magnitude))
+        scale = 1.0 + torch.exp(- coeff * magnitude)
+        return (input1 * gate / scale).cpu().detach()
+
+    def custom_op_exec(self, input1):
+        """Value produced by the op under test."""
+        return ads.common.fast_gelu(input1).cpu().detach()
+
+    def test_fast_gelu(self, device="npu"):
+        """The op output must match the torch-op reference."""
+        tensor_spec = [np.float32, 0, [3, 16, 32]]
+        _, npu_input = create_common_tensor(tensor_spec, 0, 100)
+
+        expected = self.supported_op_exec(npu_input)
+        actual = self.custom_op_exec(npu_input)
+        self.assertRtolEqual(expected, actual)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_fast_gelu_backward.py b/tests/test_fast_gelu_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c7920ef1bc3d7b4aaf9f1004db348e6f66980bf
--- /dev/null
+++ b/tests/test_fast_gelu_backward.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestFastGelu(TestCase):
+    """Checks fast_gelu's output and input gradient against recorded values."""
+
+    def npu_op_exec(self, input1):
+        """Return (input gradient, forward output) as numpy arrays."""
+        input1.requires_grad = True
+        result = ads.common.fast_gelu(input1)
+        result.backward(torch.ones_like(result))
+        grad_np = input1.grad.to("cpu").detach().numpy()
+        return grad_np, result.cpu().detach().numpy()
+
+    def test_fastgelu(self, device="npu"):
+        npu_input = torch.tensor([1., 2., 3., 4.]).npu()
+        expected_grad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
+        expected_out = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        actual_grad, actual_out = self.npu_op_exec(npu_input)
+        self.assertRtolEqual(expected_grad.numpy(), actual_grad)
+        self.assertRtolEqual(expected_out.numpy(), actual_out)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_anchor_response_flags.py b/tests/test_npu_anchor_response_flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..b656ee94af4f88995a9aaf7b6a454a9439ed6ec0
--- /dev/null
+++ b/tests/test_npu_anchor_response_flags.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+from torch_npu.testing.common_utils import create_common_tensor
+import ads.common
+
+
+class TestNpuAnchorResponseFlags(TestCase):
+    """Compares npu_anchor_response_flags with a reference built from torch ops."""
+
+    def custom_op_exec(self, gt_bboxes, featmap_size, strides, num_base_anchors):
+        """Reference flags: mark the grid cell holding each box centre."""
+        if gt_bboxes.dtype == torch.float16:
+            gt_bboxes = gt_bboxes.to(torch.float32)
+        feat_h, feat_w = featmap_size
+        centre_x = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5
+        centre_y = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5
+        cell_x = torch.floor(centre_x / strides[0]).int()
+        cell_y = torch.floor(centre_y / strides[1]).int()
+        cell_index = (cell_y * feat_w + cell_x).long()
+        flags = torch.zeros(feat_h * feat_w, dtype=torch.uint8).npu()
+        flags[cell_index] = 1
+        flags = flags[:, None].expand(flags.size(0), num_base_anchors).contiguous().view(-1)
+        return flags.cpu().numpy()
+
+    def npu_op_exec(self, input_npu, featmap_size, strides, num_base_anchors):
+        flags = ads.common.npu_anchor_response_flags(input_npu, featmap_size, strides, num_base_anchors)
+        return flags.cpu().numpy()
+
+    def test_npu_anchor_response_flags(self):
+        cases = [
+            [[np.float32, -1, [100, 4]], [60, 60], [2, 2], 9],
+            [[np.float16, -1, [200, 4]], [10, 10], [32, 32], 3],
+            [[np.float16, -1, [500, 4]], [32, 32], [16, 16], 5]
+        ]
+        for tensor_spec, *op_args in cases:
+            _, npu_input = create_common_tensor(tensor_spec, 0, 100)
+            expected = self.custom_op_exec(npu_input, *op_args)
+            actual = self.npu_op_exec(npu_input, *op_args)
+            self.assertRtolEqual(expected, actual)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_bounding_box_decode.py b/tests/test_npu_bounding_box_decode.py
new file mode 100644
index
0000000000000000000000000000000000000000..248fe36c8b756faa91610bbb3e0190c99a1d19f4
--- /dev/null
+++ b/tests/test_npu_bounding_box_decode.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestBoundingBoxDecode(TestCase):
+    """Compares ads.common.npu_bounding_box_decode with a torch-op reference."""
+
+    def npu_bounding_box_decode(self, rois, deltas, means0, means1, means2, means3,
+                                stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Reference decode: de-normalise deltas, apply them to rois, clamp to max_shape."""
+        means = [means0, means1, means2, means3]
+        stds = [stds0, stds1, stds2, stds3]
+        means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
+        stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
+        denorm_deltas = deltas * stds + means
+
+        dx = denorm_deltas[:, 0::4]
+        dy = denorm_deltas[:, 1::4]
+        dw = denorm_deltas[:, 2::4]
+        dh = denorm_deltas[:, 3::4]
+        max_ratio = torch.abs(torch.log(torch.tensor(wh_ratio_clip)))
+
+        dw = torch.clamp(dw, min=-max_ratio, max=max_ratio)
+        dh = torch.clamp(dh, min=-max_ratio, max=max_ratio)
+
+        ax = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
+        ay = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
+        # NOTE(review): 0.5 scales only rois[:, 0] / rois[:, 1] below, not the
+        # difference; confirm this is the width/height the kernel uses.
+        aw = (rois[:, 2] - rois[:, 0] * 0.5).unsqueeze(1).expand_as(dw)
+        ah = (rois[:, 3] - rois[:, 1] * 0.5).unsqueeze(1).expand_as(dh)
+
+        pw = aw * dw.exp()
+        ph = ah * dh.exp()
+        # Keyword form of addcmul; the positional-scalar overload is deprecated.
+        px = torch.addcmul(ax, aw, dx, value=1)
+        py = torch.addcmul(ay, ah, dy, value=1)
+
+        x1 = px - pw * 0.5 + 0.5
+        y1 = py - ph * 0.5 + 0.5
+        x2 = px + pw * 0.5 - 0.5
+        y2 = py + ph * 0.5 - 0.5
+
+        if max_shape is not None:
+            x1 = torch.clamp(x1, min=0, max=(max_shape[1] - 1))
+            y1 = torch.clamp(y1, min=0, max=(max_shape[0] - 1))
+            x2 = torch.clamp(x2, min=0, max=(max_shape[1] - 1))
+            y2 = torch.clamp(y2, min=0, max=(max_shape[0] - 1))
+        boxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
+        return boxes
+
+    def custom_op_exec(self, rois, deltas, means0, means1, means2, means3,
+                       stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Reference result as a numpy array."""
+        output = self.npu_bounding_box_decode(rois, deltas, means0, means1,
+                                              means2, means3, stds0, stds1,
+                                              stds2, stds3, max_shape, wh_ratio_clip)
+        return output.to("cpu").numpy()
+
+    def npu_op_exec(self, rois, deltas, means0, means1, means2, means3,
+                    stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip):
+        """Result of the op under test as a numpy array."""
+        output = ads.common.npu_bounding_box_decode(rois, deltas, means0, means1,
+                                                    means2, means3, stds0, stds1,
+                                                    stds2, stds3, max_shape, wh_ratio_clip)
+        return output.to("cpu").numpy()
+
+    def test_decode_shape_format_fp32(self):
+        input1 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                              dtype=torch.float32).to("npu")
+        input2 = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                              dtype=torch.float32).to("npu")
+
+        npu_output = self.npu_op_exec(input1, input2, 0, 0, 0, 0,
+                                      1, 1, 1, 1, (10, 10), 0.1)
+        custom_output = self.custom_op_exec(input1, input2, 0, 0, 0, 0,
+                                            1, 1, 1, 1, (10, 10), 0.1)
+        self.assertRtolEqual(npu_output, custom_output)
+
+    def test_decode_shape_format_fp16(self):
+        input1_fp16 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                                   dtype=torch.float16).to("npu")
+        input2_fp16 = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                                   dtype=torch.float16).to("npu")
+
+        npu_output = self.npu_op_exec(input1_fp16, input2_fp16, 0, 0, 0, 0,
+                                      1, 1, 1, 1, (10, 10), 0.1)
+        custom_output = self.custom_op_exec(input1_fp16, input2_fp16, 0, 0, 0, 0,
+                                            1, 1, 1, 1, (10, 10), 0.1)
+        self.assertRtolEqual(npu_output, custom_output)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_bounding_box_encode.py b/tests/test_npu_bounding_box_encode.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6a406b68c18c2d414717402c75833542901117b
--- /dev/null
+++ b/tests/test_npu_bounding_box_encode.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import ads.common
+
+
+class TestBoundingBoxEncode(TestCase):
+    """Compares ads.common.npu_bounding_box_encode with a torch-op reference."""
+
+    def npu_bounding_box_encode(self, anchor_box, ground_truth_box, means0, means1,
+                                means2, means3, stds0, stds1, stds2, stds3):
+        """Reference encode: normalised centre offsets and log size ratios."""
+        # Small constant added to the anchor sizes before dividing.
+        eps = 1e-7
+        px = (anchor_box[..., 0] + anchor_box[..., 2]) * 0.5
+        py = (anchor_box[..., 1] + anchor_box[..., 3]) * 0.5
+        pw = anchor_box[..., 2] - anchor_box[..., 0] + 1.0
+        ph = anchor_box[..., 3] - anchor_box[..., 1] + 1.0
+
+        gx = (ground_truth_box[..., 0] + ground_truth_box[..., 2]) * 0.5
+        gy = (ground_truth_box[..., 1] + ground_truth_box[..., 3]) * 0.5
+        gw = ground_truth_box[..., 2] - ground_truth_box[..., 0] + 1.0
+        gh = ground_truth_box[..., 3] - ground_truth_box[..., 1] + 1.0
+
+        dx = (gx - px) / (pw + eps)
+        dy = (gy - py) / (ph + eps)
+        dw = torch.log(torch.abs(gw) / torch.abs(pw + eps))
+        dh = torch.log(torch.abs(gh) / torch.abs(ph + eps))
+        deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+        means = deltas.new_tensor([means0, means1, means2, means3]).unsqueeze(0)
+        stds = deltas.new_tensor([stds0, stds1, stds2, stds3]).unsqueeze(0)
+        return deltas.sub_(means).div_(stds)
+
+    def custom_op_exec(self, anchor_box, ground_truth_box, means0, means1,
+                       means2, means3, stds0, stds1, stds2, stds3):
+        """Reference result as a numpy array."""
+        encoded = self.npu_bounding_box_encode(anchor_box, ground_truth_box, means0, means1,
+                                               means2, means3, stds0, stds1, stds2, stds3)
+        return encoded.to("cpu").numpy()
+
+    def npu_op_exec(self, anchor_box, ground_truth_box, means0, means1,
+                    means2, means3, stds0, stds1, stds2, stds3):
+        """Result of the op under test as a numpy array."""
+        encoded = ads.common.npu_bounding_box_encode(anchor_box, ground_truth_box, means0, means1,
+                                                     means2, means3, stds0, stds1, stds2, stds3)
+        return encoded.to("cpu").numpy()
+
+    def test_encode_shape_format_fp32(self):
+        """float32 boxes: the op output must match the reference."""
+        anchors = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                               dtype=torch.float32).to("npu")
+        gt_boxes = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                                dtype=torch.float32).to("npu")
+
+        actual = self.npu_op_exec(anchors, gt_boxes, 0, 0, 0, 0,
+                                  0.1, 0.1, 0.2, 0.2)
+        expected = self.custom_op_exec(anchors, gt_boxes, 0, 0, 0, 0,
+                                       0.1, 0.1, 0.2, 0.2)
+        self.assertRtolEqual(actual, expected, 1e-3)
+
+    def test_encode_shape_format_fp16(self):
+        """float16 boxes: the op output must match the reference."""
+        anchors_fp16 = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]],
+                                    dtype=torch.float16).to("npu")
+        gt_boxes_fp16 = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]],
+                                     dtype=torch.float16).to("npu")
+
+        actual = self.npu_op_exec(anchors_fp16, gt_boxes_fp16, 0, 0, 0, 0,
+                                  0.1, 0.1, 0.2, 0.2)
+        expected = self.custom_op_exec(anchors_fp16, gt_boxes_fp16, 0, 0, 0, 0,
+                                       0.1, 0.1, 0.2, 0.2)
+        self.assertRtolEqual(actual, expected, 1e-3)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tests/test_npu_broadcast.py b/tests/test_npu_broadcast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44badc1f522c195cdf505c166e9600c55f79dc4
--- /dev/null
+++ b/tests/test_npu_broadcast.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
class TestNpuBroadcast(TestCase):
    """Check ``ads.common.npu_broadcast`` against ``torch.broadcast_to``."""

    def custom_op_exec(self, input1, shape):
        """Broadcast with ``torch.broadcast_to`` and return a NumPy array."""
        return torch.broadcast_to(input1, shape).to("cpu").numpy()

    def npu_op_exec(self, input1, size):
        """Broadcast with ``ads.common.npu_broadcast`` and return a NumPy array."""
        return ads.common.npu_broadcast(input1, size).to("cpu").numpy()

    def test_npu_broadcast(self):
        """A 3-element vector and a 3x1 column both broadcast to (3, 3)."""
        flat_vector = torch.tensor([1, 2, 3]).npu()
        column_vector = torch.tensor([[1], [2], [3]]).npu()
        for sample in (flat_vector, column_vector):
            custom_output = self.custom_op_exec(sample, (3, 3))
            npu_output = self.npu_op_exec(sample, (3, 3))
            self.assertRtolEqual(custom_output, npu_output)