diff --git a/patch/npu.patch b/patch/npu.patch index 5c7fa69d23b0705a6b5dda56e75ef804a3ff2321..a46a27034ab9d7433a4e60c9d68d86898a340dab 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,6 +1,6 @@ -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt --- pytorch-v1.5.0/aten/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/CMakeLists.txt 2021-06-25 16:37:35.486258833 +0800 ++++ pytorch-develop/aten/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 @@ -22,8 +22,10 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -49,9 +49,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-06-25 16:37:35.486258833 +0800 ++++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 @@ -67,6 +67,9 @@ FILE(GLOB native_quantized_h "native/quantized/*.h" 
"native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") @@ -127,9 +127,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-06-25 16:37:35.494258894 +0800 ++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-05 14:59:26.424336365 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -168,9 +168,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } catchallKernel_ = std::move(kernel); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-06-25 16:37:35.502258955 +0800 ++++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-05 14:59:26.432336426 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -353,9 +353,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for declaration in declarations: for option in declaration['options']: -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py --- pytorch-v1.5.0/aten/src/ATen/gen.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/gen.py 2021-06-25 16:37:35.502258955 +0800 ++++ pytorch-develop/aten/src/ATen/gen.py 2021-07-05 14:59:26.432336426 +0800 @@ -1,3 +1,18 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -511,9 +511,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_file_manager.write_outputs(options.output_dependencies + "-npu") else: generate_outputs() -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-06-25 16:37:35.514259047 +0800 ++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-05 14:59:26.444336518 +0800 @@ -339,20 +339,20 @@ void hardsigmoid_backward_kernel(TensorIterator& iter) { @@ -539,9 +539,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return Vec::blendv(kZeroVec, grad_val * kOneSixthVec, gradNonZeroMask); }); }); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-06-25 16:37:35.506258986 +0800 ++++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-05 14:59:26.440336488 +0800 @@ -1,3 +1,19 @@ +// 
Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -594,9 +594,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto storage = Storage( self.dtype(), detail::computeStorageSize(self.sizes(), self.strides()), -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-06-25 16:37:35.526259138 +0800 ++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-05 14:59:26.460336640 +0800 @@ -1,6 +1,5 @@ # See README.md in this directory for more guidance @@ -998,7 +998,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function -@@ -503,13 +598,17 @@ +@@ -503,6 +598,8 @@ CPU: bernoulli_tensor_cpu_ CUDA: bernoulli_tensor_cuda_ supports_named_tensor: True @@ -1007,11 +1007,10 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) 
variants: method - dispatch: +@@ -510,6 +607,8 @@ CPU: bernoulli_scalar_cpu_ CUDA: bernoulli_scalar_cuda_ -- supports_named_tensor: True -+ supports_named_tensor: True + supports_named_tensor: True + npu_dispatch: + NPU: bernoulli_npu_ @@ -1394,7 +1393,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: cosh_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -825,13 +1030,17 @@ +@@ -825,12 +1030,16 @@ dispatch: CPU: _cosh__cpu CUDA: _cosh__cuda @@ -1406,13 +1405,11 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= dispatch: CPU: _cosh_out_cpu CUDA: _cosh_out_cuda -- + npu_dispatch: + NPU: cosh_out_npu -+ + - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full - @@ -897,6 +1106,50 @@ dispatch: CUDA: cudnn_convolution_transpose_backward_weight @@ -1514,7 +1511,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor supports_named_tensor: True -@@ -976,25 +1245,33 @@ +@@ -976,20 +1245,28 @@ supports_named_tensor: True - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor @@ -1544,12 +1541,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: det(Tensor self) -> Tensor use_c10_dispatcher: full - variants: function, method -- -+ - - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor - use_c10_dispatcher: full - variants: function, method @@ -1013,6 +1290,8 @@ - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) 
@@ -1601,7 +1592,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dot(Tensor self, Tensor tensor) -> Tensor use_c10_dispatcher: full -@@ -1057,30 +1346,42 @@ +@@ -1057,29 +1346,41 @@ dispatch: CPU: legacy::cpu::_th_dot CUDA: legacy::cuda::_th_dot @@ -1638,13 +1629,11 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ -- + npu_dispatch: + NPU: embedding_renorm_npu_ -+ + - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor use_c10_dispatcher: full - @@ -1099,6 +1400,8 @@ dispatch: CPU: _embedding_bag_cpu @@ -1835,7 +1824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= SparseCPU: floor_divide_sparse SparseCUDA: floor_divide_sparse supports_named_tensor: True -+ npu_dispatch: ++ npu_dispatch: + NPU: floor_divide_npu - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) @@ -1844,7 +1833,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= SparseCPU: floor_divide_sparse_ SparseCUDA: floor_divide_sparse_ supports_named_tensor: True -+ npu_dispatch: ++ npu_dispatch: + NPU: floor_divide_npu_ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1853,13 +1842,13 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= SparseCPU: floor_divide_out_sparse_zerodim SparseCUDA: floor_divide_out_sparse_zerodim supports_named_tensor: True -+ npu_dispatch: ++ npu_dispatch: + NPU: floor_divide_out_npu - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor variants: function, method supports_named_tensor: True -+ npu_dispatch: ++ npu_dispatch: + NPU: floor_divide_npu - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
@@ -1915,7 +1904,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor use_c10_dispatcher: full -@@ -1390,32 +1768,53 @@ +@@ -1390,23 +1768,39 @@ dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda @@ -1942,23 +1931,20 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + NPU: hamming_window_npu - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor -- + npu_dispatch: + NPU: hamming_window_npu -+ + - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor -- + npu_dispatch: + NPU: hamming_window_npu -+ + - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor -- + npu_dispatch: + NPU: hamming_window_npu -+ + - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full - +@@ -1414,8 +1808,13 @@ - func: ger(Tensor self, Tensor vec2) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2072,7 +2058,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda -+ npu_dispatch: ++ npu_dispatch: + NPU: kthvalue_out_npu - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -2083,7 +2069,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) 
values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) supports_named_tensor: True -+ npu_dispatch: ++ npu_dispatch: + NPU: kthvalue_out_npu - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor @@ -2104,7 +2090,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor python_module: nn -@@ -1622,26 +2055,36 @@ +@@ -1622,46 +2055,64 @@ use_c10_dispatcher: full - func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2141,7 +2127,26 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log10(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -1662,6 +2105,8 @@ + supports_named_tensor: True + variants: function, method ++ npu_dispatch: ++ NPU: log10_npu + + - func: log10_(Tensor(a!) self) -> Tensor(a!) + supports_named_tensor: True + variants: function, method ++ npu_dispatch: ++ NPU: log10_npu_ + + - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + supports_named_tensor: True + dispatch: + CPU: log10_out + CUDA: log10_out ++ npu_dispatch: ++ NPU: log10_out_npu + + - func: log1p(Tensor self) -> Tensor use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2150,7 +2155,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log1p_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -1671,6 +2116,8 @@ +@@ -1671,6 +2122,8 @@ CUDA: log1p_ SparseCPU: log1p_sparse_ SparseCUDA: log1p_sparse_ @@ -2159,7 +2164,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -1679,67 +2126,95 @@ +@@ -1679,67 +2132,95 @@ CUDA: log1p_out SparseCPU: log1p_out_sparse SparseCUDA: log1p_out_sparse @@ -2255,7 +2260,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full -@@ -1748,9 +2223,13 @@ +@@ -1748,9 +2229,13 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -2269,7 +2274,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor use_c10_dispatcher: full -@@ -1761,26 +2240,40 @@ +@@ -1761,26 +2246,40 @@ - func: matrix_power(Tensor self, int n) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2310,7 +2315,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -1791,6 +2284,8 @@ +@@ -1791,6 +2290,8 @@ - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor supports_named_tensor: True @@ -2319,7 +2324,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor requires_tensor: True -@@ -1801,6 +2296,8 @@ +@@ -1801,6 +2302,8 @@ requires_tensor: True dispatch: QuantizedCPU: quantized_max_pool2d @@ -2328,7 +2333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] 
dilation=1, bool ceil_mode=False) -> Tensor supports_named_tensor: True -@@ -1814,6 +2311,8 @@ +@@ -1814,6 +2317,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2337,7 +2342,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method -@@ -1822,6 +2321,8 @@ +@@ -1822,6 +2327,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2346,7 +2351,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -1829,47 +2330,73 @@ +@@ -1829,47 +2336,73 @@ CPU: mean_out_cpu_gpu CUDA: mean_out_cpu_gpu QuantizedCPU: quantized_mean_out_cpu @@ -2420,7 +2425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor -@@ -1958,6 +2485,8 @@ +@@ -1958,6 +2491,8 @@ CUDA: legacy::cuda::_th_mm SparseCPU: _sparse_mm SparseCUDA: _sparse_mm @@ -2429,7 +2434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
-@@ -1966,6 +2495,8 @@ +@@ -1966,6 +2501,8 @@ CUDA: legacy::cuda::_th_mm_out SparseCPU: _sparse_mm_out SparseCUDA: _sparse_mm_out @@ -2438,7 +2443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor -@@ -1974,9 +2505,13 @@ +@@ -1974,9 +2511,13 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) supports_named_tensor: True variants: function, method @@ -2452,7 +2457,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method -@@ -1994,6 +2529,8 @@ +@@ -1994,6 +2535,8 @@ SparseCPU: mul_sparse SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul @@ -2461,7 +2466,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2004,6 +2541,8 @@ +@@ -2004,6 +2547,8 @@ SparseCPU: mul_sparse_ SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ @@ -2470,7 +2475,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
-@@ -2013,15 +2552,21 @@ +@@ -2013,15 +2558,21 @@ SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -2492,7 +2497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full -@@ -2030,12 +2575,16 @@ +@@ -2030,12 +2581,16 @@ CPU: mv_cpu CUDA: legacy::cuda::_th_mv supports_named_tensor: True @@ -2509,7 +2514,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mvlgamma(Tensor self, int p) -> Tensor use_c10_dispatcher: full -@@ -2052,6 +2601,8 @@ +@@ -2052,6 +2607,8 @@ CUDA: narrow_copy_dense SparseCPU: narrow_copy_sparse SparseCUDA: narrow_copy_sparse @@ -2518,7 +2523,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) variants: function, method -@@ -2068,6 +2619,8 @@ +@@ -2068,6 +2625,8 @@ CPU: batch_norm_cpu CUDA: batch_norm_cuda MkldnnCPU: mkldnn_batch_norm @@ -2527,7 +2532,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: -@@ -2098,6 +2651,8 @@ +@@ -2098,6 +2657,8 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda @@ -2536,7 +2541,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: -@@ -2117,6 +2672,8 @@ +@@ -2117,6 +2678,8 @@ - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? 
bias, int[2] padding, int[2] stride=1) -> Tensor variants: function @@ -2545,7 +2550,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) variants: function -@@ -2129,42 +2686,60 @@ +@@ -2129,42 +2692,60 @@ - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -2591,10 +2596,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full -- + npu_dispatch: + NPU: _pdist_forward_npu -+ + - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full @@ -2602,33 +2606,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- func: cosine_similarity(Tensor input, Tensor input2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full variants: function -- -+ + - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) - variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. 
-@@ -2178,7 +2753,7 @@ - - - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor - use_c10_dispatcher: full -- -+ - - func: is_pinned(Tensor self) -> bool - use_c10_dispatcher: full - variants: method -@@ -2195,7 +2770,7 @@ - - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor - use_c10_dispatcher: full - variants: function -- -+ - - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - - - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor -@@ -2253,54 +2828,82 @@ +@@ -2253,54 +2834,82 @@ supports_named_tensor: True - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2712,7 +2697,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor use_c10_dispatcher: full -@@ -2316,6 +2919,8 @@ +@@ -2316,6 +2925,8 @@ - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2721,7 +2706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reshape(Tensor self, int[] shape) -> Tensor variants: function, method -@@ -2337,16 +2942,22 @@ +@@ -2337,16 +2948,22 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2744,7 +2729,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? 
generator=None) -> Tensor -@@ -2360,6 +2971,8 @@ +@@ -2360,6 +2977,8 @@ CUDA: relu MkldnnCPU: mkldnn_relu QuantizedCPU: quantized_relu @@ -2753,7 +2738,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: relu_(Tensor(a!) self) -> Tensor(a!) -@@ -2370,6 +2983,8 @@ +@@ -2370,6 +2989,8 @@ CUDA: relu_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: quantized_relu_ @@ -2762,7 +2747,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -2377,12 +2992,16 @@ +@@ -2377,12 +2998,16 @@ dispatch: CPU: prelu_cpu CUDA: prelu_cuda @@ -2779,17 +2764,16 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2390,36 +3009,50 @@ +@@ -2390,6 +3015,8 @@ dispatch: CPU: gelu_cpu CUDA: gelu_cuda -- + npu_dispatch: + NPU: gelu_npu -+ + - func: gelu_backward(Tensor grad, Tensor self) -> Tensor use_c10_dispatcher: full - python_module: nn +@@ -2397,29 +3024,41 @@ dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda @@ -2831,7 +2815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method -@@ -2433,15 +3066,18 @@ +@@ -2433,14 +3072,21 @@ - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2844,14 +2828,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor use_c10_dispatcher: full ++ npu_dispatch: ++ NPU: celu_npu - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) 
- - ++ npu_dispatch: ++ NPU: celu_npu_ + - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full - supports_named_tensor: True -@@ -2451,6 +3087,8 @@ +@@ -2451,6 +3097,8 @@ CUDA: sigmoid QuantizedCPU: quantized_sigmoid MkldnnCPU: mkldnn_sigmoid @@ -2860,7 +2847,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2459,36 +3097,52 @@ +@@ -2459,36 +3107,52 @@ CPU: sigmoid_ CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ @@ -2913,7 +2900,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. -@@ -2533,6 +3187,8 @@ +@@ -2533,6 +3197,8 @@ - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) variants: function, method @@ -2922,7 +2909,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: smm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full -@@ -2542,10 +3198,14 @@ +@@ -2542,10 +3208,14 @@ - func: softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2937,7 +2924,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor use_c10_dispatcher: full -@@ -2553,12 +3213,16 @@ +@@ -2553,12 +3223,16 @@ CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax @@ -2954,7 +2941,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] variants: function, method -@@ -2609,8 +3273,12 @@ +@@ -2609,8 +3283,12 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor @@ -2967,7 +2954,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at -@@ -2633,20 +3301,30 @@ +@@ -2633,20 +3311,30 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2998,7 +2985,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sum_to_size(Tensor self, int[] size) -> Tensor variants: method -@@ -2656,13 +3334,19 @@ +@@ -2656,13 +3344,19 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3018,7 +3005,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: square(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2677,51 +3361,81 @@ +@@ -2677,51 +3371,81 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3081,19 +3068,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -- + npu_dispatch: + NPU: prod_out_npu + #NPU: prod_out_npu_ext -+ + - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True -- + npu_dispatch: + NPU: prod_npu + #NPU: prod_npu_ext -+ + - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True - @@ -3103,7 +3088,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: t(Tensor(a) self) -> Tensor(a) device_guard: False -@@ -2736,6 +3450,8 @@ +@@ -2736,6 +3460,8 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3112,7 +3097,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tan_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2743,12 +3459,16 @@ +@@ -2743,12 +3469,16 @@ dispatch: CPU: _tan__cpu CUDA: _tan__cuda @@ -3129,7 +3114,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2758,6 +3478,8 @@ +@@ -2758,6 +3488,8 @@ CPU: tanh CUDA: tanh QuantizedCPU: quantized_tanh @@ -3138,7 +3123,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh_(Tensor(a!) self) -> Tensor(a!) 
supports_named_tensor: True -@@ -2765,12 +3487,16 @@ +@@ -2765,12 +3497,16 @@ dispatch: CPU: _tanh__cpu CUDA: _tanh__cuda @@ -3155,7 +3140,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function -@@ -2783,6 +3509,8 @@ +@@ -2783,6 +3519,8 @@ dispatch: CPU: threshold CUDA: threshold_cuda @@ -3164,7 +3149,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) variants: function -@@ -2790,12 +3518,16 @@ +@@ -2790,12 +3528,16 @@ dispatch: CPU: threshold_ CUDA: threshold__cuda @@ -3181,7 +3166,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor use_c10_dispatcher: full -@@ -2803,6 +3535,8 @@ +@@ -2803,6 +3545,8 @@ dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda @@ -3190,7 +3175,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) variants: function, method -@@ -2835,18 +3569,24 @@ +@@ -2835,18 +3579,24 @@ use_c10_dispatcher: full python_module: nn variants: function @@ -3215,7 +3200,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args -@@ -2872,6 +3612,8 @@ +@@ -2872,6 +3622,8 @@ CUDA: true_divide SparseCPU: true_divide_sparse SparseCUDA: true_divide_sparse @@ -3224,7 +3209,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
-@@ -2881,6 +3623,8 @@ +@@ -2881,6 +3633,8 @@ CUDA: true_divide_ SparseCPU: true_divide_sparse_ SparseCUDA: true_divide_sparse_ @@ -3233,7 +3218,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -@@ -2889,31 +3633,43 @@ +@@ -2889,31 +3643,43 @@ CUDA: true_divide_out SparseCPU: true_divide_out_sparse_zerodim SparseCUDA: true_divide_out_sparse_zerodim @@ -3277,7 +3262,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -2956,6 +3712,8 @@ +@@ -2956,6 +3722,8 @@ dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda @@ -3286,7 +3271,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _unsafe_view(Tensor self, int[] size) -> Tensor -@@ -2971,32 +3729,48 @@ +@@ -2971,32 +3739,48 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3335,7 +3320,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: view_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -3009,17 +3783,23 @@ +@@ -3009,13 +3793,19 @@ - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method @@ -3355,12 +3340,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function -- -+ - # VariableType::_weight_norm does not want to be given a gap in the autograd graph, - # so we don't define "dispatch" variants for it. - - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor -@@ -3041,13 +3821,21 @@ +@@ -3041,13 +3831,21 @@ - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor device_guard: False @@ -3382,7 +3362,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full -@@ -3100,25 +3888,37 @@ +@@ -3100,25 +3898,37 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor dispatch: @@ -3422,7 +3402,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method -@@ -3162,12 +3962,16 @@ +@@ -3162,12 +3972,16 @@ SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone QuantizedCPU: quantized_clone @@ -3439,7 +3419,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -3176,6 +3980,8 @@ +@@ -3176,6 +3990,8 @@ CUDA: pow_out SparseCPU: pow_out_sparse_scalar SparseCUDA: pow_out_sparse_scalar @@ -3448,7 +3428,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full -@@ -3186,6 +3992,8 @@ +@@ -3186,6 +4002,8 @@ CUDA: pow SparseCPU: pow_sparse_scalar SparseCUDA: pow_sparse_scalar @@ -3457,7 +3437,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: zero_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -3196,6 +4004,14 @@ +@@ -3196,6 +4014,14 @@ SparseCPU: zero_sparse_ SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ @@ -3472,7 +3452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
dispatch: -@@ -3204,6 +4020,8 @@ +@@ -3204,6 +4030,8 @@ SparseCPU: sub_out_sparse SparseCUDA: sub_out_sparse supports_named_tensor: True @@ -3481,7 +3461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -3213,6 +4031,8 @@ +@@ -3213,6 +4041,8 @@ CUDA: sub SparseCPU: sub_sparse SparseCUDA: sub_sparse @@ -3490,7 +3470,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) -@@ -3222,6 +4042,8 @@ +@@ -3222,6 +4052,8 @@ CUDA: sub_ SparseCPU: sub_sparse_ SparseCUDA: sub_sparse_ @@ -3499,7 +3479,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True # For C++ only, until we have conversion from C++ numbers to Tensor -@@ -3229,21 +4051,29 @@ +@@ -3229,21 +4061,29 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3529,7 +3509,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. 
-@@ -3257,6 +4087,8 @@ +@@ -3257,6 +4097,8 @@ CUDA: legacy::cuda::_th_addmm_out SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda @@ -3538,7 +3518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor -@@ -3267,6 +4099,8 @@ +@@ -3267,6 +4109,8 @@ CUDA: legacy::cuda::_th_addmm SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda @@ -3547,7 +3527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) -@@ -3278,9 +4112,10 @@ +@@ -3278,9 +4122,10 @@ # broadcasting SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ @@ -3559,7 +3539,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # NOTE [ Sparse: autograd and API ] # # -@@ -3396,7 +4231,6 @@ +@@ -3396,7 +4241,6 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. @@ -3567,7 +3547,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. 
- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor -@@ -3433,7 +4267,6 @@ +@@ -3433,7 +4277,6 @@ SparseCUDA: sparse_resize_and_clear_ requires_tensor: True @@ -3575,7 +3555,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full variants: method -@@ -3442,7 +4275,6 @@ +@@ -3442,7 +4285,6 @@ SparseCUDA: sparse_mask_cuda requires_tensor: True @@ -3583,7 +3563,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3474,7 +4306,6 @@ +@@ -3474,7 +4316,6 @@ requires_tensor: True device_guard: False @@ -3591,7 +3571,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3494,7 +4325,6 @@ +@@ -3494,7 +4335,6 @@ requires_tensor: True device_guard: False @@ -3599,7 +3579,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3504,7 +4334,6 @@ +@@ -3504,7 +4344,6 @@ requires_tensor: True device_guard: False @@ -3607,7 +3587,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: coalesce(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3513,7 +4342,6 @@ +@@ -3513,7 +4352,6 @@ SparseCUDA: coalesce_sparse_cuda requires_tensor: True @@ -3615,7 +3595,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method -@@ -3524,7 +4352,6 @@ +@@ -3524,7 +4362,6 @@ device_guard: False supports_named_tensor: True @@ -3623,7 +3603,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= - func: _indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: -@@ -3568,7 +4395,6 @@ +@@ -3568,7 +4405,6 @@ requires_tensor: True device_guard: False @@ -3631,7 +3611,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) dispatch: SparseCPU: hspmm_out_sparse_cpu -@@ -3630,11 +4456,15 @@ +@@ -3630,11 +4466,15 @@ variants: function dispatch: CPU: quantize_per_tensor_cpu @@ -3647,7 +3627,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dequantize(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -3713,20 +4543,28 @@ +@@ -3713,20 +4553,28 @@ variants: method device_guard: False supports_named_tensor: True @@ -3676,7 +3656,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: meshgrid(Tensor[] tensors) -> Tensor[] -@@ -3765,6 +4603,8 @@ +@@ -3765,6 +4613,8 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -3685,7 +3665,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= variants: function supports_named_tensor: True -@@ -3791,10 +4631,16 @@ +@@ -3791,10 +4641,16 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3702,7 +3682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -@@ -3839,10 +4685,14 @@ +@@ -3839,10 +4695,14 @@ # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) @@ -3717,7 +3697,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' 
'--exclude=android' '--exclude= # wrappers for legacy TH methods -@@ -3852,6 +4702,8 @@ +@@ -3852,6 +4712,8 @@ dispatch: CPU: set_ CUDA: set_ @@ -3726,7 +3706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method -@@ -3860,6 +4712,8 @@ +@@ -3860,6 +4722,8 @@ CPU: legacy::cpu::_th_set_ CUDA: legacy::cuda::_th_set_ QuantizedCPU: set_storage @@ -3735,7 +3715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method -@@ -3867,12 +4721,16 @@ +@@ -3867,12 +4731,16 @@ dispatch: CPU: set_tensor_ CUDA: set_tensor_ @@ -3752,7 +3732,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) 
variants: method -@@ -3892,6 +4750,8 @@ +@@ -3892,6 +4760,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3761,7 +3741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor -@@ -3904,6 +4764,8 @@ +@@ -3904,6 +4774,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3770,7 +3750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor -@@ -3916,6 +4778,8 @@ +@@ -3916,6 +4788,8 @@ dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda @@ -3779,7 +3759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor use_c10_dispatcher: full -@@ -3929,25 +4793,35 @@ +@@ -3929,25 +4803,35 @@ CUDA: view MkldnnCPU: mkldnn_view QuantizedCPU: view @@ -3815,7 +3795,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3955,11 +4829,15 @@ +@@ -3955,11 +4839,15 @@ dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ @@ -3831,7 +3811,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) variants: method -@@ -3967,11 +4845,15 @@ +@@ -3967,11 +4855,15 @@ CPU: index_fill_ CUDA: index_fill_ supports_named_tensor: True @@ -3847,7 +3827,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) 
variants: method -@@ -3994,6 +4876,8 @@ +@@ -3994,6 +4886,8 @@ dispatch: CPU: scatter_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3856,7 +3836,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full -@@ -4004,6 +4888,8 @@ +@@ -4004,6 +4898,8 @@ dispatch: CPU: scatter_fill_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3865,7 +3845,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full -@@ -4020,81 +4906,127 @@ +@@ -4020,81 +4916,127 @@ dispatch: CPU: scatter_add_cpu_ CUDA: legacy::cuda::_th_scatter_add_ @@ -3966,16 +3946,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor variants: method, function -- + npu_dispatch: + NPU: bitwise_and_npu -+ + - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -- + npu_dispatch: + NPU: bitwise_and_npu_ -+ + - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method + npu_dispatch: @@ -3995,7 +3973,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4107,70 +5039,106 @@ +@@ -4107,70 +5049,106 @@ dispatch: CPU: bitwise_or_out CUDA: bitwise_or_out @@ -4102,7 +4080,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4240,18 +5208,24 @@ +@@ -4240,18 +5218,24 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) 
supports_named_tensor: True variants: method @@ -4127,7 +4105,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: digamma_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -4266,6 +5240,8 @@ +@@ -4266,6 +5250,8 @@ dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ @@ -4136,7 +4114,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4273,6 +5249,8 @@ +@@ -4273,6 +5259,8 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4145,7 +4123,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4280,53 +5258,71 @@ +@@ -4280,53 +5268,71 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4206,23 +4184,23 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CPU: legacy::cpu::_th_addbmm_ CUDA: legacy::cuda::_th_addbmm_ + npu_dispatch: -+ NPU: addbmm_npu_ ++ NPU: addbmm_npu_ - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) dispatch: CPU: legacy::cpu::_th_addbmm_out CUDA: legacy::cuda::_th_addbmm_out + npu_dispatch: -+ NPU: addbmm_out_npu ++ NPU: addbmm_out_npu - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -4334,28 +5330,40 @@ +@@ -4334,28 +5340,40 @@ dispatch: CPU: legacy::cpu::_th_addbmm CUDA: legacy::cuda::_th_addbmm + npu_dispatch: -+ NPU: addbmm_npu ++ NPU: addbmm_npu - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) variants: method @@ -4258,7 +4236,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: cauchy_(Tensor(a!) 
self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) -@@ -4380,6 +5388,8 @@ +@@ -4380,6 +5398,8 @@ dispatch: CPU: legacy::cpu::_th_diag_out CUDA: legacy::cuda::_th_diag_out @@ -4267,7 +4245,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: diag(Tensor self, int diagonal=0) -> Tensor use_c10_dispatcher: full -@@ -4387,30 +5397,44 @@ +@@ -4387,30 +5407,44 @@ dispatch: CPU: legacy::cpu::_th_diag CUDA: legacy::cuda::_th_diag @@ -4312,7 +4290,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: -@@ -4435,6 +5459,8 @@ +@@ -4435,6 +5469,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4321,7 +4299,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4444,6 +5470,8 @@ +@@ -4444,6 +5480,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4330,7 +4308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4451,6 +5479,8 @@ +@@ -4451,6 +5489,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4339,7 +4317,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4460,6 +5490,8 @@ +@@ -4460,6 +5500,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4348,7 +4326,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4467,6 +5499,8 @@ +@@ -4467,6 +5509,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4357,7 +4335,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4476,6 +5510,8 @@ +@@ -4476,6 +5520,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4366,7 +4344,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4483,6 +5519,8 @@ +@@ -4483,6 +5529,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4375,7 +4353,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4492,6 +5530,8 @@ +@@ -4492,6 +5540,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4384,7 +4362,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4499,6 +5539,8 @@ +@@ -4499,6 +5549,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4393,7 +4371,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4508,6 +5550,8 @@ +@@ -4508,6 +5560,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4402,7 +4380,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4515,6 +5559,8 @@ +@@ -4515,6 +5569,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4411,7 +4389,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4524,6 +5570,8 @@ +@@ -4524,6 +5580,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4420,7 +4398,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4531,6 +5579,8 @@ +@@ -4531,6 +5589,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4429,7 +4407,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4540,6 +5590,8 @@ +@@ -4540,6 +5600,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4438,7 +4416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4547,6 +5599,8 @@ +@@ -4547,6 +5609,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4447,7 +4425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4556,6 +5610,8 @@ +@@ -4556,6 +5620,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4456,7 +4434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4563,6 +5619,8 @@ +@@ -4563,6 +5629,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4465,7 +4443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4572,6 +5630,8 @@ +@@ -4572,6 +5640,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4474,7 +4452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4579,6 +5639,8 @@ +@@ -4579,6 +5649,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4483,7 +4461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4588,6 +5650,8 @@ +@@ -4588,6 +5660,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4492,7 +4470,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4595,6 +5659,8 @@ +@@ -4595,6 +5669,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4501,7 +4479,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4604,6 +5670,8 @@ +@@ -4604,6 +5680,8 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4510,7 +4488,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True -@@ -4611,6 +5679,8 @@ +@@ -4611,6 +5689,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4519,7 +4497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4620,11 +5690,16 @@ +@@ -4620,11 +5700,16 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4536,7 +4514,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4632,11 +5707,16 @@ +@@ -4632,11 +5717,16 @@ dispatch: CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take @@ -4553,7 +4531,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4646,17 +5726,25 @@ +@@ -4646,17 +5736,25 @@ CUDA: legacy::cuda::_th_index_select SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4579,7 +4557,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_select(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full -@@ -4665,11 +5753,15 @@ +@@ -4665,11 +5763,15 @@ CPU: masked_select_cpu CUDA: masked_select_cuda supports_named_tensor: True @@ -4595,7 +4573,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -4677,6 +5769,8 @@ +@@ -4677,6 +5779,8 @@ dispatch: CPU: legacy::cpu::_th_nonzero CUDA: legacy::cuda::_th_nonzero @@ -4604,7 +4582,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function -@@ -4685,6 +5779,8 @@ +@@ -4685,6 +5789,8 @@ dispatch: CPU: gather_out_cpu CUDA: gather_out_cuda @@ -4613,7 +4591,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' 
'--exclude=android' '--exclude= - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor use_c10_dispatcher: full -@@ -4692,34 +5788,50 @@ +@@ -4692,34 +5798,50 @@ dispatch: CPU: gather_cpu CUDA: gather_cuda @@ -4621,10 +4599,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + NPU: gather_npu - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) -- + npu_dispatch: + NPU: gather_out_npu -+ + - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function + npu_dispatch: @@ -4665,7 +4642,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: -@@ -4826,9 +5938,13 @@ +@@ -4826,9 +5948,13 @@ CUDA: legacy::cuda::_th_potri - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) @@ -4679,7 +4656,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor) variants: function -@@ -4891,12 +6007,16 @@ +@@ -4891,12 +6017,16 @@ dispatch: CPU: multinomial_out CUDA: multinomial_out @@ -4696,7 +4673,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) variants: function -@@ -4947,6 +6067,8 @@ +@@ -4947,6 +6077,8 @@ dispatch: CPU: erfinv CUDA: erfinv @@ -4705,7 +4682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: erfinv_(Tensor(a!) self) -> Tensor(a!) 
supports_named_tensor: True -@@ -4954,26 +6076,36 @@ +@@ -4954,26 +6086,36 @@ dispatch: CPU: _erfinv__cpu CUDA: _erfinv__cuda @@ -4742,7 +4719,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full -@@ -4981,21 +6113,29 @@ +@@ -4981,21 +6123,29 @@ - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True @@ -4772,7 +4749,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor use_c10_dispatcher: full -@@ -5003,6 +6143,8 @@ +@@ -5003,6 +6153,8 @@ dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar @@ -4781,7 +4758,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -5010,11 +6152,15 @@ +@@ -5010,11 +6162,15 @@ dispatch: CPU: lerp_cpu_tensor CUDA: lerp_cuda_tensor @@ -4797,7 +4774,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor use_c10_dispatcher: full -@@ -5022,11 +6168,15 @@ +@@ -5022,11 +6178,15 @@ dispatch: CPU: legacy::cpu::_th_histc CUDA: _histc_cuda @@ -4813,7 +4790,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5034,11 +6184,15 @@ +@@ -5034,11 +6194,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4829,7 +4806,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5046,11 +6200,15 @@ +@@ -5046,11 +6210,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4845,7 +4822,7 @@ diff -Nur 
'--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5058,11 +6216,15 @@ +@@ -5058,11 +6226,15 @@ dispatch: CPU: remainder CUDA: remainder @@ -4861,7 +4838,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5070,12 +6232,18 @@ +@@ -5070,12 +6242,18 @@ dispatch: CPU: remainder CUDA: remainder @@ -4880,7 +4857,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: min(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5084,13 +6252,19 @@ +@@ -5084,13 +6262,19 @@ CPU: min CUDA: legacy::cuda::_th_min QuantizedCPU: min_quant @@ -4900,7 +4877,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5099,6 +6273,8 @@ +@@ -5099,6 +6283,8 @@ CPU: max CUDA: legacy::cuda::_th_max QuantizedCPU: max_quant @@ -4909,7 +4886,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: median(Tensor self) -> Tensor -@@ -5107,12 +6283,16 @@ +@@ -5107,12 +6293,16 @@ dispatch: CPU: median_cpu CUDA: median_cuda @@ -4926,7 +4903,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) variants: method, function -@@ -5120,23 +6300,45 @@ +@@ -5120,23 +6310,45 @@ CPU: legacy::cpu::_th_sort CUDA: legacy::cuda::_th_sort QuantizedCPU: sort_quant @@ -4972,7 +4949,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function -@@ -5144,11 +6346,15 @@ +@@ -5144,11 +6356,15 @@ CPU: 
topk CUDA: topk QuantizedCPU: quantized_topk_cpu @@ -4988,7 +4965,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: any(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5159,11 +6365,15 @@ +@@ -5159,11 +6375,15 @@ CUDA: any SparseCPU: any_sparse SparseCUDA: any_sparse @@ -5004,7 +4981,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor use_c10_dispatcher: full -@@ -5171,6 +6381,8 @@ +@@ -5171,6 +6391,8 @@ dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm @@ -5013,7 +4990,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method -@@ -5178,6 +6390,8 @@ +@@ -5178,6 +6400,8 @@ dispatch: CPU: unfold CUDA: unfold @@ -5022,7 +4999,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full -@@ -5186,6 +6400,8 @@ +@@ -5186,6 +6410,8 @@ CPU: legacy::cpu::_th_equal CUDA: legacy::cuda::_th_equal QuantizedCPU: quantized_equal @@ -5031,7 +5008,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) 
-@@ -5193,6 +6409,8 @@ +@@ -5193,6 +6419,8 @@ dispatch: CPU: pow_out CUDA: pow_out @@ -5040,7 +5017,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5201,12 +6419,16 @@ +@@ -5201,12 +6429,16 @@ dispatch: CPU: pow CUDA: pow @@ -5057,7 +5034,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5214,6 +6436,8 @@ +@@ -5214,6 +6446,8 @@ dispatch: CPU: pow CUDA: pow @@ -5066,7 +5043,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method -@@ -5221,40 +6445,58 @@ +@@ -5221,40 +6455,58 @@ CPU: normal_cpu_ CUDA: normal_cuda_ supports_named_tensor: True @@ -5125,7 +5102,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function -@@ -5265,16 +6507,22 @@ +@@ -5265,16 +6517,22 @@ dispatch: CPU: legacy::cpu::_th_addr CUDA: legacy::cuda::_th_addr @@ -5148,7 +5125,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) 
dispatch: -@@ -5286,22 +6534,30 @@ +@@ -5286,22 +6544,30 @@ dispatch: CPU: _cumsum_cpu CUDA: legacy::cuda::_th_cumsum @@ -5179,7 +5156,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5309,6 +6565,8 @@ +@@ -5309,6 +6575,8 @@ CPU: legacy::cpu::_th_var CUDA: legacy::cuda::_th_var supports_named_tensor: True @@ -5188,7 +5165,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5321,6 +6579,8 @@ +@@ -5321,6 +6589,8 @@ variants: function dispatch: CUDA: _amp_non_finite_check_and_unscale_cuda_ @@ -5197,7 +5174,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor variants: function -@@ -5332,12 +6592,16 @@ +@@ -5332,12 +6602,16 @@ CPU: _cat_cpu CUDA: cat_cuda QuantizedCPU: quantized_cat @@ -5214,7 +5191,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) dispatch: -@@ -5353,36 +6617,50 @@ +@@ -5353,36 +6627,50 @@ dispatch: CPU: legacy::cpu::_th_max CUDA: legacy::cuda::_th_max @@ -5265,7 +5242,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full -@@ -5390,23 +6668,33 @@ +@@ -5390,23 +6678,33 @@ dispatch: CPU: mse_loss_backward CUDA: mse_loss_backward @@ -5299,7 +5276,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? 
weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5434,28 +6722,38 @@ +@@ -5434,22 +6732,30 @@ - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5330,20 +5307,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn - dispatch: - CPU: multilabel_margin_loss_backward_cpu_out - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out -+ npu_dispatch: -+ NPU: multilabel_margin_loss_backward_npu_out - - - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor - use_c10_dispatcher: full -@@ -5463,100 +6761,142 @@ - dispatch: - CPU: multilabel_margin_loss_backward_cpu - CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward -+ npu_dispatch: -+ NPU: multilabel_margin_loss_backward_npu +@@ -5466,97 +6772,137 @@ - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5481,7 +5445,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn -@@ -5564,6 +6904,8 @@ +@@ -5564,6 +6910,8 @@ CPU: elu_out CUDA: elu_out QuantizedCPU: quantized_elu_out @@ -5490,7 +5454,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full -@@ -5572,16 +6914,22 @@ +@@ -5572,16 +6920,22 @@ CPU: elu CUDA: elu QuantizedCPU: quantized_elu @@ -5513,7 +5477,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) python_module: nn -@@ -5589,12 +6937,16 @@ +@@ -5589,12 +6943,16 @@ CPU: elu_ CUDA: elu_ QuantizedCPU: quantized_elu_ @@ -5530,7 +5494,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu(Tensor self, int dim=-1) -> Tensor use_c10_dispatcher: full -@@ -5602,12 +6954,16 @@ +@@ -5602,12 +6960,16 @@ dispatch: CPU: glu CUDA: legacy::cuda::_thnn_glu_forward @@ -5547,7 +5511,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor use_c10_dispatcher: full -@@ -5615,20 +6971,30 @@ +@@ -5615,20 +6977,30 @@ dispatch: CPU: glu_backward CUDA: legacy::cuda::_thnn_glu_backward @@ -5578,7 +5542,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn -@@ -5636,6 +7002,8 @@ +@@ -5636,6 +7008,8 @@ CPU: hardtanh_out CUDA: hardtanh_out QuantizedCPU: quantized_hardtanh_out @@ -5587,7 +5551,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full -@@ -5644,16 +7012,22 @@ +@@ -5644,16 +7018,22 @@ CPU: hardtanh CUDA: hardtanh QuantizedCPU: quantized_hardtanh @@ -5610,7 +5574,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) python_module: nn -@@ -5661,6 +7035,8 @@ +@@ -5661,6 +7041,8 @@ CPU: hardtanh_ CUDA: hardtanh_ QuantizedCPU: quantized_hardtanh_ @@ -5619,7 +5583,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5668,6 +7044,8 @@ +@@ -5668,6 +7050,8 @@ CPU: leaky_relu_out CUDA: leaky_relu_out QuantizedCPU: quantized_leaky_relu_out @@ -5628,7 +5592,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full -@@ -5676,10 +7054,14 @@ +@@ -5676,10 +7060,14 @@ CPU: leaky_relu CUDA: leaky_relu QuantizedCPU: quantized_leaky_relu @@ -5643,7 +5607,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) 
python_module: nn -@@ -5687,31 +7069,44 @@ +@@ -5687,31 +7075,44 @@ CPU: leaky_relu_ CUDA: leaky_relu_ QuantizedCPU: quantized_leaky_relu_ @@ -5688,7 +5652,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor use_c10_dispatcher: full -@@ -5719,6 +7114,8 @@ +@@ -5719,6 +7120,8 @@ dispatch: CPU: log_sigmoid_backward_cpu CUDA: legacy::cuda::_thnn_log_sigmoid_backward @@ -5697,7 +5661,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5744,37 +7141,53 @@ +@@ -5744,37 +7147,53 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5751,7 +5715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn -@@ -5782,9 +7195,13 @@ +@@ -5782,9 +7201,13 @@ CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda MkldnnCPU: mkldnn_adaptive_avg_pool2d_out @@ -5765,7 +5729,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: -@@ -5796,6 +7213,8 @@ +@@ -5796,6 +7219,8 @@ CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: quantized_adaptive_avg_pool2d @@ -5774,7 +5738,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5803,24 +7222,32 @@ +@@ -5803,24 +7228,32 @@ dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda @@ -5807,7 +5771,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5828,6 +7255,8 @@ +@@ -5828,6 +7261,8 @@ dispatch: CPU: adaptive_avg_pool3d_backward_cpu CUDA: adaptive_avg_pool3d_backward_cuda @@ -5816,7 +5780,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) -@@ -5835,6 +7264,8 @@ +@@ -5835,6 +7270,8 @@ dispatch: CPU: adaptive_max_pool2d_out_cpu CUDA: adaptive_max_pool2d_out_cuda @@ -5825,7 +5789,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) -@@ -5842,12 +7273,16 @@ +@@ -5842,12 +7279,16 @@ dispatch: CPU: adaptive_max_pool2d_cpu CUDA: adaptive_max_pool2d_cuda @@ -5842,7 +5806,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor use_c10_dispatcher: full -@@ -5855,6 +7290,8 @@ +@@ -5855,6 +7296,8 @@ dispatch: CPU: adaptive_max_pool2d_backward_cpu CUDA: adaptive_max_pool2d_backward_cuda @@ -5851,7 +5815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5889,6 +7326,8 @@ +@@ -5889,6 +7332,8 @@ CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda MkldnnCPU: mkldnn_avg_pool2d_out @@ -5860,7 +5824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5897,24 +7336,32 @@ +@@ -5897,24 +7342,32 @@ CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d QuantizedCPU: quantized_avg_pool2d @@ -5893,7 +5857,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor python_module: nn -@@ -5922,18 +7369,24 @@ +@@ -5922,18 +7375,24 @@ CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda QuantizedCPU: quantized_avg_pool3d @@ -5918,7 +5882,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5993,6 +7446,8 @@ +@@ -5993,6 +7452,8 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda @@ -5927,7 +5891,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6000,6 +7455,8 @@ +@@ -6000,6 +7461,8 @@ dispatch: CPU: max_pool2d_with_indices_cpu CUDA: max_pool2d_with_indices_cuda @@ -5936,7 +5900,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) -@@ -6007,12 +7464,16 @@ +@@ -6007,12 +7470,16 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda @@ -5953,7 +5917,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) -@@ -6020,6 +7481,8 @@ +@@ -6020,6 +7487,8 @@ dispatch: CPU: max_pool3d_with_indices_out_cpu CUDA: max_pool3d_with_indices_out_cuda @@ -5962,7 +5926,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6027,6 +7490,8 @@ +@@ -6027,6 +7496,8 @@ dispatch: CPU: max_pool3d_with_indices_cpu CUDA: max_pool3d_with_indices_cuda @@ -5971,7 +5935,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) -@@ -6034,12 +7499,17 @@ +@@ -6034,12 +7505,17 @@ dispatch: CPU: max_pool3d_with_indices_backward_out_cpu CUDA: max_pool3d_with_indices_backward_out_cuda @@ -5989,7 +5953,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6118,12 +7588,16 @@ +@@ -6118,12 +7594,16 @@ dispatch: CPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda @@ -6006,7 +5970,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn -@@ -6166,12 +7640,16 @@ +@@ -6166,12 +7646,16 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda @@ -6023,7 +5987,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6214,12 +7692,16 @@ +@@ -6214,12 +7698,16 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda @@ -6040,7 +6004,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6232,12 +7714,16 @@ +@@ -6232,12 +7720,16 @@ dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda @@ -6057,7 +6021,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6245,42 +7731,56 @@ +@@ -6245,96 +7737,128 @@ CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda QuantizedCPU: quantized_upsample_bilinear2d_cpu @@ -6085,19 +6049,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= dispatch: CPU: upsample_bicubic2d_out_cpu CUDA: upsample_bicubic2d_out_cuda -- + npu_dispatch: + NPU: upsample_bicubic2d_out_npu -+ + - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn dispatch: CPU: upsample_bicubic2d_cpu CUDA: upsample_bicubic2d_cuda -- + npu_dispatch: + NPU: upsample_bicubic2d_npu -+ + - func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: @@ -6116,7 +6078,38 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_trilinear3d.out(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6311,30 +7811,40 @@ + dispatch: + CPU: upsample_trilinear3d_out_cpu + CUDA: upsample_trilinear3d_out_cuda ++ npu_dispatch: ++ NPU: upsample_trilinear3d_out_npu + + - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + dispatch: + CPU: upsample_trilinear3d_cpu + CUDA: upsample_trilinear3d_cuda ++ npu_dispatch: ++ NPU: upsample_trilinear3d_npu + + - func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn + dispatch: + CPU: upsample_trilinear3d_backward_out_cpu + CUDA: upsample_trilinear3d_backward_out_cuda ++ npu_dispatch: ++ NPU: upsample_trilinear3d_backward_out_npu + + - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? 
scales_w=None) -> Tensor + python_module: nn + dispatch: + CPU: upsample_trilinear3d_backward_cpu + CUDA: upsample_trilinear3d_backward_cuda ++ npu_dispatch: ++ NPU: upsample_trilinear3d_backward_npu + + - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) + python_module: nn dispatch: CPU: upsample_nearest1d_out_cpu CUDA: upsample_nearest1d_out_cuda @@ -6157,7 +6150,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6342,18 +7852,24 @@ +@@ -6342,24 +7866,32 @@ CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda QuantizedCPU: quantized_upsample_nearest2d_cpu @@ -6182,7 +6175,39 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest3d.out(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6385,20 +7901,28 @@ + dispatch: + CPU: upsample_nearest3d_out_cpu + CUDA: upsample_nearest3d_out_cuda ++ npu_dispatch: ++ NPU: upsample_nearest3d_out_npu + + - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn +@@ -6367,38 +7899,52 @@ + CPU: upsample_nearest3d_cpu + CUDA: upsample_nearest3d_cuda + QuantizedCPU: quantized_upsample_nearest3d_cpu ++ npu_dispatch: ++ NPU: upsample_nearest3d_npu + + - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+ python_module: nn + dispatch: + CPU: upsample_nearest3d_backward_out_cpu + CUDA: upsample_nearest3d_backward_out_cuda ++ npu_dispatch: ++ NPU: upsample_nearest3d_backward_out_npu + + - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor + python_module: nn + dispatch: + CPU: upsample_nearest3d_backward_cpu + CUDA: upsample_nearest3d_backward_cuda ++ npu_dispatch: ++ NPU: upsample_nearest3d_backward_npu + + - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) + python_module: nn dispatch: CPU: sigmoid_backward_out CUDA: sigmoid_backward_out @@ -6211,7 +6236,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # What's a thnn_conv_ versus a slow_conv_? # -@@ -6423,24 +7947,32 @@ +@@ -6423,24 +7969,32 @@ dispatch: CPU: slow_conv_transpose2d_out_cpu CUDA: slow_conv_transpose2d_out_cuda @@ -6244,7 +6269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6468,21 +8000,29 @@ +@@ -6468,21 +8022,29 @@ - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6274,7 +6299,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? 
grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn -@@ -6495,32 +8035,46 @@ +@@ -6495,32 +8057,46 @@ dispatch: CPU: slow_conv2d_backward_cpu CUDA: legacy::cuda::_thnn_conv2d_backward @@ -6321,7 +6346,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6553,12 +8107,16 @@ +@@ -6553,12 +8129,16 @@ dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda @@ -6338,7 +6363,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn -@@ -6577,57 +8135,396 @@ +@@ -6577,57 +8157,393 @@ dispatch: CPU: col2im_out_cpu CUDA: col2im_out_cuda @@ -6540,7 +6565,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_dispatch_only: + NPU: ptiou_npu + -+- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) ++- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) + variants: function + npu_dispatch_only: + NPU: nms_with_mask_npu @@ -6613,7 +6638,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + variants: function, method + npu_dispatch_only: + NPU: indexing_npu -+ ++ +- func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, *, Tensor(a!) out) -> Tensor(a!) + npu_dispatch_only: + NPU: indexing_out_npu @@ -6642,7 +6667,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- func: npu_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar beta1_power, Scalar beta2_power, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, bool? 
use_locking, bool? use_nesterov) -> (Tensor(a!), Tensor(b!), Tensor(c!)) + npu_dispatch_only: + NPU: apply_adam_npu -+ ++ +- func: npu_layer_norm_eval(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05) -> Tensor + npu_dispatch_only: + NPU: layer_norm_eval_npu @@ -6671,7 +6696,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_dispatch_only: + NPU: confusion_transpose_backward_npu + -+- func: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor ++- func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor + variants: function, method + npu_dispatch_only: + NPU: bmm_v2_npu @@ -6719,14 +6744,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_dispatch_only: + NPU: grid_assign_positive_npu + -+- func: global_step_inc() -> () -+ variants: function -+ use_c10_dispatcher: full -+ -+- func: set_start_fuzz_compile_step(int step) -> () -+ variants: function -+ use_c10_dispatcher: full -+ +- func: npu_mish_backward(Tensor grad, Tensor input) -> Tensor + npu_dispatch_only: + NPU: mish_backward_npu @@ -6735,10 +6752,15 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + variants: function, method + npu_dispatch_only: + NPU: normalize_batch_npu ++ ++- func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor ++ variants: function, method ++ npu_dispatch_only: ++ NPU: masked_fill_range_npu \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur 
pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-06-25 16:37:35.566259444 +0800 ++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-05 14:59:26.496336915 +0800 @@ -659,14 +659,14 @@ SUB x1, x1, 4 @@ -6762,9 +6784,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 5: CMP x1, 2 -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-06-25 16:37:35.510259016 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-05 14:59:26.440336488 +0800 @@ -64,7 +64,7 @@ Tensor isinf(const Tensor &self) { @@ -6774,9 +6796,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return at::zeros_like(self, at::kBool, at::MemoryFormat::Preserve); } return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' 
pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-06-25 16:37:35.510259016 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-05 14:59:26.444336518 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6819,9 +6841,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else { allocator = at::getCPUAllocator(); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-06-25 16:37:35.510259016 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-05 14:59:26.444336518 +0800 @@ -87,6 +87,7 @@ if (self.is_contiguous(memory_format)) { return self; @@ -6830,9 +6852,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= TORCH_CHECK( memory_format != MemoryFormat::Preserve, "preserve memory format is unsupported by the 
contiguous operator"); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-06-25 16:37:35.514259047 +0800 ++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-05 14:59:26.444336518 +0800 @@ -26,7 +26,7 @@ const scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; @@ -6842,9 +6864,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= out[0] = in[0]; in += input_width * input_height; out += output_width * output_height; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py --- pytorch-v1.5.0/aten/src/ATen/native_parse.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native_parse.py 2021-06-25 16:37:35.582259566 +0800 ++++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-05 14:59:26.512337037 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6880,9 +6902,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= declarations.append(declaration) except Exception as e: msg = '''Exception raised in processing function: -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-06-25 16:37:35.582259566 +0800 ++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-05 14:59:26.512337037 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6912,9 +6934,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= default_backends = ['CPU', 'CUDA'] -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-06-25 16:37:35.582259566 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-05 14:59:26.512337037 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6945,9 +6967,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /// Returns if a `Tensor` has HIP backend. 
bool is_hip() const; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-06-25 16:37:35.582259566 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-05 14:59:26.512337037 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6979,9 +7001,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= inline NamedTensorMeta* Tensor::get_named_tensor_meta() { return static_cast(impl_->named_tensor_meta()); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-06-25 16:37:35.586259596 +0800 ++++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-05 14:59:26.516337067 +0800 @@ -48,6 +48,11 @@ ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) @@ -6994,9 +7016,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 
CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h") -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-06-25 16:37:35.586259596 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-05 14:59:26.520337098 +0800 @@ -1,9 +1,32 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7103,9 +7125,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return THStorage_(data)(self)[idx]; } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-06-25 16:37:35.586259596 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-05 14:59:26.520337098 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7142,9 +7164,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= TH_API THStorage* THStorage_(newWithSize1)(scalar_t); TH_API THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt --- pytorch-v1.5.0/c10/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/CMakeLists.txt 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/CMakeLists.txt 2021-07-05 14:59:26.532337189 +0800 @@ -63,6 +63,14 @@ message(STATUS "don't use NUMA") endif() @@ -7171,9 +7193,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if(USE_ROCM) # NB: This directory is generated by the HIPIFY script; it's # not checked in -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h --- pytorch-v1.5.0/c10/core/Backend.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Backend.h 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/Backend.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7266,9 +7288,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= default: return "UNKNOWN_BACKEND"; } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp --- pytorch-v1.5.0/c10/core/Device.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.cpp 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/Device.cpp 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7306,9 +7328,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }}; auto device = std::find_if( types.begin(), -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h --- pytorch-v1.5.0/c10/core/Device.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.h 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/Device.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7341,9 +7363,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /// Return true if the device is of CPU type. 
bool is_cpu() const noexcept { return type_ == DeviceType::CPU; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp --- pytorch-v1.5.0/c10/core/DeviceType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.cpp 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7381,9 +7403,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return true; default: return false; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h --- pytorch-v1.5.0/c10/core/DeviceType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.h 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/DeviceType.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7424,9 +7446,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= constexpr DeviceType kHIP = DeviceType::HIP; constexpr DeviceType kMSNPU = DeviceType::MSNPU; constexpr DeviceType kXLA = DeviceType::XLA; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp --- pytorch-v1.5.0/c10/core/DispatchKey.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.cpp 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7456,9 +7478,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= case DispatchKey::BackendSelect: return "BackendSelect"; case DispatchKey::TESTING_ONLY_GenericModeTensorId: -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h --- pytorch-v1.5.0/c10/core/DispatchKey.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.h 2021-06-25 16:37:35.598259688 +0800 ++++ pytorch-develop/c10/core/DispatchKey.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7488,9 +7510,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= PrivateUse2_TensorId, PrivateUse3_TensorId, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h --- pytorch-v1.5.0/c10/core/Storage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Storage.h 2021-06-25 16:37:35.602259718 +0800 ++++ pytorch-develop/c10/core/Storage.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7522,9 +7544,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= protected: c10::intrusive_ptr storage_impl_; }; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h --- pytorch-v1.5.0/c10/core/StorageImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/StorageImpl.h 2021-06-25 16:37:35.602259718 +0800 ++++ pytorch-develop/c10/core/StorageImpl.h 2021-07-05 14:59:26.532337189 +0800 @@ -1,12 +1,39 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7579,9 +7601,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= bool received_cuda() { return received_cuda_; } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h --- pytorch-v1.5.0/c10/core/TensorImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorImpl.h 2021-06-25 16:37:35.602259718 +0800 ++++ pytorch-develop/c10/core/TensorImpl.h 2021-07-05 14:59:26.536337219 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7649,9 +7671,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= inline void set_pyobj(PyObject* pyobj) noexcept { pyobj_ = pyobj; } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h --- pytorch-v1.5.0/c10/core/TensorOptions.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorOptions.h 2021-06-25 16:37:35.602259718 +0800 ++++ pytorch-develop/c10/core/TensorOptions.h 2021-07-05 14:59:26.536337219 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7690,9 +7712,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else { AT_ASSERTM(false, "Unknown DispatchKey: ", tid); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h --- pytorch-v1.5.0/c10/macros/Export.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/macros/Export.h 2021-06-25 16:37:35.602259718 +0800 ++++ pytorch-develop/c10/macros/Export.h 2021-07-05 14:59:26.536337219 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7726,7 +7748,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #if defined(TORCH_HIP_BUILD_MAIN_LIB) #define TORCH_HIP_API C10_EXPORT #else -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/caffe2/.clang-format pytorch-develop/caffe2/.clang-format +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/.clang-format pytorch-develop/caffe2/.clang-format --- pytorch-v1.5.0/caffe2/.clang-format 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/caffe2/.clang-format 1970-01-01 08:00:00.000000000 +0800 @@ -1,87 +0,0 @@ @@ -7817,9 +7839,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -TabWidth: 8 -UseTab: Never -... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt --- pytorch-v1.5.0/caffe2/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/caffe2/CMakeLists.txt 2021-06-25 16:37:35.610259779 +0800 ++++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-05 14:59:26.544337280 +0800 @@ -32,6 +32,7 @@ # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) @@ -7964,9 +7986,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # ---[ Caffe2 HIP sources. if(USE_ROCM) # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format --- pytorch-v1.5.0/.clang-format 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.clang-format 2021-06-25 16:37:35.478258772 +0800 ++++ pytorch-develop/.clang-format 2021-07-05 14:59:26.412336274 +0800 @@ -84,5 +84,4 @@ SpacesInSquareBrackets: false Standard: Cpp11 @@ -7975,9 +7997,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -... +UseTab: Never \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake --- pytorch-v1.5.0/cmake/BuildVariables.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/BuildVariables.cmake 2021-06-25 16:37:35.722260634 +0800 ++++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-05 14:59:26.652338104 +0800 @@ -11,6 +11,7 @@ # CMakeLists.txt files under each folder respectively. set(Caffe2_CPU_SRCS) @@ -8002,9 +8024,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # This variable contains dependency libraries of Caffe2 which requires whole # symbol linkage. One example is the onnx lib where we need all its schema # symbols. 
However, if the lib is whole linked in caffe2 lib, we don't want -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake --- pytorch-v1.5.0/cmake/Codegen.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Codegen.cmake 2021-06-25 16:37:35.722260634 +0800 ++++ pytorch-develop/cmake/Codegen.cmake 2021-07-05 14:59:26.656338135 +0800 @@ -191,13 +191,14 @@ file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) @@ -8033,9 +8055,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET) + add_dependencies(ATEN_NPU_FILES_GEN_LIB ATEN_NPU_FILES_GEN_TARGET) endif() -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake --- pytorch-v1.5.0/cmake/Dependencies.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Dependencies.cmake 2021-06-25 16:37:35.722260634 +0800 ++++ pytorch-develop/cmake/Dependencies.cmake 2021-07-05 14:59:26.656338135 +0800 @@ -1509,6 +1509,13 @@ ENDIF(NOT C_HAS_THREAD) endif() @@ -8050,9 +8072,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # # End ATen 
checks # -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake --- pytorch-v1.5.0/cmake/Summary.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Summary.cmake 2021-06-25 16:37:35.722260634 +0800 ++++ pytorch-develop/cmake/Summary.cmake 2021-07-05 14:59:26.656338135 +0800 @@ -134,6 +134,7 @@ if(NOT "${SELECTED_OP_LIST}" STREQUAL "") message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") @@ -8061,9 +8083,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") endfunction() -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-06-25 16:37:35.722260634 +0800 ++++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-05 14:59:26.656338135 +0800 @@ -112,6 +112,11 @@ list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() @@ -8076,9 +8098,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # When we build libtorch with the old GCC ABI, dependent libraries must too. 
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt --- pytorch-v1.5.0/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/CMakeLists.txt 2021-06-25 16:37:35.482258803 +0800 ++++ pytorch-develop/CMakeLists.txt 2021-07-05 14:59:26.412336274 +0800 @@ -205,6 +205,10 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) @@ -8143,9 +8165,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-private-field") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore --- pytorch-v1.5.0/.dockerignore 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.dockerignore 2021-06-25 16:37:35.478258772 +0800 ++++ pytorch-develop/.dockerignore 2021-07-05 14:59:26.412336274 +0800 @@ -1,257 +1 @@ -# READ THIS BEFORE YOU REFACTOR ME -# @@ -8406,9 +8428,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -.clangd/ +.gitignore \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' 
'--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat --- pytorch-v1.5.0/docs/make.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/docs/make.bat 2021-06-25 16:37:35.730260695 +0800 ++++ pytorch-develop/docs/make.bat 2021-07-05 14:59:26.660338165 +0800 @@ -1,36 +1,36 @@ -@ECHO OFF - @@ -8482,7 +8504,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + +:end +popd -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/ios/TestApp/.clang-format pytorch-develop/ios/TestApp/.clang-format +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/ios/TestApp/.clang-format pytorch-develop/ios/TestApp/.clang-format --- pytorch-v1.5.0/ios/TestApp/.clang-format 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/ios/TestApp/.clang-format 1970-01-01 08:00:00.000000000 +0800 @@ -1,8 +0,0 @@ @@ -8495,9 +8517,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -ColumnLimit: 100 -PointerBindsToType: false \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt --- pytorch-v1.5.0/requirements.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/requirements.txt 2021-06-25 
16:37:35.742260786 +0800 ++++ pytorch-develop/requirements.txt 2021-07-05 14:59:26.676338287 +0800 @@ -4,4 +4,12 @@ requests setuptools @@ -8514,9 +8536,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +Pillow>=5.3.0 +torchvision \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat --- pytorch-v1.5.0/scripts/appveyor/install.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install.bat 2021-06-25 16:37:35.742260786 +0800 ++++ pytorch-develop/scripts/appveyor/install.bat 2021-07-05 14:59:26.676338287 +0800 @@ -1,10 +1,10 @@ -:: Installation scripts for appveyor. 
- @@ -8538,9 +8560,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +set PATH=C:\Miniconda-x64;C:\Miniconda-x64\Scripts;%PATH% +:: Install numpy +conda install -y numpy -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-06-25 16:37:35.742260786 +0800 ++++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-05 14:59:26.676338287 +0800 @@ -1,22 +1,22 @@ -@echo on - @@ -8586,9 +8608,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + +:: Make sure that nvcc is working correctly. +nvcc -V || exit /b -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat --- pytorch-v1.5.0/scripts/build_windows.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/build_windows.bat 2021-06-25 16:37:35.742260786 +0800 ++++ pytorch-develop/scripts/build_windows.bat 2021-07-05 14:59:26.676338287 +0800 @@ -1,84 +1,84 @@ -:: ############################################################################# -:: Example command to build on Windows. 
@@ -8758,9 +8780,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +cd %ORIGINAL_DIR% +endlocal +exit /b 1 -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 --- pytorch-v1.5.0/scripts/proto.ps1 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/proto.ps1 2021-06-25 16:37:35.742260786 +0800 ++++ pytorch-develop/scripts/proto.ps1 2021-07-05 14:59:26.676338287 +0800 @@ -1,17 +1,17 @@ -param( - [string]$protoc, @@ -8796,9 +8818,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + +$cmd = "$protoc -I${dir} --cpp_out=$out $processed" +Invoke-Expression $cmd -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/setup.py pytorch-develop/setup.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py --- pytorch-v1.5.0/setup.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/setup.py 2021-06-25 16:37:35.742260786 +0800 ++++ pytorch-develop/setup.py 2021-07-05 14:59:26.676338287 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -8866,17 +8888,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'include/caffe2/utils/*.h', 'include/caffe2/utils/**/*.h', 'include/c10/*.h', -@@ -811,6 +838,9 @@ +@@ -811,6 +838,10 @@ 'include/c10/cuda/impl/*.h', 'include/c10/hip/*.h', 'include/c10/hip/impl/*.h', + 'include/c10/npu/*.h', ++ 'include/c10/npu/interface/*.h', + 'include/c10/npu/impl/*.h', + 'include/c10/npu/sys_ctrl/*.h', 'include/caffe2/**/*.h', 'include/torch/*.h', 'include/torch/csrc/*.h', -@@ -862,6 +892,9 @@ +@@ -862,6 +893,9 @@ 'include/THH/*.cuh', 'include/THH/*.h*', 'include/THH/generic/*.h', @@ -8886,7 +8909,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'share/cmake/ATen/*.cmake', 'share/cmake/Caffe2/*.cmake', 'share/cmake/Caffe2/public/*.cmake', -@@ -870,6 +903,7 @@ +@@ -870,6 +904,7 @@ 'share/cmake/Caffe2/Modules_CUDA_fix/upstream/FindCUDA/*.cmake', 'share/cmake/Gloo/*.cmake', 'share/cmake/Torch/*.cmake', @@ -8894,9 +8917,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= ], 'caffe2': [ 'python/serialized_test/data/operator_test/*.zip', -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml --- pytorch-v1.5.0/tools/autograd/derivatives.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/derivatives.yaml 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-05 14:59:27.812346954 +0800 @@ -107,6 +107,10 @@ # # NB: The parameter names here MUST be consistent with the parameter names @@ -8993,9 +9016,9 @@ diff 
-Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- name: npu_confusion_transpose(Tensor self, int[] perm, int[] shape, bool transpose_first) -> Tensor + self: npu_confusion_transpose_backward(grad, perm, self.sizes(), !transpose_first) + -+- name: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor -+ self: grad.npu_bmmV2(mat2.transpose(-2, -1)) -+ mat2: npu_bmmV2_mat2_backward(grad, self, mat2.sizes()) ++- name: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor ++ self: npu_bmm_v2_mat1_backward(grad, self, mat2, self.sizes()) ++ mat2: npu_bmm_v2_mat2_backward(grad, self, mat2, mat2.sizes()) + +- name: npu_deformable_conv2d(Tensor input, Tensor weight, Tensor offset, Tensor? bias, int[2] kernel_size, int[] stride, int[] padding, int[] dilation=[1,1,1,1], int groups=1, int deformable_groups=1, bool modulated=True) -> (Tensor, Tensor) + input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated) @@ -9003,9 +9026,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +- name: npu_mish(Tensor self) -> Tensor + self: npu_mish_backward(grad, self) \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py --- pytorch-v1.5.0/tools/autograd/dump_utils.py 1970-01-01 08:00:00.000000000 +0800 -+++ pytorch-develop/tools/autograd/dump_utils.py 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-05 14:59:27.812346954 +0800 @@ -0,0 +1,112 @@ +# 
Copyright (c) 2021 Huawei Technologies Co., Ltd +# All rights reserved. @@ -9119,9 +9142,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + "pin_memory", + "to_device" +] -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-05 14:59:27.812346954 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9305,9 +9328,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def uses_single_grad(func): return uses_ident(func, 'grad') + -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-05 14:59:27.816346984 +0800 @@ -1,3 +1,20 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9347,9 +9370,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # and add to op arg map argmap['options'] = { 'value': argname, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-05 14:59:27.816346984 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9520,9 +9543,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return body -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-05 14:59:27.816346984 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9563,25 +9586,46 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else if (min) { return grad * (self >= *min).type_as(grad); } else if (max) { -@@ -572,6 +592,15 @@ +@@ -572,6 +592,36 @@ } } ++Tensor npu_bmm_v2_mat1_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, IntArrayRef sizes) { ++ // da = grad * b^T ++ auto grad_with_full_size = grad; + -+Tensor npu_bmmV2_mat2_backward(const Tensor & grad, const Tensor & mat1, IntArrayRef sizes) { -+ if (sizes.size() == 2) { -+ return mat1.reshape({-1, mat1.size(-1)}).t().mm(grad.reshape({-1, grad.size(-1)})); -+ } else { -+ return mat1.transpose(-2, -1).npu_bmmV2(grad); ++ std::vector axis_reshape(grad.sizes().begin(), grad.sizes().end()); ++ if (mat1.dim() == 1) { ++ axis_reshape.insert(axis_reshape.begin() + axis_reshape.size() - 1, 1); ++ } else if (mat2.dim() == 1) { ++ axis_reshape.insert(axis_reshape.end(), 1); ++ } ++ return grad.view(axis_reshape).npu_bmmV2(mat2.dim() == 1 ? mat2.view({1, mat2.size(0)}) : mat2.transpose(-2, -1), sizes); ++} ++ ++Tensor npu_bmm_v2_mat2_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, IntArrayRef sizes) { ++ // db = a^T * grad ++ auto grad_with_full_size = grad; ++ ++ std::vector axis_reshape(grad.sizes().begin(), grad.sizes().end()); ++ if (mat1.dim() == 1) { ++ axis_reshape.insert(axis_reshape.begin() + axis_reshape.size() - 1, 1); ++ } else if (mat2.dim() == 1) { ++ axis_reshape.insert(axis_reshape.end(), 1); ++ } ++ ++ if (mat1.dim() == 1) { ++ return mat1.view({mat1.size(0), 1}).npu_bmmV2(grad.view(axis_reshape), sizes); + } ++ return mat1.transpose(-2, -1).npu_bmmV2(grad.view(axis_reshape), sizes); +} + Tensor _sparse_addmm_sparse_backward(const Tensor& grad, const Tensor& sparse_, const Tensor& dense, const Scalar& alpha) { AT_ASSERT(sparse_.is_sparse()); auto sparse = sparse_.coalesce(); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' 
'--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-05 14:59:27.816346984 +0800 @@ -22,7 +22,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/structseq.h" @@ -9663,9 +9707,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= pybind11::gil_scoped_release no_gil; return torch::randint(low, high, size, options); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-05 14:59:27.816346984 +0800 @@ -15,7 +15,13 @@ #include 
"torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Event.h" @@ -9750,9 +9794,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"data_ptr", (PyCFunction)THPVariable_data_ptr, METH_NOARGS, NULL}, {"dim", (PyCFunction)THPVariable_dim, METH_NOARGS, NULL}, {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL}, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-05 14:59:27.816346984 +0800 @@ -1,7 +1,27 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9781,9 +9825,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= // ${generated_comment} -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-05 14:59:27.816346984 +0800 @@ -1,3 +1,20 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9813,9 +9857,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= at::Tensor & unpack(Tensor & t, const char * name, int pos); const at::Tensor & unpack(const Tensor & t, const char * name, int pos); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl --- pytorch-v1.5.0/tools/build_variables.bzl 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/build_variables.bzl 2021-06-25 16:37:36.894269574 +0800 ++++ pytorch-develop/tools/build_variables.bzl 2021-07-05 14:59:27.816346984 +0800 @@ -46,6 +46,7 @@ "torch/csrc/autograd/functions/utils.cpp", "torch/csrc/autograd/input_buffer.cpp", @@ -9824,7 +9868,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= "torch/csrc/autograd/record_function.cpp", "torch/csrc/autograd/record_function_ops.cpp", "torch/csrc/autograd/saved_variable.cpp", -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/grad_mode.pyi pytorch-develop/torch/autograd/grad_mode.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/grad_mode.pyi pytorch-develop/torch/autograd/grad_mode.pyi --- pytorch-v1.5.0/torch/autograd/grad_mode.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/autograd/grad_mode.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,21 +0,0 @@ @@ -9849,7 +9893,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= - def __init__(self, mode: bool) -> None: ... - def __enter__(self) -> None: ... - def __exit__(self, *args: Any) -> bool: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/__init__.pyi pytorch-develop/torch/autograd/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/__init__.pyi pytorch-develop/torch/autograd/__init__.pyi --- pytorch-v1.5.0/torch/autograd/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/autograd/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,46 +0,0 @@ @@ -9899,9 +9943,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -_TensorOrTensors = Union[Tensor, Sequence[Tensor]] -def backward(tensors: _TensorOrTensors, grad_tensors: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=...) -> None: ... -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py --- pytorch-v1.5.0/torch/autograd/profiler.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/autograd/profiler.py 2021-06-25 16:37:36.902269635 +0800 ++++ pytorch-develop/torch/autograd/profiler.py 2021-07-05 14:59:27.820347015 +0800 @@ -1,8 +1,25 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -10372,9 +10416,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if use_cuda: append("CUDA time total: {}".format(format_time(cuda_time_total))) return ''.join(result) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt --- pytorch-v1.5.0/torch/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/CMakeLists.txt 2021-06-25 16:37:36.898269605 +0800 ++++ pytorch-develop/torch/CMakeLists.txt 2021-07-05 14:59:27.816346984 +0800 @@ -97,6 +97,7 @@ ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp ${TORCH_SRC_DIR}/csrc/utils.cpp @@ -10404,9 +10448,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (USE_NUMPY) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NUMPY) endif() -diff -Nur '--exclude=.git*' 
'--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-06-25 16:37:36.910269696 +0800 ++++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10527,9 +10571,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= const auto default_stream = guard.getDefaultStream(leaf_stream.device()); if (leaf_stream != default_stream) { auto event = c10::Event{c10::DeviceType::CUDA}; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10559,9 +10603,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= grad_inputs[1] = grad.to( src_options, /*non_blocking=*/false, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10602,9 +10646,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= .def("shapes", &Event::shapes); m.def("_enable_profiler", enableProfiler); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies 
Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10654,9 +10698,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } auto& old_var = buffer[pos]; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10850,9 +10894,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } CUDAStubs::~CUDAStubs() = default; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10975,9 +11019,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= TORCH_API void pushRange(std::string name); TORCH_API void popRange(); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-05 14:59:27.836347137 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei 
Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11029,9 +11073,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr}, {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr}, {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-05 14:59:27.836347137 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11070,9 +11114,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else { value = valueToTensor(self_.options(), py_value, self_device); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-06-25 16:37:36.914269727 +0800 ++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-05 14:59:27.836347137 +0800 @@ -168,6 +168,45 @@ return r.release(); } @@ -11119,9 +11163,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= inline PyObject* wrap(at::TensorList tl) { auto r = THPObjectPtr{PyTuple_New(tl.size())}; if (!r) throw python_error(); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-06-25 16:37:36.910269696 +0800 ++++ 
pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-05 14:59:27.832347106 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11153,9 +11197,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= namespace { const Variable & checked_cast_variable(const Tensor & t, const char * name, int pos) { if (!t.defined()) { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-05 14:59:27.836347137 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11259,9 +11303,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } while (!in_flight.empty()) { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-05 14:59:27.836347137 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11316,9 +11360,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-05 14:59:27.836347137 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11441,9 +11485,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } } } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-06-25 16:37:36.902269635 +0800 ++++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-05 14:59:27.824347045 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11490,9 +11534,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto it = attype_to_py_storage_type.find(attype); if (it != attype_to_py_storage_type.end()) { return it->second; -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp --- pytorch-v1.5.0/torch/csrc/Generator.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Generator.cpp 2021-06-25 16:37:36.902269635 +0800 ++++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-05 14:59:27.824347045 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11558,9 +11602,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #else TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); #endif -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-05 14:59:27.840347168 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11658,9 +11702,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return storage.release(); } -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-05 14:59:27.840347168 +0800 @@ -1,7 +1,25 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11737,9 +11781,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= THPObjectPtr item; try { for (Py_ssize_t i = 0; i < length; i++) { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-06-25 16:37:36.918269757 +0800 ++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-05 14:59:27.840347168 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright 
(c) 2019, Facebook CORPORATION. @@ -11785,9 +11829,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"data_ptr", (PyCFunction)THPStorage_(dataPtr), METH_NOARGS, nullptr}, {"is_pinned", (PyCFunction)THPStorage_(isPinned), METH_NOARGS, nullptr}, {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr}, -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp --- pytorch-v1.5.0/torch/csrc/Module.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Module.cpp 2021-06-25 16:37:36.902269635 +0800 ++++ pytorch-develop/torch/csrc/Module.cpp 2021-07-05 14:59:27.824347045 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11929,9 +11973,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #endif auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-05 14:59:27.860347320 +0800 @@ -1,18 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12306,9 +12350,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -}} // namespace torch::tensors +} // namespace tensors +} // namespace torch -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp --- pytorch-v1.5.0/torch/csrc/utils/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.cpp 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-05 14:59:27.860347320 +0800 @@ -1,6 +1,10 @@ #include #include @@ -12394,9 +12438,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + } +} } // namespace torch -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h --- pytorch-v1.5.0/torch/csrc/utils/init.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.h 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/init.h 2021-07-05 14:59:27.860347320 +0800 @@ -8,4 +8,7 @@ void initThroughputBenchmarkBindings(PyObject* module); @@ -12405,9 +12449,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + PyMethodDef* python_functions(); +} } // namespace torch -diff -Nur '--exclude=.git*' 
'--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-05 14:59:27.864347350 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12440,9 +12484,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } const std::string &device_str = THPUtils_unpackString(args[i]); return at::Device(device_str); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-05 14:59:27.864347350 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12471,9 +12515,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= registerLayoutObject((THPLayout*)strided_layout, at::Backend::MSNPU); registerLayoutObject((THPLayout*)strided_layout, at::Backend::XLA); registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU); -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-05 14:59:27.864347350 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12607,9 +12651,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= " or ", c10::DispatchKey::XLATensorId, " but got: ", dispatch_key); } else if(expected_layout == c10::kSparse) { -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-06-25 16:37:36.942269941 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-05 14:59:27.864347350 +0800 @@ -1,58 +1,91 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12775,7 +12819,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -}} // namespace torch::utils +} // namespace utils +} // namespace torch -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/cuda/__init__.pyi pytorch-develop/torch/cuda/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/cuda/__init__.pyi pytorch-develop/torch/cuda/__init__.pyi --- pytorch-v1.5.0/torch/cuda/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/cuda/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,41 +0,0 @@ @@ -12820,9 +12864,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def reset_max_memory_cached(device: Optional[_device_t]=...) -> None: ... -def set_rng_state(new_state): ... -def get_rng_state(): ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-06-25 16:37:36.946269971 +0800 ++++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-05 14:59:27.864347350 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12901,9 +12945,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: raise RuntimeError("Unsupported distributed backend by group") -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py --- pytorch-v1.5.0/torch/distributions/von_mises.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributions/von_mises.py 2021-06-25 16:37:36.946269971 +0800 ++++ pytorch-develop/torch/distributions/von_mises.py 2021-07-05 14:59:27.868347381 +0800 @@ -1,140 +1,140 @@ -from __future__ import absolute_import, division, print_function - @@ -13185,9 +13229,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + """ + return 1 - (_log_modified_bessel_fn(self.concentration, order=1) - + _log_modified_bessel_fn(self.concentration, order=0)).exp() -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py --- pytorch-v1.5.0/torch/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/__init__.py 2021-06-25 16:37:36.898269605 +0800 ++++ pytorch-develop/torch/__init__.py 2021-07-05 14:59:27.816346984 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13228,9 +13272,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +#register npu shutdown hook on exit +atexit.register(_npu_shutdown) \ No newline at end of file -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-06-25 16:37:36.950270002 +0800 ++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-05 14:59:27.868347381 +0800 @@ -28,6 +28,10 @@ option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() @@ -13281,9 +13325,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if(USE_C10D_MPI) target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH}) copy_header(ProcessGroupMPI.hpp) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-06-25 16:37:36.950270002 +0800 ++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-05 14:59:27.872347411 +0800 @@ -37,8 +37,11 @@ SET_TARGET_PROPERTIES(shm PROPERTIES PREFIX 
"lib" @@ -13297,7 +13341,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if(UNIX AND NOT APPLE) include(CheckLibraryExists) # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830 -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/common_types.pyi pytorch-develop/torch/nn/common_types.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/common_types.pyi pytorch-develop/torch/nn/common_types.pyi --- pytorch-v1.5.0/torch/nn/common_types.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/common_types.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,37 +0,0 @@ @@ -13338,9 +13382,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -# With the proposed 'Literal' feature to Python typing, it might be possible to -# eventually eliminate this. 
-_maybe_indices_t = _scalar_or_tuple_2_t[Tensor] -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py --- pytorch-v1.5.0/torch/nn/functional.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/functional.py 2021-06-25 16:37:36.954270032 +0800 ++++ pytorch-develop/torch/nn/functional.py 2021-07-05 14:59:27.872347411 +0800 @@ -1611,7 +1611,7 @@ else: output = input.matmul(weight.t()) @@ -13350,7 +13394,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= ret = output return ret -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/__init__.pyi pytorch-develop/torch/nn/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/__init__.pyi pytorch-develop/torch/nn/__init__.pyi --- pytorch-v1.5.0/torch/nn/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,7 +0,0 @@ @@ -13361,9 +13405,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -from . import utils as utils -from . import functional as functional -from . 
import parallel as parallel -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-06-25 16:37:36.954270032 +0800 ++++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-05 14:59:27.872347411 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13393,9 +13437,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: self.register_parameter('running_mean', None) self.register_parameter('running_var', None) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py --- pytorch-v1.5.0/torch/nn/modules/module.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/module.py 2021-06-25 16:37:36.954270032 +0800 ++++ pytorch-develop/torch/nn/modules/module.py 2021-07-05 14:59:27.876347442 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13536,9 +13580,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def convert(t): if convert_to_format is not None and t.dim() == 4: return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py --- pytorch-v1.5.0/torch/nn/modules/normalization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/normalization.py 2021-06-25 16:37:36.954270032 +0800 ++++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-05 14:59:27.876347442 +0800 @@ -128,13 +128,14 @@ """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] @@ -13569,9 +13613,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def extra_repr(self): return '{normalized_shape}, eps={eps}, ' \ -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-06-25 16:37:36.954270032 +0800 ++++ 
pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-05 14:59:27.876347442 +0800 @@ -1,60 +1,60 @@ -from ..init import xavier_uniform_ -from .activation import MultiheadAttention @@ -13693,7 +13737,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + activation: Any = ... + def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ... + def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/common_types.pyi pytorch-develop/torch/nn/parallel/common_types.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/common_types.pyi pytorch-develop/torch/nn/parallel/common_types.pyi --- pytorch-v1.5.0/torch/nn/parallel/common_types.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/common_types.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -13702,7 +13746,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -_device_t = Union[int, device] -_devices_t = Sequence[_device_t] -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi pytorch-develop/torch/nn/parallel/data_parallel.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi pytorch-develop/torch/nn/parallel/data_parallel.pyi --- 
pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/data_parallel.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,23 +0,0 @@ @@ -13729,9 +13773,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def data_parallel(module: Module, inputs: Any, device_ids: Optional[_devices_t] = ..., - output_device: Optional[_device_t] = ..., dim: int = ..., - module_kwargs: Optional[Any] = ...) -> Tensor: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py --- pytorch-v1.5.0/torch/nn/parallel/distributed.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/parallel/distributed.py 2021-06-25 16:37:36.958270063 +0800 ++++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-05 14:59:27.876347442 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13795,7 +13839,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + assert self.is_cuda or self.is_npu, "SyncBatchNorm layers only work with CUDA or NPU modules" layer._specify_ddp_gpu_num( len(self.device_ids) if self.device_ids else 1) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/distributed.pyi pytorch-develop/torch/nn/parallel/distributed.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.pyi pytorch-develop/torch/nn/parallel/distributed.pyi --- pytorch-v1.5.0/torch/nn/parallel/distributed.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/distributed.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,27 +0,0 @@ @@ -13826,7 +13870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def forward(self, *inputs: Any, **kwargs: Any) -> T_co: ... - - def __call__(self, *inputs: Any, **kwargs: Any) -> T_co: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/__init__.pyi pytorch-develop/torch/nn/parallel/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/__init__.pyi pytorch-develop/torch/nn/parallel/__init__.pyi --- pytorch-v1.5.0/torch/nn/parallel/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -13835,7 +13879,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -from .parallel_apply import parallel_apply as parallel_apply -from .replicate import replicate as replicate -from .scatter_gather import gather as gather, scatter as scatter -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi pytorch-develop/torch/nn/parallel/parallel_apply.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi pytorch-develop/torch/nn/parallel/parallel_apply.pyi --- pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/parallel_apply.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,7 +0,0 @@ @@ -13846,7 +13890,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -def parallel_apply(modules: Sequence[Module], inputs: Sequence[Any], kwargs_tup: Optional[Any] = ..., - devices: Optional[_devices_t] = ...) -> List[Any]: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/replicate.pyi pytorch-develop/torch/nn/parallel/replicate.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/replicate.pyi pytorch-develop/torch/nn/parallel/replicate.pyi --- pytorch-v1.5.0/torch/nn/parallel/replicate.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/replicate.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,9 +0,0 @@ @@ -13859,7 +13903,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -def replicate(network: Module[T], devices: Union[_devices_t, Sequence[_devices_t]], detach: bool = ...) -> List[ - Module[T]]: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi pytorch-develop/torch/nn/parallel/scatter_gather.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi pytorch-develop/torch/nn/parallel/scatter_gather.pyi --- pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parallel/scatter_gather.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,24 +0,0 @@ @@ -13887,7 +13931,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - - -def gather(outputs: Any, target_device: _device_t, dim: int = ...) -> Any: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parameter.pyi pytorch-develop/torch/nn/parameter.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parameter.pyi pytorch-develop/torch/nn/parameter.pyi --- pytorch-v1.5.0/torch/nn/parameter.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/parameter.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,7 +0,0 @@ @@ -13898,7 +13942,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, data: Tensor=..., requires_grad: builtins.bool=...): ... - - ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi pytorch-develop/torch/nn/utils/clip_grad.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi pytorch-develop/torch/nn/utils/clip_grad.pyi --- pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/clip_grad.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,10 +0,0 @@ @@ -13912,7 +13956,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - - -def clip_grad_value_(parameters: _tensor_or_tensors, clip_value: float): ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi pytorch-develop/torch/nn/utils/convert_parameters.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi pytorch-develop/torch/nn/utils/convert_parameters.pyi --- pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/convert_parameters.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,8 +0,0 @@ @@ -13924,7 +13968,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - - -def vector_to_parameters(vec: Tensor, parameters: Iterable[Tensor]) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/__init__.pyi pytorch-develop/torch/nn/utils/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/__init__.pyi pytorch-develop/torch/nn/utils/__init__.pyi --- pytorch-v1.5.0/torch/nn/utils/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -13933,7 +13977,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - vector_to_parameters as vector_to_parameters -from .spectral_norm import remove_spectral_norm as remove_spectral_norm, spectral_norm as spectral_norm -from .weight_norm import remove_weight_norm as remove_weight_norm, weight_norm as weight_norm -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur 
'--exclude=README*' pytorch-v1.5.0/torch/nn/utils/rnn.pyi pytorch-develop/torch/nn/utils/rnn.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/rnn.pyi pytorch-develop/torch/nn/utils/rnn.pyi --- pytorch-v1.5.0/torch/nn/utils/rnn.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/rnn.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,74 +0,0 @@ @@ -14011,7 +14055,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -def get_packed_sequence(data: Tensor, batch_sizes: Optional[Tensor], sorted_indices: Optional[Tensor], - unsorted_indices: Optional[Tensor]) -> PackedSequence: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi pytorch-develop/torch/nn/utils/spectral_norm.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi pytorch-develop/torch/nn/utils/spectral_norm.pyi --- pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/spectral_norm.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,33 +0,0 @@ @@ -14048,7 +14092,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - - -def remove_spectral_norm(module: T_module, name: str = ...) -> T_module: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi pytorch-develop/torch/nn/utils/weight_norm.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi pytorch-develop/torch/nn/utils/weight_norm.pyi --- pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/nn/utils/weight_norm.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,28 +0,0 @@ @@ -14080,9 +14124,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - - -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-06-25 16:37:36.958270063 +0800 ++++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-05 14:59:27.880347472 +0800 @@ -1621,14 +1621,23 @@ slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals] return g.op('Concat', *slices, axis_i=0) @@ -14140,7 +14184,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= state_indices = 2 * i, 2 * i + 2 -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' 
pytorch-v1.5.0/torch/optim/adadelta.pyi pytorch-develop/torch/optim/adadelta.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adadelta.pyi pytorch-develop/torch/optim/adadelta.pyi --- pytorch-v1.5.0/torch/optim/adadelta.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/adadelta.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14149,7 +14193,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class Adadelta(Optimizer): - def __init__(self, params: _params_t, lr: float=..., rho: float=..., eps: float=..., weight_decay: float=...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adagrad.pyi pytorch-develop/torch/optim/adagrad.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adagrad.pyi pytorch-develop/torch/optim/adagrad.pyi --- pytorch-v1.5.0/torch/optim/adagrad.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/adagrad.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14158,9 +14202,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class Adagrad(Optimizer): - def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=..., eps: float=...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py --- pytorch-v1.5.0/torch/optim/adamax.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/optim/adamax.py 2021-06-25 16:37:36.958270063 +0800 ++++ pytorch-develop/torch/optim/adamax.py 2021-07-05 14:59:27.880347472 +0800 @@ -80,8 +80,8 @@ exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0) @@ -14172,7 +14216,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= bias_correction = 1 - beta1 ** state['step'] clr = group['lr'] / bias_correction -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamax.pyi pytorch-develop/torch/optim/adamax.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.pyi pytorch-develop/torch/optim/adamax.pyi --- pytorch-v1.5.0/torch/optim/adamax.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/adamax.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14181,7 +14225,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class Adamax(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adam.pyi pytorch-develop/torch/optim/adam.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adam.pyi pytorch-develop/torch/optim/adam.pyi --- pytorch-v1.5.0/torch/optim/adam.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/adam.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14190,7 +14234,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class Adam(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamw.pyi pytorch-develop/torch/optim/adamw.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamw.pyi pytorch-develop/torch/optim/adamw.pyi --- pytorch-v1.5.0/torch/optim/adamw.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/adamw.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14199,7 +14243,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class AdamW(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/asgd.pyi pytorch-develop/torch/optim/asgd.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/asgd.pyi pytorch-develop/torch/optim/asgd.pyi --- pytorch-v1.5.0/torch/optim/asgd.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/asgd.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14208,7 +14252,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class ASGD(Optimizer): - def __init__(self, params: _params_t, lr: float=..., lambd: float=..., alpha: float=..., t0: float=..., weight_decay: float=...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/__init__.pyi pytorch-develop/torch/optim/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/__init__.pyi pytorch-develop/torch/optim/__init__.pyi --- pytorch-v1.5.0/torch/optim/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,13 +0,0 @@ @@ -14225,7 +14269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -from .rprop import Rprop -from .sgd import SGD as SGD -from .sparse_adam import SparseAdam -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/lbfgs.pyi pytorch-develop/torch/optim/lbfgs.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' 
'--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/lbfgs.pyi pytorch-develop/torch/optim/lbfgs.pyi --- pytorch-v1.5.0/torch/optim/lbfgs.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/lbfgs.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14234,7 +14278,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class LBFGS(Optimizer): - def __init__(self, params: _params_t, lr: float=..., max_iter: int=..., max_eval: Optional[int]=..., tolerance_grad: float=..., tolerance_change: float=..., history_size: int=..., line_search_fn: Optional[str]=...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/lr_scheduler.pyi pytorch-develop/torch/optim/lr_scheduler.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/lr_scheduler.pyi pytorch-develop/torch/optim/lr_scheduler.pyi --- pytorch-v1.5.0/torch/optim/lr_scheduler.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/lr_scheduler.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,39 +0,0 @@ @@ -14277,7 +14321,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -class CosineAnnealingWarmRestarts(_LRScheduler): - def __init__(self, optimizer: Optimizer, T_0: int=..., T_mult: int=..., eta_min: int=..., last_epoch: int=...) -> None: ... - def step(self, epoch: Optional[int] = ...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/optimizer.pyi pytorch-develop/torch/optim/optimizer.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/optimizer.pyi pytorch-develop/torch/optim/optimizer.pyi --- pytorch-v1.5.0/torch/optim/optimizer.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/optimizer.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,18 +0,0 @@ @@ -14299,7 +14343,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def zero_grad(self) -> None: ... - def step(self, closure: Optional[Callable[[], float]]=...) -> Optional[float]: ... - def add_param_group(self, param_group: dict) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/rmsprop.pyi pytorch-develop/torch/optim/rmsprop.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/rmsprop.pyi pytorch-develop/torch/optim/rmsprop.pyi --- pytorch-v1.5.0/torch/optim/rmsprop.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/rmsprop.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14308,7 +14352,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class RMSprop(Optimizer): - def __init__(self, params: _params_t, lr: float=..., alpha: float=..., eps: float=..., weight_decay: float=..., momentum: float=..., centered: bool=...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/rprop.pyi pytorch-develop/torch/optim/rprop.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/rprop.pyi pytorch-develop/torch/optim/rprop.pyi --- pytorch-v1.5.0/torch/optim/rprop.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/rprop.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,5 +0,0 @@ @@ -14317,7 +14361,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class Rprop(Optimizer): - def __init__(self, params: _params_t, lr: float=..., etas: Tuple[float, float]=..., step_sizes: Tuple[float, float]=...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/sgd.pyi pytorch-develop/torch/optim/sgd.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/sgd.pyi pytorch-develop/torch/optim/sgd.pyi --- pytorch-v1.5.0/torch/optim/sgd.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/sgd.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,4 +0,0 @@ @@ -14325,7 +14369,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class SGD(Optimizer): - def __init__(self, params: _params_t, lr: float, momentum: float=..., dampening: float=..., weight_decay:float=..., nesterov:bool=...) -> None: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/sparse_adam.pyi pytorch-develop/torch/optim/sparse_adam.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/sparse_adam.pyi pytorch-develop/torch/optim/sparse_adam.pyi --- pytorch-v1.5.0/torch/optim/sparse_adam.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/optim/sparse_adam.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,6 +0,0 @@ @@ -14335,9 +14379,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - -class SparseAdam(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py --- pytorch-v1.5.0/torch/serialization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/serialization.py 2021-06-25 16:37:36.962270093 +0800 ++++ pytorch-develop/torch/serialization.py 2021-07-05 14:59:27.880347472 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14419,9 +14463,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def location_tag(storage): -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py --- pytorch-v1.5.0/torch/storage.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/storage.py 2021-06-25 16:37:36.962270093 +0800 ++++ pytorch-develop/torch/storage.py 2021-07-05 14:59:27.880347472 +0800 @@ -7,6 +7,7 @@ class _StorageBase(object): @@ -14439,9 +14483,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= elif get_sharing_strategy() == 'file_system': self._share_filename_() else: -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py --- pytorch-v1.5.0/torch/tensor.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/tensor.py 2021-06-25 16:37:36.962270093 +0800 ++++ pytorch-develop/torch/tensor.py 2021-07-05 14:59:27.880347472 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14501,9 +14545,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return self def __reversed__(self): -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py --- pytorch-v1.5.0/torch/_tensor_str.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_tensor_str.py 2021-06-25 16:37:36.898269605 +0800 ++++ pytorch-develop/torch/_tensor_str.py 2021-07-05 14:59:27.820347015 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14555,9 +14599,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= suffixes.append('device=\'' + str(self.device) + '\'') has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool) -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py --- pytorch-v1.5.0/torch/utils/data/dataloader.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/dataloader.py 2021-06-25 16:37:36.966270124 +0800 ++++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-05 14:59:27.884347503 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14614,7 +14658,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= self._pin_memory_thread_done_event)) pin_memory_thread.daemon = True pin_memory_thread.start() -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataloader.pyi pytorch-develop/torch/utils/data/dataloader.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.pyi pytorch-develop/torch/utils/data/dataloader.pyi --- pytorch-v1.5.0/torch/utils/data/dataloader.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/data/dataloader.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,44 +0,0 @@ @@ -14662,7 +14706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __len__(self) -> int: ... - def __iter__(self) -> _BaseDataLoaderIter: ... - def __next__(self) -> Any: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataset.pyi pytorch-develop/torch/utils/data/dataset.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataset.pyi pytorch-develop/torch/utils/data/dataset.pyi --- pytorch-v1.5.0/torch/utils/data/dataset.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/data/dataset.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,32 +0,0 @@ @@ -14698,7 +14742,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, dataset: Dataset[T_co], indices: Sequence[int]) -> None: ... - -def random_split(dataset: Dataset[T], lengths: Sequence[int]) -> List[Subset[T]]: ... 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/distributed.pyi pytorch-develop/torch/utils/data/distributed.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/distributed.pyi pytorch-develop/torch/utils/data/distributed.pyi --- pytorch-v1.5.0/torch/utils/data/distributed.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/data/distributed.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,9 +0,0 @@ @@ -14711,7 +14755,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __iter__(self) -> Iterator[int]: ... - def __len__(self) -> int: ... - def set_epoch(self, epoch: int) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/__init__.pyi pytorch-develop/torch/utils/data/__init__.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/__init__.pyi pytorch-develop/torch/utils/data/__init__.pyi --- pytorch-v1.5.0/torch/utils/data/__init__.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/data/__init__.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,7 +0,0 @@ @@ -14722,7 +14766,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - Subset as Subset, random_split as random_split, IterableDataset as IterableDataset, \ - ChainDataset as ChainDataset -from .dataloader import DataLoader as DataLoader, get_worker_info as get_worker_info -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' 
pytorch-v1.5.0/torch/utils/data/sampler.pyi pytorch-develop/torch/utils/data/sampler.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/sampler.pyi pytorch-develop/torch/utils/data/sampler.pyi --- pytorch-v1.5.0/torch/utils/data/sampler.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/data/sampler.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,38 +0,0 @@ @@ -14764,9 +14808,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - drop_last: bool - - def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ... -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-06-25 16:37:36.966270124 +0800 ++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-05 14:59:27.884347503 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14810,7 +14854,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on the # logic of this function. 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/hooks.pyi pytorch-develop/torch/utils/hooks.pyi +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/hooks.pyi pytorch-develop/torch/utils/hooks.pyi --- pytorch-v1.5.0/torch/utils/hooks.pyi 2021-04-10 18:39:32.000000000 +0800 +++ pytorch-develop/torch/utils/hooks.pyi 1970-01-01 08:00:00.000000000 +0800 @@ -1,11 +0,0 @@ @@ -14825,9 +14869,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __enter__(self): ... - def __exit__(self, type: Any, value: Any, tb: Any) -> None: ... - -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py +diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py --- pytorch-v1.5.0/torch/utils/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/__init__.py 2021-06-25 16:37:36.966270124 +0800 ++++ pytorch-develop/torch/utils/__init__.py 2021-07-05 14:59:27.884347503 +0800 @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals @@ -14836,9 +14880,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Set the module for a given object for nicer printing def set_module(obj, mod): -diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py +diff -Nur '--exclude=.git*' 
'--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py --- pytorch-v1.5.0/torch/_utils.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_utils.py 2021-06-25 16:37:36.898269605 +0800 ++++ pytorch-develop/torch/_utils.py 2021-07-05 14:59:27.820347015 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml index 938496a8ec3b4cc849f5e6a96c48fe1e364c0d49..afdda6988a665d514a0694374e86b4b5c061430f 100644 --- a/src/aten/src/ATen/native/native_functions.yaml +++ b/src/aten/src/ATen/native/native_functions.yaml @@ -606,7 +606,7 @@ dispatch: CPU: bernoulli_scalar_cpu_ CUDA: bernoulli_scalar_cuda_ - supports_named_tensor: True + supports_named_tensor: True npu_dispatch: NPU: bernoulli_npu_ @@ -1040,7 +1040,7 @@ CUDA: _cosh_out_cuda npu_dispatch: NPU: cosh_out_npu - + - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1271,7 +1271,7 @@ - func: det(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method - + - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor use_c10_dispatcher: full variants: function, method @@ -1381,7 +1381,7 @@ CUDA: embedding_renorm_cuda_ npu_dispatch: NPU: embedding_renorm_npu_ - + - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor use_c10_dispatcher: full @@ -1657,7 +1657,7 @@ SparseCPU: floor_divide_sparse SparseCUDA: floor_divide_sparse supports_named_tensor: True - npu_dispatch: + npu_dispatch: NPU: floor_divide_npu - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
@@ -1668,7 +1668,7 @@ SparseCPU: floor_divide_sparse_ SparseCUDA: floor_divide_sparse_ supports_named_tensor: True - npu_dispatch: + npu_dispatch: NPU: floor_divide_npu_ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1678,13 +1678,13 @@ SparseCPU: floor_divide_out_sparse_zerodim SparseCUDA: floor_divide_out_sparse_zerodim supports_named_tensor: True - npu_dispatch: + npu_dispatch: NPU: floor_divide_out_npu - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor variants: function, method supports_named_tensor: True - npu_dispatch: + npu_dispatch: NPU: floor_divide_npu - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) @@ -1793,15 +1793,15 @@ - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor npu_dispatch: NPU: hamming_window_npu - + - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor npu_dispatch: NPU: hamming_window_npu - + - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor npu_dispatch: NPU: hamming_window_npu - + - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1995,7 +1995,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda - npu_dispatch: + npu_dispatch: NPU: kthvalue_out_npu - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -2006,7 +2006,7 @@ - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) supports_named_tensor: True - npu_dispatch: + npu_dispatch: NPU: kthvalue_out_npu - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor @@ -2090,16 +2090,22 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method + npu_dispatch: + NPU: log10_npu - func: log10_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True variants: function, method + npu_dispatch: + NPU: log10_npu_ - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True dispatch: CPU: log10_out CUDA: log10_out + npu_dispatch: + NPU: log10_out_npu - func: log1p(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2730,14 +2736,14 @@ use_c10_dispatcher: full npu_dispatch: NPU: _pdist_forward_npu - + - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full - func: cosine_similarity(Tensor input, Tensor input2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full variants: function - + - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. @@ -2753,7 +2759,7 @@ - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor use_c10_dispatcher: full - + - func: is_pinned(Tensor self) -> bool use_c10_dispatcher: full variants: method @@ -2770,7 +2776,7 @@ - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor use_c10_dispatcher: full variants: function - + - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor @@ -3011,7 +3017,7 @@ CUDA: gelu_cuda npu_dispatch: NPU: gelu_npu - + - func: gelu_backward(Tensor grad, Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn @@ -3075,8 +3081,12 @@ - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor use_c10_dispatcher: full + npu_dispatch: + NPU: celu_npu - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) + npu_dispatch: + NPU: celu_npu_ - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3423,14 +3433,14 @@ npu_dispatch: NPU: prod_out_npu #NPU: prod_out_npu_ext - + - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True npu_dispatch: NPU: prod_npu #NPU: prod_npu_ext - + - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True npu_dispatch: @@ -3799,7 +3809,7 @@ - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function - + # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor @@ -5005,12 +5015,12 @@ variants: method, function npu_dispatch: NPU: bitwise_and_npu - + - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method npu_dispatch: NPU: bitwise_and_npu_ - + - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method npu_dispatch: @@ -5315,14 +5325,14 @@ CPU: legacy::cpu::_th_addbmm_ CUDA: legacy::cuda::_th_addbmm_ npu_dispatch: - NPU: addbmm_npu_ + NPU: addbmm_npu_ - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: legacy::cpu::_th_addbmm_out CUDA: legacy::cuda::_th_addbmm_out npu_dispatch: - NPU: addbmm_out_npu + NPU: addbmm_out_npu - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full @@ -5331,7 +5341,7 @@ CPU: legacy::cpu::_th_addbmm CUDA: legacy::cuda::_th_addbmm npu_dispatch: - NPU: addbmm_npu + NPU: addbmm_npu - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) variants: method @@ -5794,7 +5804,7 @@ - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) npu_dispatch: NPU: gather_out_npu - + - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function npu_dispatch: @@ -6752,8 +6762,6 @@ dispatch: CPU: multilabel_margin_loss_backward_cpu_out CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out - npu_dispatch: - NPU: multilabel_margin_loss_backward_npu_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor use_c10_dispatcher: full @@ -6761,8 +6769,6 @@ dispatch: CPU: multilabel_margin_loss_backward_cpu CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward - npu_dispatch: - NPU: multilabel_margin_loss_backward_npu - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7757,7 +7763,7 @@ CUDA: upsample_bicubic2d_out_cuda npu_dispatch: NPU: upsample_bicubic2d_out_npu - + - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn dispatch: @@ -7765,7 +7771,7 @@ CUDA: upsample_bicubic2d_cuda npu_dispatch: NPU: upsample_bicubic2d_npu - + - func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: @@ -7787,24 +7793,32 @@ dispatch: CPU: upsample_trilinear3d_out_cpu CUDA: upsample_trilinear3d_out_cuda + npu_dispatch: + NPU: upsample_trilinear3d_out_npu - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn dispatch: CPU: upsample_trilinear3d_cpu CUDA: upsample_trilinear3d_cuda + npu_dispatch: + NPU: upsample_trilinear3d_npu - func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: upsample_trilinear3d_backward_out_cpu CUDA: upsample_trilinear3d_backward_out_cuda + npu_dispatch: + NPU: upsample_trilinear3d_backward_out_npu - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda + npu_dispatch: + NPU: upsample_trilinear3d_backward_npu - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7876,6 +7890,8 @@ dispatch: CPU: upsample_nearest3d_out_cpu CUDA: upsample_nearest3d_out_cuda + npu_dispatch: + NPU: upsample_nearest3d_out_npu - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? 
scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -7883,18 +7899,24 @@ CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda QuantizedCPU: quantized_upsample_nearest3d_cpu + npu_dispatch: + NPU: upsample_nearest3d_npu - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: upsample_nearest3d_backward_out_cpu CUDA: upsample_nearest3d_backward_out_cuda + npu_dispatch: + NPU: upsample_nearest3d_backward_out_npu - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu CUDA: upsample_nearest3d_backward_cuda + npu_dispatch: + NPU: upsample_nearest3d_backward_npu - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -8333,7 +8355,7 @@ npu_dispatch_only: NPU: ptiou_npu -- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) +- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) variants: function npu_dispatch_only: NPU: nms_with_mask_npu @@ -8406,7 +8428,7 @@ variants: function, method npu_dispatch_only: NPU: indexing_npu - + - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, *, Tensor(a!) out) -> Tensor(a!) npu_dispatch_only: NPU: indexing_out_npu @@ -8435,7 +8457,7 @@ - func: npu_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar beta1_power, Scalar beta2_power, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, bool? use_locking, bool? 
use_nesterov) -> (Tensor(a!), Tensor(b!), Tensor(c!)) npu_dispatch_only: NPU: apply_adam_npu - + - func: npu_layer_norm_eval(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05) -> Tensor npu_dispatch_only: NPU: layer_norm_eval_npu @@ -8464,7 +8486,7 @@ npu_dispatch_only: NPU: confusion_transpose_backward_npu -- func: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor +- func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor variants: function, method npu_dispatch_only: NPU: bmm_v2_npu @@ -8512,14 +8534,6 @@ npu_dispatch_only: NPU: grid_assign_positive_npu -- func: global_step_inc() -> () - variants: function - use_c10_dispatcher: full - -- func: set_start_fuzz_compile_step(int step) -> () - variants: function - use_c10_dispatcher: full - - func: npu_mish_backward(Tensor grad, Tensor input) -> Tensor npu_dispatch_only: NPU: mish_backward_npu @@ -8527,4 +8541,9 @@ - func: npu_normalize_batch(Tensor self, Tensor seq_len, int normalize_type=0) -> Tensor variants: function, method npu_dispatch_only: - NPU: normalize_batch_npu \ No newline at end of file + NPU: normalize_batch_npu + +- func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor + variants: function, method + npu_dispatch_only: + NPU: masked_fill_range_npu \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp b/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp index 2412d3ca3938353872e4529eae885b4e863af506..85c87f911eea4ff78dabb299ca184b27967a59f7 100644 --- a/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp @@ -32,13 +32,7 @@ Tensor& acos_out_npu(Tensor& result, const Tensor& self) { } Tensor acos_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), 
CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU acos_out_npu(result, self); @@ -46,9 +40,7 @@ Tensor acos_npu(const Tensor& self) { } Tensor& acos_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp index 6c3e7f6abde039ef94e995bb6b3a46159ec7b0ff..921605b4810315bc9af6a73e391c35e7e849d2e6 100644 --- a/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp @@ -12,10 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -45,8 +42,7 @@ Tensor addbmm_npu( // calculate the output size auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU addbmm_out_npu(result, self, batch1, batch2, beta, alpha); return result; @@ -58,9 +54,7 @@ Tensor& addbmm_npu_( const Tensor& batch2, Scalar beta, Scalar alpha) { - SmallVector inputs = {self, batch1, batch2}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, batch1, batch2}, 
{self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = addbmm_out_npu(contiguousSelf, contiguousSelf, batch1, batch2, beta, alpha); diff --git a/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp index dd2a78db2f3793ab41ba52a3a760c4b0e2142126..3f8eec9f4bc1fcb5110b3f900ad4f1fc42ce6b21 100644 --- a/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -46,9 +45,7 @@ Tensor addcdiv_npu( Scalar value) { auto divOutputSize = broadcast_ops_npu_output_size(tensor1, tensor2); auto outputSize = broadcast_ops_npu_output_size(self.sizes(), divOutputSize); - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, outputSize); addcdiv_out_npu(result, self, tensor1, tensor2, value); return result; @@ -59,9 +56,7 @@ Tensor& addcdiv_npu_( const Tensor& tensor1, const Tensor& tensor2, Scalar value) { - SmallVector inputs = {self, tensor1, tensor2}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, tensor1, tensor2}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = addcdiv_out_npu(contiguousSelf, contiguousSelf, tensor1, tensor2, value); diff --git a/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp index 802e3514a89827213635ffa433433b9349b057a9..14aacf1fa83747688a17467e50e5eaf4f022e0f4 100644 --- 
a/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp @@ -50,8 +50,7 @@ Tensor& addcmul_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, outputSize); OpPipeWithDefinedOut pipe; @@ -81,10 +80,7 @@ Tensor& addcmul_npu_( const Tensor& tensor1, const Tensor& tensor2, Scalar value) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = addcmul_out_npu_nocheck( diff --git a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp index f10b829d6d5e143ce65efc3d82510069d90b4b65..3e11e9de5ac5307c7a8fc7ceaec6b0dfebeb132a 100644 --- a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" +#include "ATen/native/npu/utils/OpAdapter.h" #include "ATen/native/npu/utils/NpuUtils.h" namespace at { @@ -64,14 +63,8 @@ Tensor addmv_npu( Scalar alpha) { check_1d(vec, "vec", "addmv"); - // calculate the output size auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); addmv_out_npu(result, self, mat, vec, beta, alpha); return result; @@ -85,9 +78,7 @@ Tensor& addmv_npu_( Scalar alpha) { check_1d(vec, "vec", "addmv"); - SmallVector inputs = {self, mat, vec}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, mat, vec}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = diff --git a/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp index 7b1fc414c09cdbaba2cf5669c62ae41bbe671dd7..864462afffdc8881e3bfbb29d643d07a29c3fe4d 100644 --- a/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" +#include "ATen/native/npu/utils/OpAdapter.h" #include "ATen/native/npu/utils/NpuUtils.h" namespace at { @@ -64,15 +63,8 @@ Tensor _addr_npu( const Tensor& vec2, Scalar beta, Scalar alpha) { - - // calculate the output size auto outputSize = addr_npu_output_size(self, vec1, vec2, beta, alpha); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); _addr_out_npu(result, self, vec1, vec2, beta, alpha); return result; @@ -95,9 +87,7 @@ Tensor& _addr_npu_( const Tensor& vec2, Scalar beta, Scalar alpha) { - SmallVector inputs = {self, vec1, vec2}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, vec1, vec2}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = diff --git a/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp index 5ca9cc98ab5568eb8556a3b1dc12496eeff12b46..54f08f5a109dc6fa318c5269d2dfb6414be2aaec 100644 --- a/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp @@ -35,8 +35,7 @@ Tensor& affine_grid_generator_backward_nocheck( const Tensor& grad, IntArrayRef size, bool align_corners) { - Tensor assist = at::empty_with_format( - {size[0], size[2], size[3], 3}, grad.options(), CalcuOpUtil::get_tensor_npu_format(grad)); + Tensor assist = OpPreparation::ApplyTensor(grad, {size[0], size[2], size[3], 3}); assist.select(-1, 0).copy_(_linspace_from_neg_one(grad, size[3], align_corners)); assist.select(-1, 
1).copy_(_linspace_from_neg_one(grad, size[2], align_corners).unsqueeze_(-1)); assist.select(-1, 2).fill_(1); diff --git a/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp b/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp index 0b7dcc190988d89ebce7b06d2c94a28a2d18b01e..0393104b09e8b5e64af70ee0e1a402516c856206 100644 --- a/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp @@ -17,7 +17,7 @@ #include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -62,9 +62,7 @@ Tensor affine_grid_generator_npu( } // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, theta.options(), CalcuOpUtil::get_tensor_npu_format(theta)); - + Tensor result = OpPreparation::ApplyTensor(theta, outputSize); // calculate the output result of the NPU affine_grid_generator_npu_nocheck( result, diff --git a/src/aten/src/ATen/native/npu/AllKernelNpu.cpp b/src/aten/src/ATen/native/npu/AllKernelNpu.cpp index 2d9629558f8f857c4cbc1cf95c5442a75645cc0f..6723ce3709c2c9c6e80d536a2097d1e20011eb56 100644 --- a/src/aten/src/ATen/native/npu/AllKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AllKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp b/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp index 16a293a53a7790a441d18237900148ebffea0b64..e6dcf1164a9a984daa636435b7b78b9da8137c6f 100644 --- a/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp index df545f701c873af7eb1c59067ec46ea7b5260a90..d6e4d3525ba33276067632519d252d83dbd6703d 100644 --- a/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp index a762c3b70dd88ffe89bedd97b90a1ed4edc11ec8..d62e8b7a5444a38e46b049135e8b26f032fcd68c 100644 --- a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp @@ -32,10 +32,10 @@ Tensor argmin_npu(const Tensor& self, optional dim, bool keepdim) { auto outputSize = reduce_ops_npu_output_size(input, realDim, realKeepDim); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( + Tensor result = OpPreparation::ApplyTensor( outputSize, self.options().dtype(at::kInt), - CalcuOpUtil::get_tensor_npu_format(self)); + self); SmallVector DimVec = {realDim}; // calculate the output result of the NPU OpCommand cmd; diff --git a/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp index c072a639a6b2b1992451fc1d76289b49c929ca5d..5f2478cbfd1203f99a2899f4810e74fa826b9e84 100644 --- a/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp b/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp index 2561ca91eb6bd52eae3ff953ccfe0f4189a5fccf..c580971eb8b7d7b35c99a999ce094b4b1eb4587f 100644 --- a/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,26 +31,13 @@ Tensor& asin_out_npu( } Tensor asin_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); asin_out_npu(result, self); - return result; } Tensor& asin_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = asin_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp b/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp index add7140b0f577ad4e7aa8ffcb8e891ef9e78066c..a683f5660c0a8d9d58d82dd4f84bf3105acc960c 100644 --- a/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -44,8 +43,7 @@ Tensor& atan2_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, outputSize); atan2_out_npu_nocheck(result, self, other); @@ -54,25 +52,14 @@ Tensor& atan2_out_npu( } Tensor atan2_npu(const Tensor& self, const Tensor& other) { - // calculate the output size auto outputSize = broadcast_ops_npu_output_size(self, other); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); atan2_out_npu_nocheck(result, self, other); - return result; } Tensor& atan2_npu_(Tensor& self, const Tensor& other) { - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, other}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp index fdd9e858949a59ea06e554dd100601b681fa8a44..a8e38f109b1a515823350f9b1e2d9e3c9124c466 100644 --- a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp @@ -31,23 +31,14 @@ Tensor& atan_out_npu(Tensor& result, const Tensor& self) { } Tensor atan_npu(const Tensor& self) { - //calculate the output size - auto outputSize = input_same_output_size(self); - - //construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); //calculate 
the output result of the NPU atan_out_npu(result, self); return result; } Tensor& atan_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = atan_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp b/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp index 33da1cf9d5f5d09fad5767d7d6bcd5a97db7492a..6128e55d7355b068547e69d271e9c56e28ae8f79 100644 --- a/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp @@ -28,25 +28,24 @@ std::tuple batch_nms_npu( bool change_coordinate_frame, bool transpose_box) { // construct the output tensor of the NPU - Tensor nmsed_boxes = at::empty_with_format( + Tensor nmsed_boxes = OpPreparation::ApplyTensor( {self.size(0), max_total_size, 4}, self.options().dtype(at::kHalf), - CalcuOpUtil::get_tensor_npu_format(self)); - - Tensor nmsed_scores = at::empty_with_format( + self); + Tensor nmsed_scores = OpPreparation::ApplyTensor( {self.size(0), max_total_size}, self.options().dtype(at::kHalf), - CalcuOpUtil::get_tensor_npu_format(self)); + self); - Tensor nmsed_classes = at::empty_with_format( + Tensor nmsed_classes = OpPreparation::ApplyTensor( {self.size(0), max_total_size}, self.options().dtype(at::kHalf), - CalcuOpUtil::get_tensor_npu_format(self)); + self); - Tensor nmsed_num = at::empty_with_format( + Tensor nmsed_num = OpPreparation::ApplyTensor( {self.size(0)}, self.options().dtype(at::kInt), - CalcuOpUtil::get_tensor_npu_format(self)); + self); OpCommand cmd; cmd.Name("BatchMultiClassNonMaxSuppression") diff --git a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp index 
d1110053a4d5bf8a49420e70a886a8444db5ad4c..a01096cd57982596ce6166413db67531d2fc6258 100644 --- a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -44,10 +43,7 @@ Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, const Tensor& p) { } Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); ScalarType selfType = self.scalar_type(); Tensor selfFp32 = self; if (self.scalar_type() == ScalarType::Half) { @@ -70,10 +66,7 @@ Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) { } Tensor& bernoulli_npu_(Tensor& self, const Tensor& p, Generator* gen) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); ScalarType selfType = self.scalar_type(); Tensor selfFp32 = self; Tensor pFp32 = OpPreparation::CastBackToOriFormat(p);; diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp index 30da73577e8f7e86ba0b8398ac7df06324c2a58c..692f92cea603aeb931f6cc6b19c668d8404f1e89 100644 --- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -62,13 +61,7 @@ Tensor binary_cross_entropy_backward_npu( const Tensor& target, const Tensor& weight, int64_t reduction) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor gradInput = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor gradInput = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU binary_cross_entropy_backward_out_npu( gradInput, grad_output, self, target, weight, reduction); diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp index e43b43d80c12dd7b638c31cad4a22c6d7b94b086..722dc7e2e8c874ec311083ad2b945909c3787f37 100644 --- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -68,10 +67,7 @@ Tensor binary_cross_entropy_npu( } // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU binary_cross_entropy_out_npu(result, self, target, weight, reduction); diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp index 1cd7fc39e695d259d16f5f3736132dd5e5ce6448..beb6f213426f61ff660f1480e5ac29f614313008 100644 --- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -28,12 +27,7 @@ Tensor binary_cross_entropy_with_logits_backward_npu( const Tensor& weight, const Tensor& pos_weight, int64_t reduction) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor gradInput = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor gradInput = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU Tensor weightTensor; diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp index 1dac3030175156b71709b8e03b85145435ec8c52..64f74aa6cdd20f2b00a1dc1161956436e3f8f7e6 100644 --- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp index b9aabebd61eec8fa64079e5a183a3f451f5521a8..13d019ec4f1ca0ea29600aff1ecbf6f1e47436db 100644 --- a/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp index 2ae899ec6971cc6d291c7b2c96c611049dc6eead..e8195e07734cb1345f53eb275fdd4a299d0a6ffc 100644 --- a/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp @@ -38,9 +38,7 @@ Tensor& bitwise_not_out_npu(Tensor& result, const Tensor& self) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) diff --git a/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp index 01770ace14074f8889eb3db78f66e1bf96ce2223..5110d95cf648286c126961255a9d28cd8d491cec 100644 --- a/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -44,9 +44,7 @@ Tensor& bitwise_or_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); bitwise_or_out_npu_nocheck(result, self, other); @@ -120,11 +118,7 @@ Tensor bitwise_or_npu(const Tensor& self, const Tensor& other) { auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - outputTensor.options(), - CalcuOpUtil::get_tensor_npu_format(outputTensor)); - + Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize); // calculate the output result of the NPU bitwise_or_out_npu_nocheck(result, self, other); @@ -132,12 +126,7 @@ Tensor bitwise_or_npu(const Tensor& self, const Tensor& other) { } Tensor bitwise_or_npu(const Tensor& self, Scalar other) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU bitwise_or_out_npu_nocheck(result, self, other); @@ -146,9 +135,7 @@ Tensor bitwise_or_npu(const Tensor& self, Scalar other) { } Tensor& bitwise_or_npu_(Tensor& self, const Tensor& other) { - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, other}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp 
b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp index 915fd2edb31781e46c2b24d3a25981f241ec7fb5..818a660681db5b3a9053bb8a820d9977077342cb 100644 --- a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -44,9 +44,7 @@ Tensor& bitwise_xor_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); bitwise_xor_out_npu_nocheck(result, self, other); @@ -122,11 +120,7 @@ Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) { auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - outputTensor.options(), - CalcuOpUtil::get_tensor_npu_format(outputTensor)); - + Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize); // calculate the output result of the NPU bitwise_xor_out_npu_nocheck(result, self, other); @@ -134,15 +128,7 @@ Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) { } Tensor bitwise_xor_npu(const Tensor& self, Scalar other) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU bitwise_xor_out_npu_nocheck(result, self, other); @@ -150,10 +136,7 @@ Tensor bitwise_xor_npu(const Tensor& self, Scalar other) { } Tensor& bitwise_xor_npu_(Tensor& 
self, const Tensor& other) { - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self, other}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other); diff --git a/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp b/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp index 4001114d1c2b0eb92951271056c086dbc11548ae..77934e914f229e605ae8d8ece1afa43621c867b1 100644 --- a/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp index 72a18106bb8794fbee53dde478a61725891ababf..fefd21a0666f050799a5f6ccf63552b9f7c70644 100644 --- a/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp @@ -14,75 +14,281 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { using namespace at::native::npu; +bool is_transpose_last_two_dims_v2(const Tensor& tensor) { + if (tensor.dim() < 2) { + return false; + } + auto storage_size = tensor.storage().get_npu_desc().storage_sizes_; + int64_t numel = at::prod_intlist(storage_size); + + int64_t dim1 = tensor.dim() - 1; + int64_t dim2 = tensor.dim() - 2; + + auto tensor_desc = tensor.storage().get_npu_desc(); + if (tensor_desc.base_sizes_.size() == tensor.dim() && + tensor.stride(dim2) == 1 && tensor.stride(dim1) == tensor.size(dim2) && + tensor.size(dim1) == tensor_desc.base_sizes_[dim2] && + tensor.size(dim2) == tensor_desc.base_sizes_[dim1] && + tensor.storage().size() == numel) { + return true; + } else { + return false; + } +} + SmallVector bmm_v2_output_size(const Tensor& mat1, const Tensor& mat2) { auto dim_tensor1 = mat1.dim(); auto dim_tensor2 = mat2.dim(); - TORCH_CHECK(dim_tensor1 > 2, "mat1's dim must be greater than 2"); - TORCH_CHECK(dim_tensor2 >= 2, "mat2's dim must be greater than or equal to 2"); - if (dim_tensor2 == 2) { - auto output_size(array_to_small_vector(mat1.sizes().slice(0, dim_tensor1-1))); - output_size.emplace_back(mat2.size(-1)); - return output_size; - } else { - TORCH_CHECK(dim_tensor1 == dim_tensor2, "if mat2's dim > 2, mat1's and mat2's batch size must be same"); - IntArrayRef batch_tensor1(mat1.sizes().data(), std::max(dim_tensor1 - 2, 0)); - SmallVector output_size = array_to_small_vector(batch_tensor1); - output_size.emplace_back(mat1.size(-2)); - output_size.emplace_back(mat2.size(-1)); - return output_size; + + int64_t m = dim_tensor1 == 1 ? 1 : mat1.size(-2); + int64_t n = dim_tensor2 == 1 ? 
1 : mat2.size(-1); + + auto batch_a = array_to_small_vector(IntArrayRef(mat1.sizes().data(), std::max(dim_tensor1 - 2, 0))); + auto batch_b = array_to_small_vector(IntArrayRef(mat2.sizes().data(), std::max(dim_tensor2 - 2, 0))); + + batch_a.insert(batch_a.begin(), std::max(batch_a.size(), batch_b.size()) - batch_a.size(), 1); + batch_b.insert(batch_b.begin(), std::max(batch_a.size(), batch_b.size()) - batch_b.size(), 1); + + SmallVector output_size; + for (size_t i = 0; i < batch_a.size(); ++i) { + if (batch_a[i] == 1) { + output_size.emplace_back(batch_b[i]); + } else if (batch_b[i] == 1) { + output_size.emplace_back(batch_a[i]); + } else if (batch_a[i] != batch_b[i]) { + AT_ERROR("mat1 and mat2 cannot broadcast, but they are mat1 ", + mat1.sizes().data(), " mat2 ", mat2.sizes().data()); + } else { + output_size.emplace_back(batch_a[i]); + } } + output_size.emplace_back(m); + output_size.emplace_back(n); + + return output_size; } +Tensor pure_bmm_v2_npu(const Tensor& self, const Tensor& mat2, const SmallVector& output_size) { + auto tensor1 = self.dim() == 1 ? self.view({1, self.size(0)}) : self; + auto tensor2 = mat2.dim() == 1 ? mat2.view({mat2.size(0), 1}) : mat2; -Tensor bmm_v2_npu(const Tensor& self, const Tensor& mat2) { - auto outputSize = bmm_v2_output_size(self, mat2); - Tensor result; + Tensor result; - if ((self.scalar_type() == ScalarType::Float || self.scalar_type() == ScalarType::Half)) { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ); + if ((tensor1.scalar_type() == ScalarType::Half)) { + result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ); } else { - result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND); + result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND); } - Tensor contiguousSelf = self; - Tensor contiguousMat2 = mat2; - if(! 
CalcuOpUtil::is_transpose_last_two_dims(self)){ - contiguousSelf = NpuUtils::format_contiguous(self); + Tensor contiguous_self = tensor1; + Tensor contiguous_mat2 = tensor2; + bool is_self_t = is_transpose_last_two_dims_v2(tensor1); + bool is_mat2_t = is_transpose_last_two_dims_v2(tensor2); + + if(!is_self_t) { + contiguous_self = NpuUtils::format_contiguous(tensor1); } - if(! CalcuOpUtil::is_transpose_last_two_dims(mat2)){ - contiguousMat2 = NpuUtils::format_contiguous(mat2); + if(!is_mat2_t) { + contiguous_mat2 = NpuUtils::format_contiguous(tensor2); } - auto func1 = [&contiguousSelf]() { + auto func1 = [&contiguous_self]() { bool pass = false; - return std::tie(pass, contiguousSelf); + return std::tie(pass, contiguous_self); }; - auto func2 = [&contiguousMat2]() { + auto func2 = [&contiguous_mat2]() { bool pass = false; - return std::tie(pass, contiguousMat2); + return std::tie(pass, contiguous_mat2); }; - bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self); - bool isMat2T = CalcuOpUtil::is_transpose_last_two_dims(mat2); - // executing the NPU operator OpCommand cmd; cmd.Name("BatchMatMul") .InputWithFunc(func1) .InputWithFunc(func2) .Output(result) - .Attr("adj_x1", isSelfT) - .Attr("adj_x2", isMat2T) + .Attr("adj_x1", is_self_t) + .Attr("adj_x2", is_mat2_t) .Run(); return result; } + +Tensor reshape_tensor_self(const Tensor& self, SmallVector& expect_output_size) { + // self, expect_output: [5,6,7,17], [1,6,7,65] + // self permute + reshape: [5,6,7,17] -> [6,7,5,17] -> [6,7,85] + SmallVector self_permute_idx; + SmallVector self_batch_idx; + + for (int64_t i = 0; i < self.dim(); ++i) { + if (i < self.dim() - 2) { + if (expect_output_size[i] == 1) { + self_batch_idx.emplace_back(i); + continue; + } + } else if (i == self.dim() - 1) { + for (int64_t j = 0; j < self_batch_idx.size(); ++j) { + self_permute_idx.emplace_back(self_batch_idx[j]); + } + } + self_permute_idx.emplace_back(i); + } + Tensor tmp_self = self.permute(self_permute_idx); + + int64_t 
m_idx = 0; + SmallVector tmp_self_size; + SmallVector tmp_self_size_low; + + m_idx = self.dim() - self_batch_idx.size() - 1; + tmp_self_size = array_to_small_vector(tmp_self.sizes()); + tmp_self_size_low.insert(tmp_self_size_low.end(), tmp_self_size.begin(), tmp_self_size.begin() + m_idx); + tmp_self_size_low.emplace_back(-1); + tmp_self = tmp_self.reshape(tmp_self_size_low); + return tmp_self; +} + +Tensor reshape_tensor_mat2(const Tensor& mat2, SmallVector& expect_output_size) { + // mat2, expect_output_size: [5,6,17,65], [1,6,7,65] + // mat2 permute + reshape: [5,6,17,65] -> [6,5,17,65] -> [6,85,65] + SmallVector mat2_permute_idx; + SmallVector mat2_batch_idx; + + for (int64_t i = 0; i < mat2.dim(); ++i) { + if (i < mat2.dim() - 2) { + if (expect_output_size[i] == 1) { + mat2_batch_idx.emplace_back(i); + continue; + } + } else if (i == mat2.dim() - 2) { + for (int64_t j = 0; j < mat2_batch_idx.size(); ++j) { + mat2_permute_idx.emplace_back(mat2_batch_idx[j]); + } + } + mat2_permute_idx.emplace_back(i); + } + Tensor tmp_mat2 = mat2.permute(mat2_permute_idx); + + int64_t k_idx = 0; + SmallVector tmp_mat2_size; + SmallVector tmp_mat2_size_low; + + k_idx = mat2.dim() - mat2_batch_idx.size() - 2; + tmp_mat2_size = array_to_small_vector(tmp_mat2.sizes()); + tmp_mat2_size_low.insert(tmp_mat2_size_low.end(), tmp_mat2_size.begin(), tmp_mat2_size.begin() + k_idx); + tmp_mat2_size_low.insert(tmp_mat2_size_low.end(), {-1, mat2.size(-1)}); + tmp_mat2 = tmp_mat2.reshape(tmp_mat2_size_low); + return tmp_mat2; +} + +SmallVector align_small_vector(SmallVector svec, + SmallVector golden_svec) { + // svec, golden: [6,7,65], [5,6,7,65] + // expect: [6,7,65] -> [1,6,7,65] + SmallVector tmp_svec; + tmp_svec = svec; + int64_t size_to_fill = golden_svec.size() - svec.size(); + if (size_to_fill > 0) { + tmp_svec.insert(tmp_svec.begin(), size_to_fill, 1); + } + return tmp_svec; +} + +void expand_tensor(Tensor& self, Tensor& mat2, SmallVector& expand_output_size) { + self = self.dim() == 
1 ? self.view({1, self.size(0)}) : self; + mat2 = mat2.dim() == 1 ? mat2.view({mat2.size(0), 1}) : mat2; + int64_t m = self.size(-2); + int64_t k1 = self.size(-1); + int64_t k2 = mat2.size(-2); + int64_t n = mat2.size(-1); + + std::vector expand_batch_portion(expand_output_size.begin(), expand_output_size.end() - 2); + std::vector self_expand_size(expand_batch_portion); + std::vector mat2_expand_size(expand_batch_portion); + + self_expand_size.insert(self_expand_size.end(), {m, k1}); + mat2_expand_size.insert(mat2_expand_size.end(), {k2, n}); + + int64_t expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(), + 1L, std::multiplies()); + + std::vector self_bmm_view({expand_batch_product}); + std::vector mat2_bmm_view({expand_batch_product}); + self_bmm_view.insert(self_bmm_view.end(), {m, k1}); + mat2_bmm_view.insert(mat2_bmm_view.end(), {k2, n}); + + self = self.expand(self_expand_size).reshape(self_bmm_view); + mat2 = mat2.expand(mat2_expand_size).reshape(mat2_bmm_view); +} + +Tensor bmm_v2_npu(const Tensor& self, const Tensor& mat2, IntArrayRef output_sizes) { + auto expect_output_size = array_to_small_vector(output_sizes); + auto infer_output_size = bmm_v2_output_size(self, mat2); + Tensor tmp_self = self; + Tensor tmp_mat2 = mat2; + + // forward propagation + if (expect_output_size.empty()) { + // avoid some accuracy error caused by transdata + expand_tensor(tmp_self, tmp_mat2, infer_output_size); + expect_output_size = infer_output_size; + infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2); + + auto res = pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size); + infer_output_size = expect_output_size; + + if (self.dim() == 1) { + // [k][b, k, n] -> [b, 1, n] -> [b, n] + infer_output_size.erase(infer_output_size.end() - 2); + return res.view(infer_output_size); + } else if (mat2.dim() == 1) { + // [b, m, k][k] -> [b, m, 1] -> [b, m] + infer_output_size.erase(infer_output_size.end() - 1); 
+ return res.view(infer_output_size); + } + return res; + } + + // backward propagation + SmallVector tmp_expect_output_size = expect_output_size; + SmallVector axis_reduce; + SmallVector tmp_self_size; + SmallVector tmp_mat2_size; + + tmp_expect_output_size = align_small_vector(expect_output_size, infer_output_size); + for (int i = 0; i < tmp_expect_output_size.size(); ++i) { + if (tmp_expect_output_size[i] != infer_output_size[i]) { + axis_reduce.emplace_back(i); + } + } + + // no reduce_sum + if (axis_reduce.empty()) { + // avoid some accuracy error caused by transdata + expand_tensor(tmp_self, tmp_mat2, infer_output_size); + infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2); + return pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size); + } + + // reduce sum without accuracy error + tmp_self_size = align_small_vector(array_to_small_vector(self.sizes()), infer_output_size); + tmp_mat2_size = align_small_vector(array_to_small_vector(mat2.sizes()), infer_output_size); + tmp_self = self.reshape(tmp_self_size); + tmp_mat2 = mat2.reshape(tmp_mat2_size); + tmp_self = reshape_tensor_self(tmp_self, tmp_expect_output_size); + tmp_mat2 = reshape_tensor_mat2(tmp_mat2, tmp_expect_output_size); + infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2); + // avoid some accuracy error caused by transdata + expand_tensor(tmp_self, tmp_mat2, infer_output_size); + infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2); + return pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size); +} + } // namespace native } // namespace at diff --git a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp index d75a2c65f49d8962a8793d823504dbb1e3cee104..fa2a86b0871c7d06d9199d4663ff3986736141b3 100644 --- a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp @@ -14,9 +14,8 @@ // See the 
License for the specific language governing permissions and // limitations under the License. -#include -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -59,9 +58,7 @@ Tensor bounding_box_decode_npu( double wh_ratio_clip) { SmallVector outputSize = {rois.size(0), 4}; // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, rois.options(), CalcuOpUtil::get_tensor_npu_format(rois)); - + Tensor result = OpPreparation::ApplyTensor(rois, outputSize); SmallVector means = { static_cast(means0), static_cast(means1), diff --git a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp index aa497ce0582c4d05bdf59b90ae6ec0d6d9df3d2b..3e02aad811f780301ab953efbb7f330a3d6ecfae 100644 --- a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp @@ -51,11 +51,7 @@ Tensor bounding_box_encode_npu( double stds2, double stds3) { // construct the output tensor of the NPU - Tensor delats = at::empty_with_format( - {anchor_box.size(0), 4}, - anchor_box.options(), - CalcuOpUtil::get_tensor_npu_format(anchor_box)); - + Tensor delats = OpPreparation::ApplyTensor(anchor_box, {anchor_box.size(0), 4}); SmallVector means = { static_cast(means0), static_cast(means1), diff --git a/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp b/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp index 60e9435f100b3703c94ea74c57b70650d3fc6b10..4d280d91394a1df67fc94cb56c033a8a1b34bc78 100644 --- a/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/CastKernelNpu.cpp b/src/aten/src/ATen/native/npu/CastKernelNpu.cpp index 83eef18c41f46a8ab5c4312f3de55dc82fbe888a..09606c64483cf80891e5ba915fe7c414928d02a2 100644 --- a/src/aten/src/ATen/native/npu/CastKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CastKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp index 109f0d71d1d0cf49f6ad8d8374eda1d28dbe813f..8c3ac876475bf2928430246cb85f1ab59a121e4e 100644 --- a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp @@ -16,6 +16,7 @@ #include "c10/npu/OptionsManager.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp index ba36c3baf8410813af92657fd6a18a9e817eebcf..d67326ac651275e4d3514d4ff77fb5033af1e446 100644 --- a/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp @@ -12,10 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -81,12 +78,7 @@ Tensor _cdist_backward_npu( //Executing the NPU operator. 
auto outputSize = input_same_output_size(x1); - - Tensor result = at::empty_with_format( - outputSize, - tensor1_broadcast.options(), - CalcuOpUtil::get_tensor_npu_format(tensor1_broadcast)); - + Tensor result = OpPreparation::ApplyTensor(tensor1_broadcast, outputSize); OpCommand cmd; cmd.Name("CdistGrad") .Input(grad_broadcast) diff --git a/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp b/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp index 302d49e6426178896dff91c2f82e647d08aa02f4..660ac93b12581a3dd4764b6b35f01d8ccd23354c 100644 --- a/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp @@ -33,24 +33,13 @@ Tensor& ceil_out_npu(Tensor& result, const Tensor& self) { } Tensor ceil_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); ceil_out_npu(result, self); - return result; } Tensor& ceil_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = ceil_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/TraceKernelNpu.cpp b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp similarity index 36% rename from src/aten/src/ATen/native/npu/TraceKernelNpu.cpp rename to src/aten/src/ATen/native/npu/CeluKernelNpu.cpp index d086e066aa8c511a5daeed1cd9020c94f100b7bc..fc4602ea8554f5cd851623c72283df02dddf158d 100644 --- a/src/aten/src/ATen/native/npu/TraceKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp @@ -1,4 +1,6 @@ -// Copyright (c) 2020, 
Huawei Technologies.All rights reserved. +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); // you may not use this file except in compliance with the License. @@ -12,39 +14,46 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; -SmallVector trace_npu_input(const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} -SmallVector trace_npu_output(const SmallVector& outputTensor) { - return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); +Tensor celu_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar alpha) { + float alpha3 = 1.0; + OpCommand cmd; + cmd.Name("Celu") + .Input(self) + .Output(result) + .Attr("alpha1", alpha) + .Attr("alpha2", alpha) + .Attr("alpha3", alpha3) + .Run(); + return result; } -SmallVector trace_npu_attr() { - SmallVector attrs = {}; - return attrs; +Tensor celu_out_npu(Tensor& result, const Tensor& self, Scalar alpha) { + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self, &alpha](Tensor& result){celu_out_npu_nocheck(result, self, alpha);}) + .Call(result); } -Tensor& trace_out_npu(Tensor& result, const Tensor& self) { - auto inputs = trace_npu_input({self}); - auto outputs = trace_npu_output({result}); - auto attrs = trace_npu_attr(); - CalcuOpUtil::execute_npu_operate("Trace", inputs, outputs, attrs); +Tensor celu_npu(const Tensor& self, Scalar alpha) { + // construct the output tensor of the NPU + Tensor result = OpPreparation::ApplyTensor(self); + + // calculate the output result of the NPU + 
celu_out_npu(result, self, alpha); + return result; } -Tensor trace_npu(const Tensor& self) { - auto outputSize = trace_npu_output_size(self); - Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - trace_out_npu(result, self); - return result.reshape({}); -} +Tensor& celu_npu_(Tensor& self, Scalar alpha) { + celu_out_npu(self, self, alpha); + return self; } -} \ No newline at end of file + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp b/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp index 0bda2d33b4c005d8823db078b3c5c88e02391ee9..4db76efd8096ea033de19d68b123604ff85a83d1 100644 --- a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp @@ -14,7 +14,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/utils/OpTemplate.h" diff --git a/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp b/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp index d38894507aedf66fe45da041aa1955cc20dfc3c4..f34b4acab6e077c1a33e05520c3d9ec0984dd494 100644 --- a/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp @@ -16,8 +16,6 @@ #include #include -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" #include "ATen/native/npu/utils/OpAdapter.h" namespace at { @@ -40,14 +38,11 @@ Tensor& clamp_min_out_npu_nocheck( max = NPU_HALF_MAX; } - Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min, self.scalar_type()); - Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max, self.scalar_type()); - OpCommand cmd; cmd.Name("ClipByValue") .Input(self) - .Input(minTensor) - .Input(maxTensor) + .Input(min, 
self.scalar_type()) + .Input(max, self.scalar_type()) .Output(result) .Run(); return result; @@ -60,9 +55,7 @@ Tensor& clamp_min_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) @@ -84,15 +77,12 @@ Tensor& clamp_max_out_npu(Tensor& result, const Tensor& self, Scalar max) { } else { min = NPU_HALF_MIN; } - - Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min, self.scalar_type()); - Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max, self.scalar_type()); - + OpCommand cmd; cmd.Name("ClipByValue") .Input(self) - .Input(minTensor) - .Input(maxTensor) + .Input(min, self.scalar_type()) + .Input(max, self.scalar_type()) .Output(result) .Run(); return result; @@ -111,15 +101,12 @@ Tensor& clamp_out_npu_nocheck( Scalar minScalar = min.value(); clamp_min_out_npu(result, self, minScalar); - } else { - Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min.value(), self.scalar_type()); - Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max.value(), self.scalar_type()); - + } else { OpCommand cmd; cmd.Name("ClipByValue") .Input(self) - .Input(minTensor) - .Input(maxTensor) + .Input(min.value(), self.scalar_type()) + .Input(max.value(), self.scalar_type()) .Output(result) .Run(); } @@ -135,9 +122,7 @@ Tensor& clamp_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) @@ -148,16 +133,8 @@ Tensor& clamp_out_npu( } Tensor clamp_min_npu(const Tensor& self, Scalar min) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + 
Tensor result = OpPreparation::ApplyTensor(self); clamp_min_out_npu_nocheck(result, self, min); - return result; } @@ -168,24 +145,14 @@ Tensor& clamp_min_npu_(Tensor& self, Scalar min) { } Tensor clamp_max_npu(const Tensor& self, Scalar max) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); clamp_max_out_npu(result, self, max); return result; } Tensor& clamp_max_npu_(Tensor& self, Scalar max) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = clamp_max_out_npu(contiguousSelf, contiguousSelf, max); @@ -201,22 +168,13 @@ Tensor clamp_npu( const Tensor& self, optional min, optional max) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); clamp_out_npu_nocheck(result, self, min, max); - return result; } Tensor& clamp_npu_(Tensor& self, optional min, optional max) { clamp_out_npu(self, self, min, max); - return self; } diff --git a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp index 205b0c5343ed25505ceb265b3393761ca58d8758..12b36826dedf724a54e7c63b83900ad218cf1aba 100644 --- a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp +++ 
b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -36,8 +35,7 @@ Tensor confusion_transpose_npu( } // construct the output tensor of the NPU - Tensor result = at::empty_with_format(output_size, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, output_size); OpCommand cmd; cmd.Name("ConfusionTransposeD") .Input(self) diff --git a/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp index bbef7c1b5cdb4116d54ada0e9b212c2243cbf503..0f0ea1a70715c96d0d5f02cacac78a956b24d24a 100644 --- a/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp @@ -12,72 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; -SmallVector conv_tbc_npu_input( - const SmallVector& inputTensor) { - SmallVector inputTensors; - for (int i = 0; i < inputTensor.size(); i++) { - if (inputTensor[i].defined()) { - inputTensors.emplace_back(inputTensor[i]); - } - } - - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensors); -} - -SmallVector conv_tbc_npu_output( - const SmallVector& outputTensor) { - return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); -} - -SmallVector conv_tbc_npu_attr(int64_t pad) { - SmallVector paddings = {0, 0, pad, pad}; - SmallVector stridesSize = {1, 1, 1, 1}; - SmallVector dilations = {1, 1, 1, 1}; - - string dataFormat = "NCHW"; - - NPUAttrDesc npuAttrPads = NPUAttrDesc("pads", paddings); - NPUAttrDesc npuAttrStrides = NPUAttrDesc("strides", stridesSize); - NPUAttrDesc npuAttrDilations = NPUAttrDesc("dilations", dilations); - NPUAttrDesc npuAttrDataFormat = NPUAttrDesc("data_format", dataFormat); - - SmallVector attrs = { - npuAttrPads, npuAttrStrides, npuAttrDilations, npuAttrDataFormat}; - - return attrs; -} - -Tensor& conv_tbc_out_npu( - Tensor& result, - const Tensor& self, - const Tensor& weight, - const Tensor& bias, - int64_t pad) { - // constructs the input and output NPUTensorDesc - - auto inputs = conv_tbc_npu_input( - {self.transpose(0, 2).transpose(0, 1).unsqueeze(2), - weight.transpose(0, 2).unsqueeze(2), - bias}); - - auto outputs = conv_tbc_npu_output({result}); - - // constructs the attr of the NPUAttrDesc - auto attrs = conv_tbc_npu_attr(pad); - // executing the NPU operator - CalcuOpUtil::execute_npu_operate("Conv2D", inputs, outputs, attrs); - - return result; -} - Tensor conv_tbc_npu( const Tensor& self, const Tensor& weight, @@ -101,14 +41,32 @@ Tensor conv_tbc_npu( "the 
weight tensor (output channels)."); // calculate the output size - auto outputSize = conv_tbc_npu_output_size(self, weight, bias, pad); + int64_t Co = weight.size(2); + int64_t Wo = (self.size(0) + 2 * pad - (weight.size(0) - 1) - 1) + 1; + + SmallVector outputSize = {self.size(1), Co, 1, Wo}; // construct the output tensor of the NPU - Tensor result = - at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NCHW); + Tensor result = OpPreparation::ApplyTensorWithFormat(self, outputSize, ACL_FORMAT_NCHW); + + SmallVector paddings = {0, 0, pad, pad}; + SmallVector stridesSize = {1, 1, 1, 1}; + SmallVector dilations = {1, 1, 1, 1}; - // calculate the output result of the NPU - conv_tbc_out_npu(result, self, weight, bias, pad); + Tensor self_tensor = self.transpose(0, 2).transpose(0, 1).unsqueeze(2); + Tensor weight_tensor = weight.transpose(0, 2).unsqueeze(2); + + OpCommand cmd; + cmd.Name("Conv2D") + .Input(self_tensor) + .Input(weight_tensor) + .Input(bias) + .Output(result) + .Attr("pads", paddings) + .Attr("strides", stridesSize) + .Attr("dilations", dilations) + .Attr("data_format", (string)"NCHW") + .Run(); result = result.squeeze(2).transpose(0, 2).transpose(1, 2); return result; diff --git a/src/aten/src/ATen/native/npu/CosKernelNpu.cpp b/src/aten/src/ATen/native/npu/CosKernelNpu.cpp index 835289881dbff440b70b78fa40811db5f03a9554..6874bb77e10b7a69d2578fa05864031f0ddbae74 100644 --- a/src/aten/src/ATen/native/npu/CosKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CosKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,26 +31,13 @@ Tensor& cos_out_npu( } Tensor cos_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); cos_out_npu(result, self); - return result; } Tensor& cos_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = cos_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp b/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp index 6167f72f268b13e34c0c8f973437a8d8a04ae260..1830cc2cae826aa042b5d2ea35a0def2ff4ca769 100644 --- a/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,9 +31,7 @@ Tensor& cosh_out_npu(Tensor& result, const Tensor& self) { } Tensor& cosh_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); @@ -48,13 +45,7 @@ Tensor& cosh_npu_(Tensor& self) { } Tensor cosh_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU cosh_out_npu(result, self); return result; diff --git a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp index c9d99226ff804feebe43763bf4ccc9b3c8751087..64ddb33c2eef56fe67d91c0e66759d34b34349c0 100644 --- a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp index 77a3b575f9f7fc56809a7ac8a90f1b041ca0b4c0..46be0dcb91449658ce223fc9cb6d268db17db7f1 100644 --- a/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -50,30 +49,29 @@ Tensor ctc_loss_backward_npu( if (logAlpha.scalar_type() == ScalarType::Half) { logAlphaNeed = logAlphaNeed.to(ScalarType::Float); } + + Tensor targetsCast = targets; + if(targets.scalar_type() == ScalarType::Long){ + targetsCast = targetsCast.to(ScalarType::Int); + } + + auto inputLengthsTensor = at::tensor(inputLengths, targetsCast.options().dtype(at::kInt)); + auto targetLengthsTensor = at::tensor(targetLengths, targetsCast.options().dtype(at::kInt)); - // IntArrayRef to Tensor - auto inputLengthsTensor = at::tensor(inputLengths, targets.options().dtype(at::kLong)); - auto targetLengthsTensor = at::tensor(targetLengths, targets.options().dtype(at::kLong)); - - // calculate the output size - auto outputSize = input_same_output_size(logProbs); + auto outputSize = {logProbs.size(1), logProbs.size(0), logProbs.size(2)}; // construct the output tensor of the NPU - Tensor grad = at::empty_with_format( - outputSize, - logProbsNeed.options(), - CalcuOpUtil::get_tensor_npu_format(logProbsNeed)); - + Tensor grad = OpPreparation::ApplyTensor(logProbsNeed, outputSize); // 
calculate the output result of the NPU OpCommand cmd; cmd.Name("CTCLossV2Grad") .Input(gradOutNeed) .Input(logProbsNeed) - .Input(targets) - .Input(negLogLikelihoodNeed) - .Input(logAlphaNeed) + .Input(targetsCast) .Input(inputLengthsTensor) - .Input(targetLengthsTensor) + .Input(targetLengthsTensor) + .Input(negLogLikelihoodNeed) + .Input(logAlphaNeed) .Output(grad) .Attr("blank", blank) .Attr("zero_infinity", zeroInfinity) @@ -82,8 +80,9 @@ Tensor ctc_loss_backward_npu( if (gradOut.scalar_type() == ScalarType::Half) { grad = grad.to(ScalarType::Half); } - - return grad; + + //return grad; + return grad.permute({1,0,2}); } } // namespace native } // namespace at diff --git a/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp index e048cfa2388c212aed0bcb09e22a5368e56f1aef..860a00a71cb380a1872c496f38d704afd6ffb44b 100644 --- a/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -33,12 +33,18 @@ std::tuple ctc_loss_npu( logProbsNeed = logProbsNeed.to(ScalarType::Float); } + //Aicore supports only the int type + Tensor targetsCast = targets; + if(targets.scalar_type() == ScalarType::Long){ + targetsCast = targetsCast.to(ScalarType::Int); + } + // IntArrayRef to Tensor - auto inputLengthsTensor = at::tensor(inputLengths, targets.options().dtype(at::kLong)); - auto targetLengthsTensor = at::tensor(targetLengths, targets.options().dtype(at::kLong)); + auto inputLengthsTensor = at::tensor(inputLengths, targetsCast.options()); + auto targetLengthsTensor = at::tensor(targetLengths, targetsCast.options()); // calculate the output size - auto outputSizes = ctc_loss_npu_output_size(logProbs, targets, targetLengths); + auto outputSizes = ctc_loss_npu_output_size(logProbs, targetsCast, targetLengths); // construct the output tensor of the NPU Tensor negLogLikelihood = at::empty_with_format( @@ -55,7 +61,7 @@ std::tuple ctc_loss_npu( OpCommand cmd; cmd.Name("CTCLossV2") .Input(logProbsNeed) - .Input(targets) + .Input(targetsCast) .Input(inputLengthsTensor) .Input(targetLengthsTensor) .Output(negLogLikelihood) diff --git a/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp b/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp index e7977bd3fbe0ce1a8253d24682ae62a2d78b087f..adf6401b083312dc386baa95bd611f83882f95f5 100644 --- a/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp b/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp index a50c9487e52bdeccf3837e8a80a3731fa1ed3138..62118b04432af314622917fa5f785e503a02649d 100644 --- a/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp @@ -73,8 +73,7 @@ Tensor& diag_out_npu(Tensor& result, const Tensor& self, int64_t diagonal) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, outputSize); OpPipeWithDefinedOut pipe; diff --git a/src/aten/src/ATen/native/npu/DivKernelNpu.cpp b/src/aten/src/ATen/native/npu/DivKernelNpu.cpp index 1c468502484b15042fba417cdb2154d1c8ba303a..d187d19736f19e19b2463e3ac5dd49bafce085ec 100644 --- a/src/aten/src/ATen/native/npu/DivKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DivKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/DotKernelNpu.cpp b/src/aten/src/ATen/native/npu/DotKernelNpu.cpp index 73fb05c97bfb3efab3d27676ffd2ab1f968ef44b..9f29b4b6b6edb8948b4df5ddeee473065fc0a3dc 100644 --- a/src/aten/src/ATen/native/npu/DotKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DotKernelNpu.cpp @@ -12,10 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -34,9 +31,8 @@ Tensor& dot_out_npu(Tensor& result, const Tensor& self, const Tensor& tensor) { return result; } Tensor dot_npu(const Tensor& self, const Tensor& tensor) { - // calculate the output size SmallVector outputSize = dot_npu_output_size(self, tensor); - Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self, outputSize); dot_out_npu(result, self, tensor); return result; } diff --git a/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp index 26ececdb639c920f71b82e8c59ee4d69ebb96a44..d7be9f5e3563b818b0a24a9595fc91bcc0cac33c 100644 --- a/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp @@ -13,9 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "ATen/native/npu/utils/OpAdapter.h" #include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" namespace at { namespace native { @@ -31,12 +30,8 @@ Tensor dropout_backward_npu( TORCH_CHECK( mask.scalar_type() == at::ScalarType::Byte, "mask should be torch.uint8 dtype"); - auto outputSize = input_same_output_size(grad_output); double retain = 1. 
- scale; - Tensor result = at::empty_with_format( - outputSize, - grad_output.options(), - CalcuOpUtil::get_tensor_npu_format(grad_output)); + Tensor result = OpPreparation::ApplyTensor(grad_output); Tensor prob = CalcuOpUtil::CopyScalarToDevice(retain, grad_output.scalar_type()); diff --git a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp index b82e7728d7eb67e2c91f5c08179f6e378e665cd1..34c41be07f188d2ae1ed3400ebb97b6e1aa6d926 100644 --- a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp @@ -13,8 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -53,9 +52,7 @@ Tensor dropout_v2_backward_npu(const Tensor& grad_output, const Tensor& mask, do if (maskCopy.scalar_type() == ScalarType::Byte){ maskCopy = maskCopy.to(ScalarType::Half); } - auto outputSize = input_same_output_size(grad_output); - auto result = at::empty_with_format( - outputSize, grad_output.options(), CalcuOpUtil::get_tensor_npu_format(grad_output)); + auto result = OpPreparation::ApplyTensor(grad_output); dropout_v2_backward_out_npu(result, grad_output, maskCopy, p); return result; diff --git a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp index 18b7883787277ce8e2ef968be386f41e5f0d44b0..e4a435e77a4c8f9d479ae5a57b6ff43f2fa01eea 100644 --- a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -46,11 +45,8 @@ tuple dropout_v2_npu(const Tensor& self, Tensor& seed, Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self); Tensor formatCastOfSeed = OpPreparation::CastBackToOriFormat(seed); - Tensor result = at::empty_with_format( - formatCastOfSelf.sizes(), formatCastOfSelf.options(), CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf)); - Tensor mask = at::empty_with_format( - formatCastOfSelf.sizes(), formatCastOfSeed.options(), CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf)); - + Tensor result = OpPreparation::ApplyTensor(formatCastOfSelf); + Tensor mask = OpPreparation::ApplyTensor(formatCastOfSelf, formatCastOfSeed.options()); dropout_v2_out_npu(result, mask, formatCastOfSeed, formatCastOfSelf, formatCastOfSeed, p); NpuUtils::format_fresh_view(seed, formatCastOfSeed); return std::tuple(result, mask, seed); diff --git a/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp b/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp index 3b0e3886b990e627ab2c5a8935f44d731809982f..b25e4f43b8e6d763dd55c76126e6b6ff86317f36 100644 --- a/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp index 89c567438c2b9cc94816a6aa61b28c229e05edee..becfb6f46da8c924b65a3c8c334ec950d1b37c58 100644 --- a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp b/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp index ac1bbf17e73a7ed8730f525e47ff144fa89c6867..a773eee31289a6b306027804c9d3be8a8b432920 100644 --- a/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp @@ -34,9 +34,7 @@ Tensor& exp_out_npu(Tensor& result, const Tensor& self) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) @@ -51,11 +49,7 @@ Tensor& exp_npu_(Tensor& self) { } Tensor exp_npu(const Tensor& self) { - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); exp_out_npu_nocheck(result, self); return result; } diff --git a/src/aten/src/ATen/native/npu/FillKernelNpu.cpp b/src/aten/src/ATen/native/npu/FillKernelNpu.cpp index f22bd1028c6071080e67fbefd618608026704a7d..0393a62c8447154335e80f3f9605a74f820882b3 100644 --- a/src/aten/src/ATen/native/npu/FillKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/FillKernelNpu.cpp @@ -44,12 +44,11 @@ Tensor& fill_out_npu(Tensor& result, Tensor& self, const Tensor& other) { Tensor& fills_out_npu(Tensor& result, Tensor& self, Scalar value) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "fills_out_npu", [&]() { auto value_converted = value.to();}); - float scalar = CalcuOpUtil::get_scalar_float_value(value); OpCommand cmd; cmd.Name("Fills") .Input(self) .Output(result) - .Attr("value", scalar) + .Attr("value", value) 
.Run(); return result; diff --git a/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp b/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp index 38635a9480ce3bdaf3539b237a32f32714e7c904..ab6b6a2c4cb21571603977d5651ea2d42355e4a3 100644 --- a/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "c10/npu/OptionsManager.h" #include "ATen/native/npu/utils/OpAdapter.h" @@ -21,12 +20,7 @@ namespace native { using namespace at::native::npu; Tensor flip_npu(const Tensor& self, IntArrayRef dims){ - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); SmallVector dimVec = array_to_small_vector(dims); if (!c10::npu::OptionsManager::CheckDynamicEnable()) { OpCommand cmd; diff --git a/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp b/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp index c0cfee496dfb5b1fb3fb4b56bfd7eb95e7ed3bf2..868f3c08f7ca4b99e40f6bfe510e263dce3c55fe 100644 --- a/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp b/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp index 07fc34e6975b5d9a81e0ed2799e4ca693f994b28..bfc5ac5f3cf1b0bdcb0d4fcf372f7e0780812411 100644 --- a/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp @@ -34,9 +34,7 @@ Tensor& floor_out_npu(Tensor& result, const Tensor& self) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) @@ -51,11 +49,7 @@ Tensor& floor_npu_(Tensor& self) { } Tensor floor_npu(const Tensor& self) { - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); floor_out_npu_nocheck(result, self); return result; } diff --git a/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp b/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp index 1ee4b82bd8c2e11aa06c0cff262c0831413765cf..ed29f29842bae495592daf5b596ccd2dee4d7138 100644 --- a/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -49,9 +48,8 @@ Tensor& fmod_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { auto outputSize = broadcast_ops_npu_output_size(self, other); OpPreparation::CheckOut( {self, other}, - result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + result, + self, outputSize); fmod_out_npu_nocheck(result, self, other); @@ -65,10 +63,7 @@ Tensor& fmod_out_npu(Tensor& result, const Tensor& self, Scalar other) { } Tensor& fmod_npu_(Tensor& self, Scalar other) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = fmod_out_npu_nocheck(contiguousSelf, contiguousSelf, other); @@ -81,10 +76,7 @@ Tensor& fmod_npu_(Tensor& self, Scalar other) { } Tensor& fmod_npu_(Tensor& self, const Tensor& other) { - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self, other}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = fmod_out_npu_nocheck(contiguousSelf, contiguousSelf, other); @@ -97,26 +89,14 @@ Tensor& fmod_npu_(Tensor& self, const Tensor& other) { } Tensor fmod_npu(const Tensor& self, Scalar other) { - // calculate the output size - auto outputSize = input_same_output_size(self); - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); fmod_out_npu_nocheck(result, self, 
other); return result; } Tensor fmod_npu(const Tensor& self, const Tensor& other) { - // calculate the output size auto outputSize = broadcast_ops_npu_output_size(self, other); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); fmod_out_npu_nocheck(result, self, other); return result; } diff --git a/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp b/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp index 3d6d2dc560ec858f14cfddeb079a327ccbe0a445..6034cb27a981c5d6b592fe3ac179cb61b57f5807 100644 --- a/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp index c0f5189e30ef381ec3401d17eb1361766ccc229e..6169b9de059eb470ed71172330301d88ea77d319 100644 --- a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp index 77ad4b6e29a9f1b39c0c8030ae99dd03cf83906f..b736e79d6cd0bad59fe3f1d475972249ebc40630 100644 --- a/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp @@ -11,18 +11,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" + + +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; Tensor gelu_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - // construct the output tensor of the NPU - Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU OpCommand cmd; cmd.Name("Gelu") diff --git a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp index 7403be29cd0f5eb8c3b7a76dc88561ad050f4223..8651b3e70493d9ddc86a869f2deb45154dde1f05 100644 --- a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp @@ -54,8 +54,7 @@ Tensor& ger_out_npu(Tensor& result, const Tensor& self , const Tensor& vec2) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, outputSize); OpPipeWithDefinedOut pipe; diff --git a/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp index 80f7c17f6359ce1ce728da6a30e23d055e4d38a1..4d71349db4e0cb69646d14b2abaef55275ad1b1b 100644 --- a/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp @@ -15,7 +15,6 @@ // limitations under the License. 
#include -#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/utils/OpAdapter.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp index 10e53e0a3c57d8aa3e0acbb7099c64a954019261..75461018566a1c01580f3acc6804fe65e8921ce8 100644 --- a/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp @@ -15,7 +15,6 @@ // limitations under the License. #include -#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/utils/OpAdapter.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp index aa9c7394bb5540ea513008dd866954b03c6f90db..b9d0bfe9247169441b616fd38a1cb451cf6aa41b 100644 --- a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp index 465928a0dd3bb4268310749e7abd19eb5ecce275..070af9fe52fd674968172abb510b8fd5676b7932 100644 --- a/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp @@ -39,10 +39,7 @@ Tensor hardsigmoid_npu(const Tensor& self) { } Tensor& hardsigmoid_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = hardsigmoid_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp 
b/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp index 20cd8fdb1bf9389350b410f12307d8b451f4bd7a..786e53c2dc5edc29e6b6c09bce3cad0852b62dc2 100644 --- a/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp @@ -27,15 +27,13 @@ Tensor& hardtanh_backward_out_npu( const Tensor& self, Scalar min_val, Scalar max_val) { - float max_value = CalcuOpUtil::get_scalar_float_value(max_val); - float min_value = CalcuOpUtil::get_scalar_float_value(min_val); OpCommand cmd; cmd.Name("HardtanhGrad") .Input(self) .Input(grad_output) .Output(grad_input) - .Attr("max_val", max_value) - .Attr("min_val", min_value) + .Attr("max_val", max_val) + .Attr("min_val", min_val) .Run(); return grad_input; @@ -46,13 +44,7 @@ Tensor hardtanh_backward_npu( const Tensor& self, Scalar min_val, Scalar max_val) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor grad_input = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor grad_input = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU hardtanh_backward_out_npu(grad_input, grad_output, self, min_val, max_val); diff --git a/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp index 90103032b7d18400320eef85d0a6220e58b3cc27..6686c026c6a54380b459be8b2d5a24650fbae327 100644 --- a/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -38,24 +37,13 @@ Tensor& hardtanh_out_npu( } Tensor hardtanh_npu(const Tensor& self, Scalar min, Scalar max) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); hardtanh_out_npu(result, self, min, max); - return result; } Tensor& hardtanh_npu_(Tensor& self, Scalar min, Scalar max) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = hardtanh_out_npu(contiguousSelf, contiguousSelf, min, max); diff --git a/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp b/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp index 5434ec7cddf335301d4f54298306d81a3b9206cf..5b3825428b15a9244d97f9005a1f1c62f4ef5911 100644 --- a/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp @@ -14,9 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp index bfa2f335330d583bdba8246e14f1d6ad7ebaf02b..c0a96311251e930e14be19ea74610e6a8f22671a 100644 --- a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp @@ -71,8 +71,7 @@ Tensor& im2col_backward_out_npu( OpPreparation::CheckOut( {grad_output}, grad_input, - CalcuOpUtil::get_tensor_npu_format(grad_output), - grad_output.scalar_type(), + grad_output, outputSize); OpPipeWithDefinedOut pipe; diff --git a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp index eaa3d74bf562e8644fac716b996e9f7cb48e7da0..d658eec1deb854ba4fced8595f3c8e9648b1bcef 100644 --- a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp @@ -124,8 +124,7 @@ Tensor& im2col_out_npu(Tensor& result, const Tensor &self, IntArrayRef kernel_si OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding)); OpPipeWithDefinedOut pipe; @@ -139,10 +138,7 @@ Tensor im2col_npu(const Tensor &self, IntArrayRef kernel_size, IntArrayRef dilat // calculate the output size auto outputSize = image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding); - - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, outputSize); im2col_out_npu(result, self, kernel_size, dilation, padding, stride); return result; diff --git 
a/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp index 93bb88780acda59938c2b9ba211e153d2f81404b..803b344e2e71911234c88ab2b28758c058ef559f 100644 --- a/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include namespace at { namespace native { @@ -51,10 +52,7 @@ Tensor& index_add_npu_( int64_t dim, const Tensor& index, const Tensor& source) { - SmallVector inputs = {self, index, source}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self, index, source}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = index_add_out_npu(contiguousSelf, contiguousSelf, dim, index, source); diff --git a/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp index a5ef53dec31dd29bb3469517eef25b5b9682efe9..0a9db345f5160c6e677bea6a41a6c4a69cd44ced 100644 --- a/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include namespace at{ diff --git a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp index 04ea6edcb722b8b0ef06f5727d4a084f4ca126cc..9cbbf8f8416d6117e5b701cab3bb973e1c8b3fc8 100644 --- a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -37,10 +38,9 @@ Tensor& index_put_nocheck( } } - Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device( + auto masksTensor = CalcuOpUtil::copy_tensor_host_to_device( from_blob(masks.data(), {masks.size()}, dtype(ScalarType::Long))); - OpCommand cmd; cmd.Name("IndexPut") .Input(self) @@ -78,10 +78,7 @@ Tensor& _index_put_impl_npu_( const Tensor& value, const bool accumulate, const bool unsafe) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); OpPreparation::CastBackToOriFormat(self); Tensor valueCopy = value; diff --git a/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp index 9cd54424d690c3c5132cf4bb89ea1c2d5348fa42..277ed1c8a2f2ca00b8ba7409ed30990a4e4cb4a2 100644 --- a/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp index da7b15df158c77b72dc70ee5d9f90e72f627e3a5..864a096f73936ea35bd14e979ff85f69e917b12b 100644 --- a/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -56,9 +55,7 @@ Tensor indexing_npu( outputSize.emplace_back((end[i] + strides[i] - 1 - begin[i]) / strides[i]); } // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, outputSize); indexing_out_npu(result, self, begin, end, strides); return result; diff --git a/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp b/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp deleted file mode 100644 index f61cc8982c6733b1837a9673deda3eb9f9741c6c..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2020, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -SmallVector instance_norm_npu_input(const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector instance_norm_npu_output(const Tensor& result) { - return CalcuOpUtil::create_npu_output_tensor_desc({result}); -} - -SmallVector instance_norm_npu_attr(bool use_input_stats, double momentum, double eps) { - NPUAttrDesc npuAttrStats = NPUAttrDesc("use_input_stats", use_input_stats); - NPUAttrDesc npuAttrMomentum = NPUAttrDesc("momentum", static_cast(momentum)); - NPUAttrDesc npuAttrEpsilon = NPUAttrDesc("eps", static_cast(eps)); - SmallVector attrs = {npuAttrStats, npuAttrMomentum, npuAttrEpsilon}; - return attrs; -} - -Tensor& instance_norm_out_npu( - Tensor& result, - const Tensor& self, - const Tensor& weight, - const Tensor& bias, - const Tensor& running_mean, - const Tensor& running_var, - bool use_input_stats, - double momentum, - double eps) { - // constructs the input and output NPUTensorDesc - auto inputs = instance_norm_npu_input( - {self, weight, bias, running_mean, running_var}); - auto outputs = instance_norm_npu_output(result); - - // constructs the attr of the NPUAttrDesc - auto attrs = instance_norm_npu_attr(use_input_stats, momentum, eps); - - // executing the NPU operator - CalcuOpUtil::execute_npu_operate("InstanceNorm", inputs, outputs, attrs); - - return result; -} - -Tensor instance_norm_npu( - const Tensor& self, - const Tensor& weight, - const Tensor& bias, - const Tensor& running_mean, - const Tensor& running_var, - bool use_input_stats, - double momentum, - double eps, - bool cudnn_enabled) { - TORCH_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), - "Expected running_mean and running_var to be defined when use_input_stats is 
false"); - Tensor result = at::empty_with_format(self.sizes(), self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU - instance_norm_out_npu( - result, - self, - weight, - bias, - running_mean, - running_var, - use_input_stats, - momentum, - eps); - - return result; -} - -} // namespace native -} // namespace at diff --git a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp index b00ab6091b2e4c9267a9446d93d6644b0dfec23b..c25c31599b68bcf1ea88fc5227561448cf10f19f 100644 --- a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -36,8 +35,7 @@ Tensor& inverse_out_npu( } Tensor inverse_npu(const Tensor& self) { - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self); inverse_out_npu(result, self); diff --git a/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp index 41e5f286e7983bb681e62d57a0eeb3c11e0ed8a5..41ed93cd7185c012e50ecef2c67a54cc0c2ccd71 100644 --- a/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp @@ -13,6 +13,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp b/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp index f017aa6d23cc0162f5d7d761625ca5c45c0a6487..0e4ca4176dc8163513eb7a99f2b99fda1d35c823 100644 --- a/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp index a4e925d112cbe025675502e570db3bc804dff9a2..88ad478bd8b658c5f503e481e51b6c5823a430d3 100644 --- a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp index 6625add2c27f85f968a79f645028e4e6b9a1c334..7ae583aea73d60518099dd32c479071341835a7e 100644 --- a/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -27,13 +26,12 @@ Tensor leaky_relu_backward_out_npu( const Tensor& self, Scalar negval, bool is_result) { - float negvalValue = CalcuOpUtil::get_scalar_float_value(negval); OpCommand cmd; cmd.Name("LeakyReluGrad") .Input(grad_output) .Input(self) .Output(result) - .Attr("negative_slope", negvalValue) + .Attr("negative_slope", negval) .Run(); return result; } @@ -43,14 +41,7 @@ Tensor leaky_relu_backward_npu( const Tensor& self, Scalar negval, bool is_result) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); leaky_relu_backward_out_npu(result, grad_output, self, negval, is_result); return result; } diff --git a/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp index 4e04514bcc626259cb4079ada721f64ddf5a191d..ff1f6ed40fec4864d87982c3fbfbe4f70045e543 100644 --- a/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -33,23 +32,14 @@ Tensor& leaky_relu_out_npu(Tensor& result, const Tensor& self, Scalar negval) { } Tensor leaky_relu_npu(const Tensor& self, Scalar negval) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU leaky_relu_out_npu(result, self, negval); return result; } Tensor& leaky_relu_npu_(Tensor& self, Scalar neg_val) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = leaky_relu_out_npu(contiguousSelf, contiguousSelf, neg_val); diff --git a/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp index a9fb4097c339451e775e960535ffedecbd4f15f1..13ca2ae5ac76c4b465c5c88533b2b48e24bb28c7 100644 --- a/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp b/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fa73df052424dc54f6d16b0fe709f3309dae751 --- /dev/null +++ b/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp @@ -0,0 +1,65 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor& log10_out_npu_nocheck(Tensor& result, const Tensor& self) { + OpCommand cmd; + cmd.Name("Log") + .Input(self) + .Output(result) + .Attr("base", (float)10.0) + .Attr("scale", (float)1.0) + .Attr("shift", (float)0.0) + .Run(); + + return result; +} + +Tensor& log10_out_npu(Tensor& result, const Tensor& self) { + OpPreparation::CheckOut( + {self}, + result, + self); + + OpPipeWithDefinedOut pipe; + return pipe.CheckMemory({self}, {result}) + .Func([&self](Tensor& result){log10_out_npu_nocheck(result, self);}) + .Call(result); +} + +Tensor log10_npu(const Tensor& self) { + // construct the output tensor of the NPU + Tensor result = OpPreparation::ApplyTensor(self); + + // calculate the output result of the NPU + log10_out_npu_nocheck(result, self); + + return result; +} + +Tensor& log10_npu_(Tensor& self) { + log10_out_npu(self, self); + + return self; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp b/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp index 8c44547f7b53a7d76b73371fb6ee87f553d5cd78..57e9a2b45ff24774034142037d13d56df4602389 100644 --- a/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp @@ -29,13 +29,7 @@ Tensor& log1p_out_npu(Tensor& result, const Tensor& self){ } Tensor log1p_npu(const Tensor& self) { - //calculate the output size - auto outputSize = input_same_output_size(self); - - //construct the output tensor of the NPU - Tensor result = at::empty_with_format(outputSize, - self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); //calculate the output result of the NPU log1p_out_npu(result, self); @@ -43,9 +37,7 @@ Tensor log1p_npu(const Tensor& self) { } Tensor& log1p_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - 
CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp b/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp index 6eff7033e140e362b1cef373807fad41a6911656..11e83eb11734a933038369ddd9f398d4ee59ae5a 100644 --- a/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -35,24 +34,13 @@ Tensor& log2_out_npu(Tensor& result, const Tensor& self) { } Tensor log2_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); log2_out_npu(result, self); - return result; } Tensor& log2_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = log2_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp index 1547cc6314cb696b920573176f56c4405c4566e2..89409d80f542257f85c859cf35e24fea1806a7e9 100644 --- a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp @@ -21,16 
+21,13 @@ namespace native { using namespace at::native::npu; Tensor& log_out_npu_nocheck(Tensor& result, const Tensor& self) { - float baseValue = CalcuOpUtil::get_scalar_float_value(-1); - float scaleValue = CalcuOpUtil::get_scalar_float_value(1); - float shiftValue = CalcuOpUtil::get_scalar_float_value(0); OpCommand cmd; cmd.Name("Log") .Input(self) .Output(result) - .Attr("base", baseValue) - .Attr("scale", scaleValue) - .Attr("shift", shiftValue) + .Attr("base", (float)-1) + .Attr("scale", (float)1) + .Attr("shift", (float)0) .Run(); return result; @@ -40,9 +37,7 @@ Tensor& log_out_npu(Tensor& result, const Tensor& self) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) diff --git a/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp index 391feef2dc201f06fc6fc48f149057d431c6a837..71373bffaf7a09bac16c63561677d8e025d56e73 100644 --- a/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -33,11 +32,8 @@ tuple log_sigmoid_forward_out_npu( } tuple log_sigmoid_forward_npu(const Tensor& self) { - // construct the output tensor of the NPU - Tensor output = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor output = OpPreparation::ApplyTensor(self); Tensor buffer = at::empty({0}, self.options()); - // calculate the output result of the NPU log_sigmoid_forward_out_npu(output, buffer, self); return tuple(output, buffer); diff --git a/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp index e52d27c904414c876be3f91fc60d2bd52ac3570e..95b65df6c9d9f425906e766a79c4387775e3eb54 100644 --- a/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -27,16 +26,8 @@ Tensor _log_softmax_backward_npu( int64_t dim, const Tensor& self) { SmallVector dimList = {dim}; - // calculate the output size - auto outputSize = input_same_output_size(grad_output); + Tensor grad_input = OpPreparation::ApplyTensor(grad_output); - // construct the output tensor of the NPU - Tensor grad_input = at::empty_with_format( - outputSize, - grad_output.options(), - CalcuOpUtil::get_tensor_npu_format(grad_output)); - - // calculate the output result of the NPU OpCommand cmd; cmd.Name("LogSoftmaxGrad") .Input(grad_output) diff --git a/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp index 2a41f52ee5524b4a05eeddc892e0bb387699551f..94c653e3c0e7b40b8dc7210a6c9970199c520894 100644 --- a/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp index fecc1154825906dff0777edbcc07dac7c06215ef..24dc1c437cb2af7220c3d7ef73fa39787c119b3f 100644 --- a/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp @@ -40,8 +40,6 @@ Tensor& logspace_out_npu( inputs = at::arange(0, steps, at::device(at::kNPU).dtype(at::kFloat)); } - float startAttr = CalcuOpUtil::get_scalar_float_value(start); - float endAttr = CalcuOpUtil::get_scalar_float_value(end); int64_t dtype = 0; if (result.scalar_type() == at::ScalarType::Half) { dtype = 0; @@ -55,8 +53,8 @@ Tensor& logspace_out_npu( cmd.Name("LogSpaceD") .Input(inputs) .Output(result) - .Attr("start", startAttr) - .Attr("end", endAttr) + .Attr("start", start) + .Attr("end", end) .Attr("steps", steps) .Attr("base", static_cast(base)) .Attr("dtype", dtype) diff --git a/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp index 65d807c74478e6a4273d8fbef70add67a0feb489..c4ad2a61518550b21135a3a91a39559886364d9e 100644 --- a/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -49,15 +48,8 @@ Tensor logical_not_npu(const Tensor& self) { } Tensor& logical_not_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - - Tensor result = at::empty_with_format( - self.sizes(), - self.options().dtype(ScalarType::Byte), - CalcuOpUtil::get_tensor_npu_format(self)); - + OpPreparation::CheckMemory({self}, {self}); + Tensor result = OpPreparation::ApplyTensor(self, self.options().dtype(ScalarType::Byte)); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); logical_not_out_npu(result, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp index 26e8490ac8c54ae5df0e945951fddc120b861ab6..c158f0309e2672a911e3144e7e302fbce01bc82b 100644 --- a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp index fae3afb2fc5a53a52687c9ba37203800ac56170a..8e2ab7095eea9d592b3982d2c34c2d9f945af094 100644 --- a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp index 36f73d8c40834e9a7596f03ed18f12a6ec58cefa..e27c68e940dfee6af9651b1062b488133074a61c 100644 --- a/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp @@ -14,7 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -80,10 +80,7 @@ Tensor& masked_fill_out_npu(Tensor& result, const Tensor& self, const Tensor& ma } Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, const Tensor& value) { - SmallVector inputs = {self, mask, value}; - SmallVector outputs = {self}; - - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, mask, value}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = masked_fill_out_npu(contiguousSelf, contiguousSelf, mask, value); @@ -95,10 +92,7 @@ Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, const Tensor& value) } Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, Scalar value) { - SmallVector inputs = {self, mask}; - SmallVector outputs = {self}; - - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, mask}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..269c77c9f904e1d8200ab13b6a8873e8cc14082a --- /dev/null +++ 
b/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp @@ -0,0 +1,74 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +void mask_fill_range_check( + const Tensor& self, + const Tensor& start, + const Tensor& end, + const Tensor& value, + int64_t axis){ + int64_t x_dim = self.dim(); + int64_t min = -x_dim; + int64_t max = x_dim - 1; + TORCH_CHECK( + !(axis < min || axis > max), + "axis overfloaw the range, expected in range [", + -x_dim, + " ", + x_dim - 1, + "] "); + TORCH_CHECK( + start.ndimension() == 2 && start.sizes() == end.sizes(), + "Expected noempty 2D start tensor and start' sizes() should be equal end's sizes() "); + TORCH_CHECK( + start.size(0) == value.size(0), + "Expected value.length equal start loop num "); + TORCH_CHECK( + self.scalar_type() == value.scalar_type(), + "value dtype should be equal self dtype !, but value dtype is ", + value.scalar_type(), + " and self dtype is ", + self.scalar_type()); +} + +Tensor masked_fill_range_npu( + const Tensor& self, + const Tensor& start, + const Tensor& end, + const Tensor& value, + int64_t axis){ + mask_fill_range_check(self, start, end, value, axis); + Tensor result = OpPreparation::ApplyTensor(self); + OpCommand cmd; + cmd.Name("MaskedFillRange") + 
.Input(self) + .Input(start) + .Input(end) + .Input(value) + .Output(result) + .Attr("axis", axis) + .Run(); + return result; +} + +} +} diff --git a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp index 4bf48e9da064ba44dca448b7410ea627670f3d1b..1f3c8cd185110e7a3dce8ce8bb9f9c33315ef584 100644 --- a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp index b887f5efca7ba98cc8265def17043ea765c76989..d05fc30726260ed549e35d9274ff5913593be062 100644 --- a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp b/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp index 0bd31892938c51d493a11d2af18d5292c78cb6c3..7c9da5ccce796b29a9fd7bd216a16b3745f27985 100644 --- a/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp @@ -14,9 +14,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "c10/npu/OptionsManager.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp b/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp index 7f77c92962d8b03407f2c17053060a2d885c7d08..3131a6148ed77f2b9b3016d40b58689726fb79fa 100644 --- a/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp index 9ec7d29a0aa323b4b94b945b0cb414cba94005e7..821424393afeddddd346d448c3656a6d6b1c671c 100644 --- a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp @@ -15,8 +15,8 @@ // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp index f821d7a4f2d595b8657be2ba23ad116a34e29723..6b7e9358300bbd04336199d685b56de7a358a0f9 100644 --- a/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -55,17 +54,12 @@ Tensor mse_loss_backward_npu( const Tensor& self, const Tensor& target, int64_t reduction) { - // calculate the output size - auto outputSize = input_same_output_size(self); - auto grad_out = grad_output.contiguous(); if (grad_out.dim() == 0) { grad_out.view(1); } - // construct the output tensor of the NPU - Tensor grad_input = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor grad_input = OpPreparation::ApplyTensor(self); mse_loss_backward_out_npu( grad_input, diff --git a/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp index d73339bab795a4643fc57db19650326c4f709f78..8669010346c89e2a7521ab0f0fce8b53629e207f 100644 --- a/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -66,9 +65,8 @@ Tensor& mse_loss_out_npu( } OpPreparation::CheckOut( {self, target}, - result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + result, + self, outputSize); mse_loss_out_npu_nocheck(result, self, target, reduction); return result; @@ -85,9 +83,7 @@ Tensor mse_loss_npu( } // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self,outputSize); // calculate the output result of the NPU mse_loss_out_npu_nocheck(result, self, target, reduction); diff --git a/src/aten/src/ATen/native/npu/MulKernelNpu.cpp b/src/aten/src/ATen/native/npu/MulKernelNpu.cpp index 8f074001652509301be20560e1fb8ee96fd6791b..c009de29f81b8bec1961ca4c6a5fbd0183820de3 100644 --- a/src/aten/src/ATen/native/npu/MulKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MulKernelNpu.cpp @@ -15,8 +15,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include namespace at { diff --git a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp index 3ee01f50cc04525ca90bd6f94eb0e1063a9393e2..3a09b43910915830761955d4c3074d15b1965120 100644 --- a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/common/InnerNpuNativeFunction.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp index b42a3fa5acb85a80c8795a64802ec932e1cbfada..4e4c377b7a1ca69228f7e8fa50b401477e7d87a2 100644 --- a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -125,9 +126,7 @@ Tensor ne_npu(const Tensor& self, Scalar other) { Tensor& ne_npu_(Tensor& self, const Tensor& other) { OpPreparation::CastBackToOriFormat(self); OpPreparation::CastBackToOriFormat(other); - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, other}, {self}); Tensor result = at::empty_with_format( self.sizes(), @@ -149,10 +148,7 @@ Tensor& ne_npu_(Tensor& self, const Tensor& other) { Tensor& ne_npu_(Tensor& self, Scalar other) { OpPreparation::CastBackToOriFormat(self); - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); Tensor result = at::empty_with_format( self.sizes(), self.options().dtype(ScalarType::Byte), diff --git a/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp b/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp index 3144646899386b8f2f48baa0c85c6af1b0f9a627..393970fc1d5329307541283cd48aee52599b98fa 100644 --- a/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp index 005e993eb867b7d06a238ffc78ed856553aa757b..8308e4763aa4923b358d9ac4f23594b85cc3a159 100644 --- a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "climits" namespace at { diff --git a/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp index b0722df7324e22ad2c84470949c27622d74be221..158b7c74a1de4e0ecc9cac144b37ec951bf5a9a7 100644 --- a/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp @@ -131,14 +131,7 @@ Tensor normal_npu( const Tensor& mean, double std, Generator* generator) { - // calculate the output size - auto outputSize = input_same_output_size(mean); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, mean.options(), CalcuOpUtil::get_tensor_npu_format(mean)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(mean); normal_out_npu(result, mean, std, generator); return result; @@ -148,14 +141,7 @@ Tensor normal_npu( double mean, const Tensor& std, Generator* generator) { - // calculate the output size - auto outputSize = input_same_output_size(std); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, std.options(), CalcuOpUtil::get_tensor_npu_format(std)); - 
- // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(std); normal_out_npu(result, mean, std, generator); return result; @@ -165,14 +151,7 @@ Tensor normal_npu( const Tensor& mean, const Tensor& std, Generator* generator) { - // calculate the output size - auto outputSize = input_same_output_size(mean); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, mean.options(), CalcuOpUtil::get_tensor_npu_format(mean)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(mean); normal_out_npu(result, mean, std, generator); return result; @@ -199,10 +178,7 @@ Tensor& normal_npu_( double mean, double std, Generator* generator) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = normal_out_npu(contiguousSelf, mean, std, contiguousSelf.sizes(), generator); diff --git a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp index 35c98020833dc3abc6e5bb9203c7c9643d234b64..d92440dec002580786506999774f36fdfb2464aa 100644 --- a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp @@ -56,10 +56,10 @@ Tensor one_hot_npu1(const Tensor& self, int64_t num_classes) { auto outputSize = array_to_small_vector(self.sizes()); outputSize.emplace_back(depth); - Tensor result = at::empty_with_format( + Tensor result = OpPreparation::ApplyTensor( outputSize, self.options().dtype(ScalarType::Int), - CalcuOpUtil::get_tensor_npu_format(self)); + self); SmallVector depthList = {depth}; diff --git a/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp b/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp index 
f971c598365f6312220337ba3c23cb8971257bff..7f56a61f72d863987c00f653829890d9ceca3d31 100644 --- a/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp @@ -29,13 +29,8 @@ Tensor ones_like_npu( auto result = at::empty_like(self, options, optional_memory_format); return result.fill_(1.); } - // calculate the output size - auto outputSize = input_same_output_size(self); - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, options, CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, options); // calculate the output result of the NPUc return result.one_(); } diff --git a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp index 43dacfa066681c6126d7966f2c256449ce253567..d2d338f12527d66c3a3dd2d6eb413621981dbee3 100644 --- a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp @@ -33,19 +33,13 @@ Tensor& pad_out_npu( .Input(paddingsVector) .Output(output) .Run(); - return output; } Tensor pad_npu(const Tensor& input, IntArrayRef paddings) { - // calculate the output size auto outputSize = pad_npu_output_size(input, paddings); - - // construct the output tensor of the NPU - Tensor output = at::empty_with_format(outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input)); - + Tensor output = OpPreparation::ApplyTensor(input, outputSize); pad_out_npu(output, input, paddings); - return output; } diff --git a/src/aten/src/ATen/native/npu/PowKernelNpu.cpp b/src/aten/src/ATen/native/npu/PowKernelNpu.cpp index af4d14f31140df71ed31cb4cc97c6bdab8c3503d..01b31ebbc24a7be53a98c65a0280b607853c3c78 100644 --- a/src/aten/src/ATen/native/npu/PowKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/PowKernelNpu.cpp @@ -57,28 +57,13 @@ Tensor& pow_out_npu(Tensor& result, Scalar self, const Tensor& exp) { Tensor pow_npu(const Tensor& self, const Tensor& exp) { 
// calculate the output size auto outputSize = broadcast_ops_npu_output_size(self, exp); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); pow_out_npu(result, self, exp); return result; } Tensor pow_npu(const Tensor& self, Scalar exp) { - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); pow_out_npu(result, self, exp); return result; } @@ -95,10 +80,7 @@ Tensor pow_npu(Scalar self, const Tensor& exp) { } Tensor& pow_npu_(Tensor& self, const Tensor& exp) { - SmallVector inputs = {self, exp}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self, exp}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); pow_out_npu(contiguousSelf, contiguousSelf, exp); @@ -111,10 +93,7 @@ Tensor& pow_npu_(Tensor& self, const Tensor& exp) { } Tensor& pow_npu_(Tensor& self, Scalar exp) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); pow_out_npu(contiguousSelf, contiguousSelf, exp); diff --git a/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp index c3093b73b298e8d337e2f78f8960b071ec2f81da..14341ed281974b40ec6bc89fc90a5e06c251b1b5 100644 --- a/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp +++ 
b/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -41,14 +40,9 @@ tuple prelu_backward_npu( const Tensor& grad_output, const Tensor& self, const Tensor& weight) { - // calculate the output size - auto outputSizes1 = input_same_output_size(self); - auto outputSizes2 = input_same_output_size(weight); // construct the output tensor of the NPU - Tensor grad_input = at::empty_with_format( - outputSizes1, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - Tensor grad_weight = at::empty_with_format( - outputSizes2, weight.options(), CalcuOpUtil::get_tensor_npu_format(weight)); + Tensor grad_input = OpPreparation::ApplyTensor(self); + Tensor grad_weight = OpPreparation::ApplyTensor(weight); // calculate the output result of the NPU prelu_backward_out_npu(grad_input, grad_weight, grad_output, self, weight); return std::tie(grad_input, grad_weight); diff --git a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp index ff9afacad160cb5dace14df3a1ae813cd57b0fb8..52b248ae36364df35ba901f8a936b037b7d6376f 100644 --- a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -25,8 +24,7 @@ Tensor prelu_npu(const Tensor& self, const Tensor& weight_) { // calculate the output size auto outputSize = input_same_output_size(self); - Tensor result = at::empty_with_format( - outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input)); + Tensor result = OpPreparation::ApplyTensor(input, outputSize); OpCommand cmd; cmd.Name("PRelu") diff --git a/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp b/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp index fd45dc14b60d3fdfff6661bea6c26880b9b85d03..5d256bcc7d52a9cdece25eaccdc343a323e75638 100644 --- a/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -73,14 +74,18 @@ Tensor& prod_out_npu( // fp16 transform:fp32 for precise if (self.scalar_type() == ScalarType::Half) { Tensor result_tmp = prod_npu(self, dim, keepdim, dtype); - OpPreparation::CheckOut({result_tmp}, result, result_tmp); + OpPreparation::CheckOut( + {result_tmp}, + result, + ACL_FORMAT_ND, + result_tmp.scalar_type(), + result_tmp.sizes()); result.copy_(result_tmp); return result; } else { auto outputSize = prod_npu_output_size(self, dim, keepdim); ScalarType dstType = dtype.has_value() ? 
dtype.value() : self.scalar_type(); - int64_t npu_format = calculate_prod_output_format(self, outputSize); - OpPreparation::CheckOut({self}, result, npu_format, dstType, outputSize); + OpPreparation::CheckOut({self}, result, ACL_FORMAT_ND, dstType, outputSize); prod_out_npu_nocheck(result, self, {dim}, keepdim, dtype); return result; diff --git a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp index 4621b5e05fdd4c7966e74b45cbf7de4b01311e5b..2875adc7aff695009560413b3f9e5a38e7cfb3ea 100644 --- a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -25,12 +24,8 @@ Tensor ptiou_npu( const Tensor& bboxes, const Tensor& gtboxes, int64_t mode) { - // calculate the output size auto outputSize = {gtboxes.size(0), bboxes.size(0)}; - - // construct the output tensor of the NPU - Tensor overlap = at::empty_with_format(outputSize, bboxes.options(), CalcuOpUtil::get_tensor_npu_format(bboxes)); - + Tensor overlap = OpPreparation::ApplyTensor(bboxes, outputSize); string modeStr = "iou"; if (mode == 1) { modeStr = "iof"; diff --git a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp index 141a8faf4dabdf387fe2127c58371839b20ebc6e..a1212802cff3b79915fd17c36ea4e7eac94ebfca 100644 --- a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp @@ -76,14 +76,12 @@ std::tuple qr_out_npu( OpPreparation::CheckOut( {self}, Q, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), + self, std::get<0>(sizes)); OpPreparation::CheckOut( {self}, R, - CalcuOpUtil::get_tensor_npu_format(self), - 
self.scalar_type(), + self, std::get<1>(sizes)); return qr_out_npu_nocheck(Q, R, self, some); } diff --git a/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp b/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp index ec17afdd76204a2f031647ffdd656e8cc85af450..4e05a90658c384cc29c4a63c2cb2358b30bb4957 100644 --- a/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -33,13 +33,8 @@ std::tuple random_choice_with_mask_npu( self.dim()); TORCH_CHECK(count > 0, "The count must greater than 0, but get", count); - Tensor result = at::empty_with_format( - {count, self.dim()}, - self.options().dtype(kInt), - CalcuOpUtil::get_tensor_npu_format(self)); - Tensor mask = at::empty_with_format( - {count}, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor({count, self.dim()}, self.options().dtype(kInt), self); + Tensor mask = OpPreparation::ApplyTensor(self, {count}); OpCommand cmd; cmd.Name("RandomChoiceWithMask") .Input(self) diff --git a/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp b/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp index ae77cd8eaf9b2de87f1615aa7a06c7233aa48e97..3445ba27e8e0a795c0197796d457ca8ff08836c0 100644 --- a/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp @@ -15,8 +15,7 @@ // limitations under the License. 
#include -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -42,9 +41,7 @@ Tensor& random_npu_(Tensor& self, int64_t from, int64_t to, Generator* gen_) { selfCopy = self.npu_dtype_cast(ScalarType::Float); } - SmallVector inputs = {selfCopy}; - SmallVector outputs = {selfCopy}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({selfCopy}, {selfCopy}); if (!NpuUtils::check_match(&selfCopy)) { Tensor contiguousSelf = NpuUtils::format_contiguous(selfCopy); diff --git a/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp index bb063be6586a0288bcdf160fce4c477227b90916..4b638919a88b0c48db9cc5a535c3d9f344bb28a3 100644 --- a/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp index fff9bcb7010a35ac9ec26198eab9a2fd37b7b4a7..ed546a6685669bd6f3bf59b7285d52ba9fea24db 100644 --- a/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp @@ -34,9 +34,7 @@ Tensor& reciprocal_out_npu(Tensor& result, const Tensor& self) { OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), - self.scalar_type(), - self.sizes()); + self); OpPipeWithDefinedOut pipe; return pipe.CheckMemory({self}, {result}) @@ -46,9 +44,7 @@ Tensor& reciprocal_out_npu(Tensor& result, const Tensor& self) { Tensor reciprocal_npu(const Tensor& self) { // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU reciprocal_out_npu_nocheck(result, self); diff --git a/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp b/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp index 54bd128ef8b417e68a4282d71dc392dc609dd348..62f91fd927a542df6366aa4059b4af75af37e72c 100644 --- a/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp b/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp index eba096f9c3427b8aa97e2800026fcb24111f59eb..4c41f0dc912229c7ba7ac4dabbcd77161b4766c5 100644 --- a/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "c10/npu/OptionsManager.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/RollKernelNpu.cpp b/src/aten/src/ATen/native/npu/RollKernelNpu.cpp index b392679e4a858021c49cdbaaf9f916fbfa573d13..4def38d46343f8f514a9cad5aeb0182e4ea92757 100644 --- a/src/aten/src/ATen/native/npu/RollKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RollKernelNpu.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp b/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp index e708ef67b6c216fc2093fe883956902ae518c0b8..996968cc691819032297f22ebd0ab5afc33899c3 100644 --- a/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -39,16 +38,8 @@ Tensor& round_out_npu(Tensor& result, const Tensor& self) { } Tensor round_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); round_out_npu_nocheck(result, self); - return result; } diff --git a/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp b/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp index e0c477abb0585c6109536b9e68bd4e70573c1f78..462e35ef87a714e5387aca961bef04de7c0d02f7 100644 --- a/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -39,14 +38,7 @@ Tensor& rsqrt_out_npu(Tensor& result, const Tensor& self) { } Tensor rsqrt_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); rsqrt_out_npu_nocheck(result, self); return result; diff --git a/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp b/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp index e5978640872c15cbebe175ccf6e49181eef5a422..cb2fa4c663e7aadb9f0199744fabffbba433246f 100644 --- a/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp index c7a0e166c0d7f42ec8fede554c131eb216f35644..86706651710f7fc91b6af77058b7380a675c8d0a 100644 --- a/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include namespace at { namespace native { @@ -51,11 +51,7 @@ Tensor& scatter_add_npu_( int64_t dim, const Tensor& index, const Tensor& src) { - - SmallVector inputs = {self, index, src}; - SmallVector outputs = {self}; - - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self, index, src}, {self}); ScalarType selfType = self.scalar_type(); Tensor selfFp32 = self; diff --git a/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp index 30ddb28d6ef2cab91e1c19e0238dd930330d3a8b..ad434470db778052830eef597dc33bd5a6756fff 100644 --- a/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -73,15 +72,12 @@ Tensor& scatter_npu_( index = index.npu_dtype_cast(ScalarType::Float); } - // get float from scalar - float src_value = CalcuOpUtil::get_scalar_float_value(src); - OpCommand cmd; cmd.Name("ScatterScalar") .Input(index) .Output(self) .Attr("dim", dim) - .Attr("value", src_value) + .Attr("value", src) .Run(); if(self.scalar_type() != selfType){ diff --git a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp index af0e46a84af3366bc87e25a56193fcbfac06d778..3485b0608a1a9d1b591dcba3fc1466e5ed388410 100644 --- a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -40,9 +39,7 @@ Tensor& scatter_out_npu( } Tensor scatter_npu(const Tensor& self, const Tensor& indices, const Tensor& updates, int64_t dim) { - Tensor outputs = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor outputs = OpPreparation::ApplyTensor(self); scatter_out_npu(outputs, self, indices, updates, dim); return outputs; diff --git a/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp index ad20aa78246042201366654058ba4de6a5cc9fe7..f6dc97b52a65eb9c9bac79edd3949dcfcf69921a 100644 --- a/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp @@ -40,10 +40,7 @@ Tensor selu_npu(const Tensor& self) { } Tensor& selu_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = selu_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp index d56e4c0351280b52b625cfac2b2e46b9cf577c99..a538a6182267697cb7bec019908edd0e4dfb0217 100644 --- a/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp index 785bda4421de7abce8a35cea907721c41f2bfe19..132c14639cb55c85ca783c55f8385e603cc87d27 100644 --- a/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SignKernelNpu.cpp b/src/aten/src/ATen/native/npu/SignKernelNpu.cpp index 1c2aef9b287c5261494517ca55e04f101b65979c..5f49336fc80c972d4a36319db8df22ac07c07335 100644 --- a/src/aten/src/ATen/native/npu/SignKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SignKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,13 +31,8 @@ Tensor& sign_out_npu(Tensor& result, const Tensor& self) { } Tensor sign_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU sign_out_npu(result, self); @@ -46,10 +40,7 @@ Tensor sign_npu(const Tensor& self) { } Tensor& sign_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = sign_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp b/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp index 6bbc4464c6c5c959669e97b38078eaeab7f3636a..a01380fb382a57746bbc31bbaf7f4faf2c72e476 100644 --- a/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp @@ -40,10 +40,7 @@ Tensor sinh_npu(const Tensor& self) { } Tensor& sinh_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = sinh_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp index 
92b223417ad1bed35b80f592864c2e4724d39880..4779f9418561b7339feeba7e8100bb9d41652e4e 100644 --- a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp b/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp index 696bf6f83b8de69c71e711ce6688c248c5fb711f..4a269947501b1b1d5cb7f478736934b0e2110948 100644 --- a/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -69,8 +69,7 @@ Tensor smooth_l1_loss_npu( auto outputSize = smooth_l1_loss_npu_output_size(self, target, reduction); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU smooth_l1_loss_out_npu_nocheck(result, self, target, reduction); diff --git a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp index b27d36380fe6ad5762b79c059708d12779f79fcb..60a6b4cbf43406d9562de8f3059ff6e1e1859308 100644 --- a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // 
limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -52,12 +51,7 @@ Tensor soft_margin_loss_backward_npu( const Tensor& input, const Tensor& target, int64_t reduction) { - // calculate the output size - auto outputSize = input_same_output_size(input); - - // construct the output tensor of the NPU - Tensor grad_input = at::empty_with_format( - outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input)); + Tensor grad_input = OpPreparation::ApplyTensor(input); soft_margin_loss_backward_out_npu( grad_input, grad_output, input, target, reduction); return grad_input; diff --git a/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp index c4ca2324de37789f84937a4d542232314137daa0..6db238b5b45041db5ce3dab6539ef3a726a20d47 100644 --- a/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp index 42d2b41ebf9db46c58066e717f6f1d6230b8faf5..cf76b1c9a0e010721c444bc160c02ee372ae0240 100644 --- a/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SortKernelNpu.cpp b/src/aten/src/ATen/native/npu/SortKernelNpu.cpp index 92648200c0319d5291117720acbc282e8585bab1..cae5fe981dfa683cc0358c3c4a350b3b483963b0 100644 --- a/src/aten/src/ATen/native/npu/SortKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SortKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp b/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp index ce51176358b0d02db526af0511599f1d4edf60b8..924d3e511e8169eaf583a6b86b2355fc292cbe8d 100644 --- a/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -43,8 +42,7 @@ Tensor sort_without_indices_npu( bool descending) { auto outputSize = input_same_output_size(self); - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self); sort_without_indices_out_npu(result, self, dim, descending); diff --git a/src/aten/src/ATen/native/npu/StackKernelNpu.cpp b/src/aten/src/ATen/native/npu/StackKernelNpu.cpp index 4a774c0aad05fdbfee6dedaf9de9e42af285aa08..16973b844e89100799877bfd9af662935669bbda 100644 --- a/src/aten/src/ATen/native/npu/StackKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/StackKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp index 7d6490ffcc7ea6354b782d082ce9be6a2c4d823d..c919f9cd027da080114d99707dc58187f159e3a9 100644 --- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp @@ -15,36 +15,26 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { using namespace at::native::npu; -Tensor& std_out_npu( - Tensor& result, - const Tensor& self, - DimnameList dim, - bool unbiased, - bool keepdim) { - return std_out_npu(result, self, dimnames_to_positions(self, dim), unbiased, keepdim); -} - -Tensor& std_out_npu( - Tensor& result, +tuple std_mean_out_npu_nocheck( + Tensor& resultStd, + Tensor& resultMean, const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - auto outputSize = std_npu_output_size(self, dim, keepdim); - Tensor meanResult = OpPreparation::ApplyTensor(self, std::get<1>(outputSize)); - - // executing the NPU operator - if (!c10::npu::OptionsManager::CheckDynamicEnable()) { + // executing the NPU operator + if (!c10::npu::OptionsManager::CheckDynamicEnable()) { OpCommand cmd; cmd.Name("ReduceStd") .Input(self) - .Output(result) - .Output(meanResult) + .Output(resultStd) + .Output(resultMean) .Attr("dim", dim) .Attr("unbiased", unbiased) .Attr("keepdim", keepdim) @@ -53,25 +43,60 @@ Tensor& std_out_npu( OpCommand cmd1; cmd1.Name("ReduceMeanD") .Input(self) - .Output(meanResult) + .Output(resultMean) .Attr("axes", dim) .Attr("keep_dims", keepdim) .Run(); - if (meanResult.dim() != 0 && keepdim == false) { - meanResult = meanResult.unsqueeze(dim[0]); + Tensor resultMeanCopy = resultMean; + if (resultMean.dim() != 0 && keepdim == false) { + auto dimVector = array_to_small_vector(dim); + std::sort(dimVector.begin(), dimVector.end()); + for (int64_t i = 0; i < dimVector.size(); i++) { + resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]); + } } - Tensor meanResult2 = meanResult.expand(self.sizes()); + resultMeanCopy = resultMeanCopy.expand(self.sizes()); OpCommand cmd2; cmd2.Name("ReduceStdWithMean") .Input(self) - .Input(meanResult2) - .Output(result) + .Input(resultMeanCopy) + .Output(resultStd) .Attr("dim", dim) .Attr("unbiased", unbiased) .Attr("keepdim", 
keepdim) .Run(); } + return std::tie(resultStd, resultMean); +} + +Tensor& std_out_npu( + Tensor& result, + const Tensor& self, + DimnameList dim, + bool unbiased, + bool keepdim) { + return std_out_npu(result, self, dimnames_to_positions(self, dim), unbiased, keepdim); +} + +Tensor& std_out_npu( + Tensor& result, + const Tensor& self, + IntArrayRef dim, + bool unbiased, + bool keepdim) { + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); + Tensor meanResult = OpPreparation::ApplyTensor(self, outputSize); + + OpPreparation::CheckOut( + {self}, + result, + self, + outputSize); + + // executing the NPU operator + std_mean_out_npu_nocheck(result, meanResult, self, dim, unbiased, keepdim); + return result; } @@ -82,40 +107,21 @@ tuple std_mean_out_npu( IntArrayRef dim, bool unbiased, bool keepdim) { - // executing the NPU operator - if (!c10::npu::OptionsManager::CheckDynamicEnable()) { - OpCommand cmd; - cmd.Name("ReduceStd") - .Input(self) - .Output(result1) - .Output(result2) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); - } else { - OpCommand cmd1; - cmd1.Name("ReduceMeanD") - .Input(self) - .Output(result2) - .Attr("axes", dim) - .Attr("keep_dims", keepdim) - .Run(); - Tensor result2_copy = result2; - if (result2.dim() != 0 && keepdim == false) { - result2_copy = result2.unsqueeze(dim[0]); - } - result2_copy = result2_copy.expand(self.sizes()); - OpCommand cmd2; - cmd2.Name("ReduceStdWithMean") - .Input(self) - .Input(result2_copy) - .Output(result1) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); - } + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); + + OpPreparation::CheckOut( + {self}, + result1, + self, + outputSize); + OpPreparation::CheckOut( + {self}, + result2, + self, + outputSize); + + // executing the NPU operator + std_mean_out_npu_nocheck(result1, result2, self, dim, unbiased, keepdim); return std::tie(result1, result2); } @@ -126,11 
+132,11 @@ Tensor std_dim_npu( bool unbiased, bool keepdim) { // calculate the output size - auto outputSize = std_npu_output_size(self, dim, keepdim); + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); // construct the output tensor of the NPU - Tensor result1 = OpPreparation::ApplyTensor(self, std::get<0>(outputSize)); - Tensor result2 = OpPreparation::ApplyTensor(self, std::get<1>(outputSize)); + Tensor result1 = OpPreparation::ApplyTensor(self, outputSize); + Tensor result2 = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim); @@ -157,11 +163,11 @@ tuple std_mean_dim_npu( bool unbiased, bool keepdim) { // calculate the output size - auto outputSize = std_npu_output_size(self, dim, keepdim); + auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim); // construct the output tensor of the NPU - Tensor result1 = OpPreparation::ApplyTensor(self, std::get<0>(outputSize)); - Tensor result2 = OpPreparation::ApplyTensor(self, std::get<1>(outputSize)); + Tensor result1 = OpPreparation::ApplyTensor(self, outputSize); + Tensor result2 = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim); diff --git a/src/aten/src/ATen/native/npu/SubKernelNpu.cpp b/src/aten/src/ATen/native/npu/SubKernelNpu.cpp index 70872bda6d027933ee902325e357cb93bb8d4f4d..de4fc4676fa056e02cccd602e8f844cd2d1231db 100644 --- a/src/aten/src/ATen/native/npu/SubKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SubKernelNpu.cpp @@ -15,8 +15,8 @@ // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp b/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp index 3af8d7df4bb761879fc5f84c971af843d2a45356..de393e901ee9f05d233edbc37cd471e23dcf9fc6 100644 --- a/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp @@ -20,9 +20,7 @@ using namespace at::native::npu; Tensor sub_sample_npu(const Tensor &self, int64_t per_images, double positive_fraction) { - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); OpCommand cmd; cmd.Name("SubSample") .Input(self) @@ -30,7 +28,6 @@ Tensor sub_sample_npu(const Tensor &self, int64_t per_images, .Attr("batch_size_per_images", per_images) .Attr("positive_fraction", (float)positive_fraction) .Run(); - return result; } diff --git a/src/aten/src/ATen/native/npu/SumKernelNpu.cpp b/src/aten/src/ATen/native/npu/SumKernelNpu.cpp index 739d5992bb6d949d66ea6697304d6e4b54126b35..9a8f72225f8801a2682ba7dc161358ad39633407 100644 --- a/src/aten/src/ATen/native/npu/SumKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/SumKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -116,7 +117,7 @@ Tensor& sum_out_npu( OpPreparation::CheckOut( {self}, result, - CalcuOpUtil::get_tensor_npu_format(self), + ACL_FORMAT_ND, dstType, outputSize); diff --git a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp index 06194c74a806b0d71bf3c1bb2363b66b27b13c24..316263404daf412bb2abb82afc7ea7b66763ad30 100644 --- a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp @@ -32,23 +32,13 @@ Tensor& tan_out_npu(Tensor& result, const Tensor& self) { } Tensor tan_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); tan_out_npu(result, self); return result; } Tensor& tan_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = tan_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp index 5ce4d71e14f19be5aa3211fc1b2bb7a04108dc4d..2b2328ee63461383851a48b7bc6188e346f0cec7 100644 --- a/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -45,9 +44,7 @@ Tensor& tanh_backward_out_npu( } Tensor tanh_backward_npu(const Tensor& grad_output, const Tensor& self) { - Tensor result = at::empty_with_format( - self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); tanh_backward_out_npu_nocheck(result, grad_output, self); return result; diff --git a/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp index 07dd8c7a13d08922ce5c58dc6b7ea87019c72d68..98bbd5a5a5f47131c5c5a83c2165022b2edf3731 100644 --- a/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,13 +31,7 @@ Tensor& tanh_out_npu(Tensor& result, const Tensor& self) { } Tensor tanh_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // calculate the output result of the NPU tanh_out_npu(result, self); @@ -46,10 +39,7 @@ Tensor tanh_npu(const Tensor& self) { } Tensor& tanh_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = 
NpuUtils::format_contiguous(self); Tensor result = tanh_out_npu(contiguousSelf, contiguousSelf); diff --git a/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp index 6df88d299822108da597c78ab0d0e1f198ef8bb8..ca01f2dd61ecd41e8111031f470374a1f1c479fe 100644 --- a/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp b/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp index a90f3f4ae5c362650b772e82081ad187d2334769..103b4fa043623d0bc1bebf0e9069a40306ce2896 100644 --- a/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -38,14 +37,7 @@ Tensor& threshold_out_npu( } Tensor threshold_npu(const Tensor& self, Scalar threshold, Scalar value) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); threshold_out_npu(result, self, threshold, value); return result; } diff --git a/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp b/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp index 5b6ac1f3e4bcac131f87a988fb50914d4faa2bdd..4ef0a3fc43be85036fe7bb67071c499d06a6e7da 100644 --- a/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp b/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp index e9e0730b7971e9be91e5fbb52012bab0f104bf86..3429850d8eaec7fb747507bd8d3706777778cea3 100644 --- a/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include namespace at { @@ -49,14 +50,8 @@ Tensor& transpose_out_npu( } Tensor transpose_npu(const Tensor& self, IntArrayRef perm) { - // calculate the output size auto outputSize = transpose_npu_output_size(self, perm); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self, outputSize); transpose_out_npu(result, self, perm); return result; diff --git a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp index f8faa79eb1cd8522eae395b8fa42ea4c0ca3f223..678854c4a391045571ecfa69175acda9d8ecdde1 100644 --- a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/OpTemplate.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -42,19 +41,13 @@ Tensor tril_npu(const Tensor& self, int64_t diagonal){ }; TORCH_CHECK(is_last_two_dims(), "tril require tensor should be last two dims"); - - auto outputSize = input_same_output_size(selfCopy); - Tensor result = at::empty_with_format(outputSize,selfCopy.options(), - CalcuOpUtil::get_tensor_npu_format(selfCopy)); + Tensor result = OpPreparation::ApplyTensor(selfCopy); tril_out_npu(result, selfCopy, diagonal); return result; } Tensor& tril_npu_(Tensor& self, int64_t diagonal){ - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); self.npu_format_cast_(ACL_FORMAT_NCHW); if(!NpuUtils::check_match(&self)){ Tensor contiguousSelf = NpuUtils::format_contiguous(self); diff --git a/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp b/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp index a9957ab8137cabeaae9c9a8ca8cf59ef77187997..6de531b25e89c6a1c01760636f9d2fbc54dde175 100644 --- a/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -32,10 +31,7 @@ Tensor& trunc_out_npu(Tensor& result, const Tensor& self) { } Tensor& trunc_npu_(Tensor& self) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = trunc_out_npu(contiguousSelf, contiguousSelf); @@ -48,16 +44,7 @@ Tensor& trunc_npu_(Tensor& self) { } Tensor trunc_npu(const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - self.options(), - CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); trunc_out_npu(result, self); return result; } diff --git a/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp b/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp index 35814aab9614e1dd793b1dc25afb5969197adb91..ef248af128771c7bbe1e42b20ef5d3465f1d5e7b 100644 --- a/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -37,9 +37,7 @@ Tensor& uniform_out_npu( } Tensor& uniform_npu_(Tensor& self, double from, double to, Generator* gen_) { - SmallVector inputs = {self}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); + OpPreparation::CheckMemory({self}, {self}); // TODO: The operator needs to use fp32 for calculation. Tensor selfCopy = self; diff --git a/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e90e7f0df5a677eb1504c294b732a0f9b3a36676 --- /dev/null +++ b/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor& upsample_nearest3d_out_npu( + Tensor& result, + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + + result.resize_({nbatch, channels, output_depth, output_height, output_width}); + + OpCommand cmd; + cmd.Name("UpsampleNearest3d") + .Input(input) + .Output(result) + .Attr("output_size", output_size) + .Run(); + + return result; +} + +Tensor upsample_nearest3d_npu( + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + + Tensor result = OpPreparation::ApplyTensor(input, {1}); + + upsample_nearest3d_out_npu(result, input, output_size, scales_d, scales_h, scales_w); + + return result; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp index c688e5496b12770d397ebd99aef5d449c90b8884..de10ef61d6d712b82406dad08c52ca834aa63f1d 100644 --- a/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp index a4b93e533016bf91660c7981710dcdeccd718ad5..102716632995ba88837c3c7e8e41f392183e7ded 100644 --- a/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -51,8 +50,7 @@ Tensor upsample_nearest1d_backward_npu( grads = grad_output.to(at::kFloat); } - Tensor grad_input = at::empty_with_format( - input_size, grads.options(), CalcuOpUtil::get_tensor_npu_format(grad_output)); + Tensor grad_input = OpPreparation::ApplyTensor(input_size, grads.options(), grad_output); upsample_nearest1d_backward_out_npu( grad_input, grads, output_size, input_size, scales); diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp index bd1596502bee4ae59fdedfc966dfc93e0bfc1e75..944f92fd8295f24439392940b7e7bd8e8fcbaa7f 100644 --- a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -67,8 +66,7 @@ Tensor upsample_nearest1d_npu( SmallVector outputSize = upsample_nearest1d_npu_output_size(self, output_size, scales); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result = OpPreparation::ApplyTensor(self, outputSize); // calculate the output result of the NPU upsample_nearest1d_out_npu(result, self, output_size, scales); diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp index 9c5adc9af1d706bce4fe665cf9effd8bed3703b7..09ec3e2e28416ff7e72f9cb5289903c987cc58f5 100644 --- a/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp @@ -51,8 +51,8 @@ Tensor upsample_nearest2d_backward_npu( grads = grad_output.to(at::kFloat); } - Tensor grad_input = at::empty_with_format( - input_size, grads.options(), CalcuOpUtil::get_tensor_npu_format(grad_output)); + Tensor grad_input = OpPreparation::ApplyTensor( + input_size, grads.options(), grad_output); upsample_nearest2d_backward_out_npu( grad_input, grads, output_size, input_size, scales_h, scales_w); diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp index d759bd0211bde5bb63f38a17cda711768bf34ad1..98626777078f901881f4465dd280d6e1feeeee02 100644 --- a/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9d724da90aba4db7629bd5487ef20e7c812eb6b --- /dev/null +++ b/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp @@ -0,0 +1,82 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor& upsample_nearest3d_backward_out_npu( + Tensor& grad_input, + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_depth = input_size[2]; + int64_t input_height = input_size[3]; + int64_t input_width = input_size[4]; + + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + + OpCommand cmd; + cmd.Name("UpsampleNearest3dGrad") + .Input(grad_output) + .Output(grad_input) + .Attr("input_size", input_size) + .Attr("output_size", output_size) + .Run(); + + return grad_input; +} + +Tensor upsample_nearest3d_backward_npu( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + + Tensor grad_input = OpPreparation::ApplyTensor(grad_output, input_size); + + upsample_nearest3d_backward_out_npu(grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w); + + return grad_input; +} + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1a847ef2bd3007d20dfabc8f6da3e27654772e86 --- /dev/null +++ 
b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp @@ -0,0 +1,85 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor& upsample_trilinear3d_backward_out_npu( + Tensor& grad_input, + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_depth = input_size[2]; + int64_t input_height = input_size[3]; + int64_t input_width = input_size[4]; + + grad_input.resize_( + {nbatch, channels, input_depth, input_height, input_width}); + + OpCommand cmd; + cmd.Name("UpsampleTrilinear3dGrad") + .Input(grad_output) + .Output(grad_input) + .Attr("input_size", input_size) + .Attr("output_size", output_size) + .Attr("align_corners", 
align_corners) + .Run(); + + return grad_input; +} + +Tensor upsample_trilinear3d_backward_npu( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + + Tensor grad_input = OpPreparation::ApplyTensor(grad_output, input_size); + + upsample_trilinear3d_backward_out_npu(grad_input, grad_output, output_size, input_size, align_corners, scales_d, scales_h, scales_w); + + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..919c3fcdf768fa517e7e4a5e9fd29dfd71c898a1 --- /dev/null +++ b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp @@ -0,0 +1,76 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor& upsample_trilinear3d_out_npu( + Tensor& result, + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + + result.resize_({nbatch, channels, output_depth, output_height, output_width}); + + OpCommand cmd; + cmd.Name("UpsampleTrilinear3d") + .Input(input) + .Output(result) + .Attr("output_size", output_size) + .Attr("align_corners", align_corners) + .Run(); + + return result; +} + +Tensor upsample_trilinear3d_npu( + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + + Tensor result = OpPreparation::ApplyTensor(input, {1}); + + upsample_trilinear3d_out_npu(result, input, output_size, align_corners, scales_d, scales_h, scales_w); + + return result; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp b/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp index fcc771e143b9842874dfd898e8020c7739a644b8..9c025b9650ebc3d8d810080d207aa26045a76b11 100644 --- a/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -49,13 +48,7 @@ Tensor _s_where_npu( const Tensor& condition, const Tensor& self, const Tensor& other) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); // maskrcnn need dynamicshape function of op "SelectV2" string opName = c10::npu::OptionsManager::CheckDynamicEnable() ? "SelectV2" : "Select"; OpCommand cmd; diff --git a/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp index 0fe3fc20d89f600246810f46089793ed12dffaf4..c5c29cbb9640c2f03824493519f6a84a42a36fb9 100644 --- a/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -39,13 +38,7 @@ Tensor zeros_like_npu( auto result = at::empty_like(self, options, optional_memory_format); return result.fill_(0); } - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, options, CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, options); // calculate the output result of the NPU return result.zero_(); } diff --git a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp index df196cfe45fde948b8db96b44ff9e08d5cd20d8d..757bc2e258b143db8fa7b2928c10c4a8357c2feb 100644 --- a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -49,9 +48,9 @@ tuple _unique2_npu( bool return_inverse, bool return_counts) { if(self.numel() == 0){ - Tensor result= at::empty_with_format({0}, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - Tensor yInverse = at::empty_with_format({0}, self.options().dtype(kLong), CalcuOpUtil::get_tensor_npu_format(self)); - Tensor yCounts = at::empty_with_format({0}, self.options().dtype(kLong), CalcuOpUtil::get_tensor_npu_format(self)); + Tensor result= OpPreparation::ApplyTensor(self, {0}); + Tensor yInverse = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self); + Tensor yCounts = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self); return std::tie(result, yInverse, yCounts); } @@ -64,7 +63,7 @@ tuple _unique2_npu( selfCopy = self.to(ScalarType::Float); } - Tensor y = at::empty_with_format(std::get<0>(outputSizes), selfCopy.options(), CalcuOpUtil::get_tensor_npu_format(selfCopy)); + Tensor y = OpPreparation::ApplyTensor(selfCopy, std::get<0>(outputSizes)); Tensor yOutputSize = at::empty_with_format(std::get<1>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND); Tensor yInverse = at::empty_with_format(std::get<2>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND); Tensor yCounts = at::empty_with_format(std::get<0>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND); diff --git a/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp index 567f0080fc3dfbd27d9ceb79ef2453453f8ef627..5eab51b18541686083fdbbee3b288eaadeccbdb1 100644 --- a/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -71,25 +71,14 @@ Tensor __and___npu(const Tensor& self, const Tensor& other) { auto outputSize = broadcast_ops_npu_output_size(self, other); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, - outputTensor.options(), - CalcuOpUtil::get_tensor_npu_format(outputTensor)); - + Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize); // calculate the output result of the NPU __and___out_npu(result, self, other); return result; } Tensor __and___npu(const Tensor& self, Scalar other) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); __and___out_npu(result, self, other); return result; diff --git a/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp index c54448b8c8f6b9045ac494a8d799513b72171c31..e517cdaf856578e4557e7fb0b85f7c591332f8e4 100644 --- a/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp @@ -41,10 +41,7 @@ Tensor& __ior___out_npu(Tensor& result, const Tensor& self, Scalar other) { } Tensor& __ior___npu(Tensor& self, const Tensor& other) { - SmallVector inputs = {self, other}; - SmallVector outputs = {self}; - CalcuOpUtil::check_memory_over_laps(inputs, outputs); - + OpPreparation::CheckMemory({self, other}, {self}); if (!NpuUtils::check_match(&self)) { Tensor contiguousSelf = NpuUtils::format_contiguous(self); Tensor result = __ior___out_npu(contiguousSelf, contiguousSelf, other); diff 
--git a/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp index 808024748fd4c749f9da0bcd9dd3e102b485b8e1..89e628e1dc3fcc33113d7ca7e53dffedbe9f37a0 100644 --- a/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { diff --git a/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp index 79e9f5af3c55f40a2ff43046c94147a67534ed53..15e9827a281c7c595335e4354a5a9a302cdcb1ed 100644 --- a/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp @@ -12,63 +12,28 @@ using namespace at::native::npu; if (dim_tensor1 == 1 && dim_tensor2 == 1) { return tensor1.dot(tensor2); } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { - return tensor1.mv(tensor2); + return tensor1.mm(tensor2.unsqueeze(-1)).squeeze_(-1); } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { return tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { return tensor1.mm(tensor2); } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; - Tensor output = dim_tensor2 == 1 ? 
at::npu_bmmV2(tensor1, t2).squeeze(-1) : at::npu_bmmV2(tensor1, t2); + auto size1 = tensor1.sizes(); + auto size2 = t2.sizes(); + std::vector output_size; + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + if (dim_tensor2 > 1) { + output_size.push_back(size2[dim_tensor2 - 1]); + } + // fold the batch into the first dimension + Tensor t1 = tensor1.reshape({-1, tensor1.size(-1)}); + Tensor output = at::_unsafe_view(t1.mm(t2), output_size); return output; - // 需要支持out接口 } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { - // optimization: transpose the inner dimensions of the arguments, call - // matmul on the swapped arguments, then transpose the inner dimensions - // of the result. - const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1; - const int64_t m = tensor1.size(-1); - const int64_t p = tensor2.size(-1); - - const Tensor t2_T = tensor2.transpose(-1, -2); - const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t(); - const Tensor res_T = matmul_npu(t2_T, t1_T); - - if (dim_tensor1 == 2) { - Tensor res = res_T.transpose(-1, -2).contiguous(); - return res; - } - else { - std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); - shape.push_back(p); - - Tensor res = res_T.reshape(shape).contiguous(); - return res; - } + return at::npu_bmmV2(tensor1, tensor2, {}); } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { - int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1; - int64_t m1 = tensor1.size(-1); - IntArrayRef batch_tensor1(tensor1.sizes().data(), std::max(dim_tensor1 - 2, 0)); - int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1; - int64_t p = tensor2.size(-1); - IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); - - // expand the batch portion (i.e. 
cut off matrix dimensions and expand rest) - std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); - - std::vector tensor1_expand_size(expand_batch_portion); - tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); - - std::vector tensor2_expand_size(expand_batch_portion); - tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); - - - // flatten expanded batches - Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size); - Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size); - - Tensor output = at::npu_bmmV2(tensor1_expanded, tensor2_expanded); - return output; + return at::npu_bmmV2(tensor1, tensor2, {}); } AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", dim_tensor1, "D and ", dim_tensor2, "D"); diff --git a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp index 4374fc1c7ed5dee107f6c0308e502ddfbc06608a..f1cd79906cbef0e93bd36a73524de2bba9a988a6 100644 --- a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp @@ -90,16 +90,9 @@ Tensor &conv3d_out_npu(Tensor &result, const Tensor &input, Tensor conv3d_npu(const Tensor &input, const Tensor &weight, const Tensor &bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { - // calculate the output size - auto outputSize = conv3d_npu_output_size( input, weight, bias, stride, padding, dilation, groups); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(input, outputSize); conv3d_out_npu(result, input, weight, bias, stride, padding, dilation, groups); return result; diff --git a/src/aten/src/ATen/native/npu/frame/OpCommandBase.h 
b/src/aten/src/ATen/native/npu/frame/OpCommandBase.h index 140170f641de335337b89d9defc7aee5c1c48bab..602b3d7ca733f750f3121e0ce039fbd7ed61c3df 100644 --- a/src/aten/src/ATen/native/npu/frame/OpCommandBase.h +++ b/src/aten/src/ATen/native/npu/frame/OpCommandBase.h @@ -20,7 +20,6 @@ #include "ATen/native/npu/frame/OpCmdHelper.h" #include "ATen/native/npu/frame/OpParamMaker.h" #include "ATen/native/npu/utils/DynamicShapeUtil.h" -#include "ATen/native/npu/utils/CalcuOpUtil.h" #include "ATen/native/npu/utils/NpuUtils.h" #include "THNPU/THNPUCachingHostAllocator.h" namespace at { @@ -121,7 +120,6 @@ class OpCommandBase { } void Run(){ - NpuUtils::SetCompileOptOnce(); if (c10::npu::OptionsManager::CheckQueueEnable()) { ExecuteParas params; aclCmd->ExportParams(params); @@ -211,11 +209,19 @@ class OpCommandBase { return storage.back(); } Tensor CopyHostToDevice(const Scalar& scalar, ScalarType type) { - storage.emplace_back(CalcuOpUtil::CopyScalarToDevice(scalar, type)); - return storage.back(); + auto tensor = scalar_to_tensor(scalar).to(type); + return CopyHostToDevice(tensor); } Tensor CopyHostToDevice(const Tensor& cpuTensor) { - storage.emplace_back(CalcuOpUtil::copy_tensor_host_to_device(cpuTensor)); + Tensor cpuPinMemTensor = cpuTensor.pin_memory(); + int deviceIndex = 0; + AT_NPU_CHECK(aclrtGetDevice(&deviceIndex)); + auto tensor = cpuPinMemTensor.to( + c10::Device(DeviceType::NPU, deviceIndex), + cpuPinMemTensor.scalar_type(), + true, + true); + storage.emplace_back(tensor); return storage.back(); } Tensor CreateHostTensor(void* data, IntArrayRef sizes, diff --git a/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h b/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h index 7cc10880015104739a74e9f2de44e8e5e153f5e6..0fd4d6695499d73d53ac82a2c809dd461f7cf8df 100644 --- a/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h +++ b/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h @@ -19,8 +19,6 @@ #include #include #include 
"ATen/native/npu/frame/OpParamMaker.h" -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/NpuUtils.h" #include "c10/npu/NPUStream.h" namespace at { diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp index d285f2b6711ffa85698164f3f2fd46685c2729b3..73dcdd8ec5cccb9633c455dee0a320dd05ea70cc 100644 --- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp +++ b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp @@ -17,15 +17,131 @@ #include #include "c10/npu/NPUQueue.h" #include -#include "ATen/native/npu/utils/DynamicShapeUtil.h" #include "ATen/native/npu/aoe/AutoTune.h" +#include "ATen/native/npu/utils/DynamicShapeUtil.h" #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h" -#include "ATen/native/GlobalStep.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/interface/EnvVariables.h" namespace at { namespace native { namespace npu { +void OpAttrMaker::Set(aclopAttr* attr, string name, bool value) { + aclopSetAttrBool(attr, name.c_str(), value); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, int64_t value) { + aclopSetAttrInt(attr, name.c_str(), value); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, float value) { + aclopSetAttrFloat(attr, name.c_str(), value); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, string value) { + aclopSetAttrString(attr, name.c_str(), value.c_str()); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, IntArrayRef value) { + auto vec = value.vec(); + aclopSetAttrListInt(attr, name.c_str(), vec.size(), vec.data()); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, at::ArrayRef value) { + auto vec = value.vec(); + aclopSetAttrListFloat(attr, name.c_str(), vec.size(), vec.data()); +} + +void OpAttrMaker::Set(aclopAttr* attr, string name, Scalar value) { + float val = CalcuOpUtil::get_scalar_float_value(value); + aclopSetAttrFloat(attr, name.c_str(), val); +} + + +void 
OpAttrMaker::Set( + aclopAttr* attr, + string name, + at::ArrayRef value) { + // Pointer to values of each listInt. + SmallVector attrValue; + // Pointer to number of each listInt. + SmallVector eachListIntNum; + // Value of each listInt. + SmallVector, N> eachListIntVal; + for (int i = 0; i < value.size(); i++) { + SmallVector listInt; + int64_t valueSize = value[i].size(); + listInt.resize(valueSize); + std::copy(value[i].begin(), value[i].end(), listInt.begin()); + eachListIntVal.emplace_back(listInt); + attrValue.emplace_back(eachListIntVal.back().data()); + eachListIntNum.emplace_back(valueSize); + } + + aclopSetAttrListListInt( + attr, + name.c_str(), + attrValue.size(), + eachListIntNum.data(), + attrValue.data()); +} + + +void AttrInfoMaker::Add(bool value, string& attrInfo) { + attrInfo += to_string(value) + "-"; +} + +void AttrInfoMaker::Add(int64_t value, string& attrInfo) { + attrInfo += to_string(value) + "-"; +} + +void AttrInfoMaker::Add(float value, string& attrInfo) { + attrInfo += to_string(value) + "-"; +} + +void AttrInfoMaker::Add(string value, string& attrInfo) { + attrInfo += value + "-"; +} + +void AttrInfoMaker::Add(IntArrayRef value, string& attrInfo) { + auto vec = value.vec(); + for (unsigned i = 0; i < vec.size(); i++) + attrInfo += to_string(vec.at(i)) + ","; + attrInfo += "-"; +} + +void AttrInfoMaker::Add( + at::ArrayRef value, + string& attrInfo) { + auto vec = value.vec(); + for (unsigned i = 0; i < vec.size(); i++) + attrInfo += to_string(vec.at(i)) + ","; + attrInfo += "-"; +} + +void AttrInfoMaker::Add(Scalar value, string& attrInfo) { + float val = CalcuOpUtil::get_scalar_float_value(value); + attrInfo += to_string(val) + "-"; +} + +void AttrInfoMaker::Add( + at::ArrayRef value, + string& attrInfo) { + // Pointer to values of each listInt. + SmallVector attrValue; + // Pointer to number of each listInt. + SmallVector eachListIntNum; + // Value of each listInt. 
+ SmallVector, N> eachListIntVal; + for (int i = 0; i < value.size(); i++) { + int64_t valueSize = value[i].size(); + attrInfo += to_string(valueSize) + ","; + } + attrInfo += "-"; +} + + void OpCommandImpl::Run() { InitAttr(); NPU_LOGD("Op %s Run.", opName.c_str()); @@ -40,7 +156,7 @@ aclError OpCommandImpl::InnerRun(string name, AclExecParam& params) { auto inputSize = params.inBuffer.size(); auto outputSize = params.outBuffer.size(); bool reset_flag = false; - if (check_fuzz_enable() && + if (env::CheckFuzzyEnable() && FuzzyCompileBlacklist::GetInstance().IsInBlacklist(name)) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; @@ -73,7 +189,7 @@ int ExecFunc(void* in, aclrtStream stream) { ret = DynamicRun(*cur_paras, stream); } else { bool reset_flag = false; - if (check_fuzz_enable() && + if (env::CheckFuzzyEnable() && FuzzyCompileBlacklist::GetInstance().IsInBlacklist(cur_paras->opType)) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.h b/src/aten/src/ATen/native/npu/frame/OpParamMaker.h index adf2c00f7bdd04ca7547af671069f22f58c5eb67..bf0f28830e26bb9c444fce5e492dbbb10fb5cab2 100644 --- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.h +++ b/src/aten/src/ATen/native/npu/frame/OpParamMaker.h @@ -18,8 +18,7 @@ #include #include -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/NpuUtils.h" +#include "ATen/native/npu/frame/NPUDefine.h" #include "ATen/native/npu/interface/Graph.h" #include "c10/npu/NPUStream.h" @@ -31,120 +30,29 @@ namespace npu { // class OpAttrMaker { public: - static void Set(aclopAttr* attr, string name, bool value) { - aclopSetAttrBool(attr, name.c_str(), value); - } - - static void Set(aclopAttr* attr, string name, int64_t value) { - aclopSetAttrInt(attr, name.c_str(), value); - } - - static void Set(aclopAttr* attr, string name, float value) { - aclopSetAttrFloat(attr, name.c_str(), 
value); - } - - static void Set(aclopAttr* attr, string name, string value) { - aclopSetAttrString(attr, name.c_str(), value.c_str()); - } - - static void Set(aclopAttr* attr, string name, IntArrayRef value) { - auto vec = value.vec(); - aclopSetAttrListInt(attr, name.c_str(), vec.size(), vec.data()); - } - - static void Set(aclopAttr* attr, string name, at::ArrayRef value) { - auto vec = value.vec(); - aclopSetAttrListFloat(attr, name.c_str(), vec.size(), vec.data()); - } - - static void Set(aclopAttr* attr, string name, Scalar value) { - float val = CalcuOpUtil::get_scalar_float_value(value); - aclopSetAttrFloat(attr, name.c_str(), val); - } - + static void Set(aclopAttr* attr, string name, bool value); + static void Set(aclopAttr* attr, string name, int64_t value); + static void Set(aclopAttr* attr, string name, float value); + static void Set(aclopAttr* attr, string name, string value); + static void Set(aclopAttr* attr, string name, IntArrayRef value); + static void Set(aclopAttr* attr, string name, at::ArrayRef value); + static void Set(aclopAttr* attr, string name, Scalar value); static void Set( aclopAttr* attr, string name, - at::ArrayRef value) { - // Pointer to values of each listInt. - SmallVector attrValue; - // Pointer to number of each listInt. - SmallVector eachListIntNum; - // Value of each listInt. 
- SmallVector, N> eachListIntVal; - for (int i = 0; i < value.size(); i++) { - SmallVector listInt; - int64_t valueSize = value[i].size(); - listInt.resize(valueSize); - std::copy(value[i].begin(), value[i].end(), listInt.begin()); - eachListIntVal.emplace_back(listInt); - attrValue.emplace_back(eachListIntVal.back().data()); - eachListIntNum.emplace_back(valueSize); - } - - aclopSetAttrListListInt( - attr, - name.c_str(), - attrValue.size(), - eachListIntNum.data(), - attrValue.data()); - } + at::ArrayRef value); }; // class OpAttrMaker class AttrInfoMaker { public: - static void Add(bool value, string& attrInfo) { - attrInfo += to_string(value) + "-"; - } - - static void Add(int64_t value, string& attrInfo) { - attrInfo += to_string(value) + "-"; - } - - static void Add(float value, string& attrInfo) { - attrInfo += to_string(value) + "-"; - } - - static void Add(string value, string& attrInfo) { - attrInfo += value + "-"; - } - - static void Add(IntArrayRef value, string& attrInfo) { - auto vec = value.vec(); - for (unsigned i = 0; i < vec.size(); i++) - attrInfo += to_string(vec.at(i)) + ","; - attrInfo += "-"; - } - - static void Add( - at::ArrayRef value, - string& attrInfo) { - auto vec = value.vec(); - for (unsigned i = 0; i < vec.size(); i++) - attrInfo += to_string(vec.at(i)) + ","; - attrInfo += "-"; - } - - static void Add(Scalar value, string& attrInfo) { - float val = CalcuOpUtil::get_scalar_float_value(value); - attrInfo += to_string(val) + "-"; - } - - static void Add( - at::ArrayRef value, - string& attrInfo) { - // Pointer to values of each listInt. - SmallVector attrValue; - // Pointer to number of each listInt. - SmallVector eachListIntNum; - // Value of each listInt. 
- SmallVector, N> eachListIntVal; - for (int i = 0; i < value.size(); i++) { - int64_t valueSize = value[i].size(); - attrInfo += to_string(valueSize) + ","; - } - attrInfo += "-"; - } + static void Add(bool value, string& attrInfo); + static void Add(int64_t value, string& attrInfo); + static void Add(float value, string& attrInfo); + static void Add(string value, string& attrInfo); + static void Add(IntArrayRef value, string& attrInfo); + static void Add(at::ArrayRef value,string& attrInfo); + static void Add(Scalar value, string& attrInfo); + static void Add(at::ArrayRef value, string& attrInfo); }; // diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp index 8332005e50a493869c99778703935bb8b25b818f..1985cbffbbc309661fae435fda629967cf31f0ca 100644 --- a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp +++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp @@ -2,8 +2,10 @@ #include "c10/npu/register/OptionRegister.h" #include "c10/util/Exception.h" +#include "ATen/native/npu/utils/NpuFuzzyBlacklist.h" +#include "ATen/native/npu/utils/NpuProfilingDispatch.h" #include - +#include namespace at { namespace native { namespace npu { @@ -15,18 +17,51 @@ REGISTER_OPTION_BOOL_FUNCTION(AutoTuneEnabled, autotune, "disable", "enable") REGISTER_OPTION_INIT_BY_ENV(bmmv2_enable) REGISTER_OPTION_BOOL_FUNCTION(CheckBmmV2Enable, bmmv2_enable, "0", "1") -REGISTER_OPTION(ACL_OP_DEBUG_LEVEL) -REGISTER_OPTION(ACL_DEBUG_DIR) -REGISTER_OPTION(ACL_OP_COMPILER_CACHE_MODE) -REGISTER_OPTION(ACL_OP_COMPILER_CACHE_DIR) -REGISTER_OPTION(NPU_FUZZY_COMPILE_BLACKLIST) - REGISTER_OPTION_HOOK(mdldumpswitch, [](const std::string& val) { - if (val == "init") { aclmdlInitDump(); } - else if (val == "finalize") { aclmdlFinalizeDump(); } - else { TORCH_CHECK(0, "set initdump value only support init or finalize, but got ", val); } + if (val == "enable") { aclmdlInitDump(); } + else { aclmdlFinalizeDump(); } }) 
REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string& val) { aclmdlSetDump(val.c_str()); }) + +REGISTER_OPTION_HOOK(fuzzycompileswitch, [](const std::string& val) { + if (val == "enable") { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); } + else { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); } + }) +REGISTER_OPTION_BOOL_FUNCTION(CheckFuzzyEnable, fuzzycompileswitch, "disable", "enable") + +REGISTER_OPTION_HOOK(ACL_OP_DEBUG_LEVEL, [](const std::string& val) { + aclSetCompileopt(aclCompileOpt::ACL_OP_DEBUG_LEVEL, val.c_str()); + }) +REGISTER_OPTION_HOOK(ACL_DEBUG_DIR, [](const std::string& val) { + aclSetCompileopt(aclCompileOpt::ACL_DEBUG_DIR, val.c_str()); + }) +REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_MODE, [](const std::string& val) { + aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_MODE, val.c_str()); + }) +REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_DIR, [](const std::string& val) { + aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_DIR, val.c_str()); + }) +REGISTER_OPTION_HOOK(NPU_FUZZY_COMPILE_BLACKLIST, [](const std::string& val) { + if (CheckFuzzyEnable()) { + FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val); + } + }) + + REGISTER_OPTION_INIT_BY_ENV(PROFILING_MODE) + REGISTER_OPTION_BOOL_FUNCTION(CheckProfilingEnable, PROFILING_MODE, "false", "true"); + + REGISTER_OPTION_HOOK(deliverswitch, [](const std::string& val) { + TORCH_CHECK( + CheckProfilingEnable(), + "before you prepare to deliver op, ", + "you should be enture profiling mode is on correctly!"); + if (val == "enable"){ + at::native::npu::NpuProfilingDispatch::Instance().start(); + } else { + at::native::npu::NpuProfilingDispatch::Instance().stop(); + } + }) + } // namespace env } // namespace npu } // namespace native diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.h b/src/aten/src/ATen/native/npu/interface/EnvVariables.h index d17617d41601bdd6a6229341a1af96c79193134f..bfb3c057e73c465e4e5b62ffc78fc740bc59de96 100644 
--- a/src/aten/src/ATen/native/npu/interface/EnvVariables.h +++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.h @@ -26,6 +26,8 @@ namespace env { */ bool AutoTuneEnabled(); bool CheckBmmV2Enable(); +bool CheckFuzzyEnable(); +bool CheckProfilingEnable(); } // namespace env } // namespace npu diff --git a/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp b/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp similarity index 40% rename from src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp rename to src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp index 3ba477702b576db92f76c8434fd97f0e8c17b2d3..be7343ee187bc74d87861ba17b529cc55a502154 100644 --- a/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp +++ b/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp @@ -12,50 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "ATen/native/npu/utils/OpAdapter.h" #include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" namespace at { namespace native { using namespace at::native::npu; -SmallVector multilabel_margin_loss_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector multilabel_margin_loss_npu_output( - const SmallVector& outputTensor) { - return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); -} - -SmallVector multilabel_margin_loss_npu_attr(int64_t reduction) { - string reductionStr; - if (reduction == Reduction::None) { - reductionStr = "none"; - } else if (reduction == Reduction::Mean) { - reductionStr = "mean"; - } else if (reduction == Reduction::Sum) { - reductionStr = "sum"; - } - - NPUAttrDesc npuAttrReduction = NPUAttrDesc("reduction", reductionStr); - SmallVector attrs = {npuAttrReduction}; - - return attrs; -} - std::tuple 
multilabel_margin_loss_forward_out_npu( Tensor& output, Tensor& is_target, const Tensor& self, const Tensor& target, int64_t reduction) { - auto inputs = multilabel_margin_loss_npu_input({self, target}); - auto outputs = multilabel_margin_loss_npu_output({output, is_target}); - auto attrs = multilabel_margin_loss_npu_attr(reduction); - CalcuOpUtil::execute_npu_operate("MultilabelMarginLoss", inputs, outputs, attrs); + + string reductionStr = CalcuOpUtil::get_reduction_str(reduction); + OpCommand cmd; + cmd.Name("MultilabelMarginLoss") + .Input(self) + .Input(target) + .Output(output) + .Output(is_target) + .Attr("reduction", reductionStr) + .Run(); + return std::tuple(output, is_target); } @@ -79,39 +59,13 @@ std::tuple multilabel_margin_loss_forward_npu( outputSize = {nframe}; } - auto output = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - auto is_target = at::empty_with_format(target.sizes(), target.options(), CalcuOpUtil::get_tensor_npu_format(target)); + auto output = OpPreparation::ApplyTensor(self, outputSize); + auto is_target = OpPreparation::ApplyTensor(target); multilabel_margin_loss_forward_out_npu( output, is_target, self, target, reduction); return std::make_tuple(output, is_target); } -Tensor& multilabel_margin_loss_backward_npu_out( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - int64_t reduction, - const Tensor& is_target) { - auto inputs = multilabel_margin_loss_npu_input({self, grad_output, target, is_target}); - auto outputs = multilabel_margin_loss_npu_output({grad_input}); - auto attrs = multilabel_margin_loss_npu_attr(reduction); - CalcuOpUtil::execute_npu_operate("MultilabelMarginLossGrad", inputs, outputs, attrs); - return grad_input; -} - -Tensor multilabel_margin_loss_backward_npu( - const Tensor& grad_output, - const Tensor& self, - const Tensor& target, - int64_t reduction, - const Tensor& is_target) { - auto grad_input = 
at::empty_with_format(self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - multilabel_margin_loss_backward_npu_out( - grad_input, grad_output, self, target, reduction, is_target); - return grad_input; -} - } // namespace native } // namespace at diff --git a/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp index fa7fb25a6ca3bde8f366294bfeacce0e3c10c8f3..9fd7906afa36fb80ea6784eac37706e4e97900af 100644 --- a/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -73,6 +74,7 @@ tuple nll_loss2d_forward_out_npu( .Input(target) .Input(weight_tensor) .Attr("reduction", reductionStr) + .Attr("ignore_index", ignore_index) .Output(result) .Output(total_weight) .Run(); diff --git a/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp index 7bddf995a685651b777df505a073375d8930a1d4..a1da5294f1173a07ad75a5b27c7d64e7f17d247e 100644 --- a/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -52,8 +52,15 @@ Tensor& nll_loss_backward_out_npu( string reductionStr = CalcuOpUtil::get_reduction_str(reduction); Tensor targetCast = target; - if (target.scalar_type() == at::kLong || target.scalar_type() == at::kFloat) { + auto scalar_type = target.scalar_type(); + if (scalar_type == at::kLong) { targetCast = target.to(at::kInt); + } else if (scalar_type == at::kInt) { + ; + } + else { + AT_ERROR("Expected object of scalar type ", at::kLong, " or ", at::kInt, " but got scalar type ", scalar_type, + " for argument 'target' in call to nll_loss_backward"); } OpCommand cmd; diff --git a/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp index 5268ef88356d433b8ed01ea3c208910a0b0a0b1c..10d4093cc378a8ed848ccb8dd277e71e06b5b3a9 100644 --- a/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -51,9 +51,16 @@ tuple nll_loss_forward_out_npu( string reductionStr = CalcuOpUtil::get_reduction_str(reduction); Tensor targetCast = target; - if (target.scalar_type() == at::kLong || target.scalar_type() == at::kFloat) { + auto scalar_type = target.scalar_type(); + if (scalar_type == at::kLong) { targetCast = target.to(at::kInt); - } + } else if (scalar_type == at::kInt) { + ; + } + else { + AT_ERROR("Expected object of scalar type ", at::kLong, " or ", at::kInt, " but got scalar type ", scalar_type, + " for argument 'target' in call to nll_loss_forward"); + } OpCommand cmd; cmd.Name("NLLLoss") diff --git a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp index 9682af4fce8a38e7ad5d090be8ab0ff7fb6bb6ee..4bf6a9838141bb14fd8c631f626cf1437a9b3602 100644 --- a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp @@ -14,8 +14,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -58,16 +58,8 @@ Tensor& adaptive_avg_pool2d_backward_out_npu( Tensor adaptive_avg_pool2d_backward_npu( const Tensor& grad_output, const Tensor& self) { - // calculate the output size - auto outputSize = input_same_output_size(self); - - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU + Tensor result = OpPreparation::ApplyTensor(self); adaptive_avg_pool2d_backward_out_npu(result, grad_output, self); - return result; } diff --git a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp index f37a1f2744533e668811a3128e427846b172379b..077f0ece2be6661a193a1789fa0851b48cb27469 100644 --- a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp @@ -15,8 +15,8 @@ // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" namespace at { namespace native { @@ -51,12 +51,7 @@ Tensor& adaptive_avg_pool3d_backward_out_npu( } Tensor adaptive_avg_pool3d_backward_npu(const Tensor& grad_output, const Tensor& self){ - // calcul the output size - auto outputsize = input_same_output_size(self); - - Tensor result = at::empty_with_format( - outputsize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self); adaptive_avg_pool3d_backward_out_npu(result, grad_output, self); return result; } diff --git a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp index 9bf50adb1a311f8a642668477f80c053418ca368..aae01114039912bbbeb7f35270fc8362492c9264 100644 --- a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp @@ -14,8 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { @@ -110,9 +109,7 @@ Tensor avg_pool2d_npu( divisor_override); // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - outputSizes, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - + Tensor result = OpPreparation::ApplyTensor(self, outputSizes); // calculate the output result of the NPU avg_pool2d_out_npu( result, diff --git a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp index a1374c3bf15a4c86f5742b4aeeb0f11a8ab06f18..5ffa6dc9e4327418977757fe22b474c563bb0df7 100644 --- a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. #include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include namespace at { diff --git a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp index 47e2bb0e091ef1e2157c07f5849293fb3533542f..a76bf972b96c1e2e258194fe48b9e6f86beb41b3 100644 --- a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp @@ -15,6 +15,7 @@ // limitations under the License. 
#include "ATen/native/npu/utils/OpAdapter.h" +#include "ATen/native/npu/utils/CalcuOpUtil.h" #include namespace at { diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp index 33abf12e71ace29e2c19d3b5ac99df1b8002d19d..c5e2eba6ecc3bfdcad270781a578eb2eb01461e2 100644 --- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp @@ -26,7 +26,7 @@ #include "c10/npu/NPUCachingAllocator.h" #include "c10/npu/OptionsManager.h" #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h" -#include "ATen/native/GlobalStep.h" +#include "ATen/native/npu/interface/EnvVariables.h" namespace at { namespace native { @@ -602,7 +602,6 @@ void CalcuOpUtil::execute_npu_operate( SmallVector& inputs, SmallVector& outputs, const SmallVector& attrs) { - NpuUtils::SetCompileOptOnce(); if (c10::npu::OptionsManager::CheckQueueEnable() || c10::npu::OptionsManager::CheckDynamicEnable()) { ExecuteParas cur_paras; @@ -631,7 +630,7 @@ void CalcuOpUtil::execute_npu_operate( auto stream = c10::npu::getCurrentNPUStream(); RECORD_FUNCTION(opName, std::vector({})); bool reset_flag = false; - if (check_fuzz_enable() && + if (env::CheckFuzzyEnable() && FuzzyCompileBlacklist::GetInstance().IsInBlacklist(opName)) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp index 598c44b8715f34c3986fc2d8d9baf81931bf6fa9..150711feecf70d648b02167e36b80eb5f29d18a1 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp @@ -200,23 +200,6 @@ SmallVector cdist_npu_output_size( return output_shape; } -SmallVector conv_tbc_npu_output_size( - const Tensor& self, - const Tensor& weight, - const Tensor& bias, - int64_t pad) { - int64_t N = self.size(1); - int64_t H = 1; - 
int64_t W = self.size(0); - int64_t Co = weight.size(2); - int64_t Ho = 1; - int64_t Wo = (W + 2 * pad - (weight.size(0) - 1) - 1) + 1; - - SmallVector outputSize = {N, Co, Ho, Wo}; - - return outputSize; -} - tuple> conv2d_backward_npu_output_size( const Tensor& input, @@ -267,7 +250,7 @@ SmallVector convolution_transpose_npu_output_size( int64_t N = input.size(0); int64_t H = input.size(2); int64_t W = input.size(3); - int64_t Co = weight.size(1); + int64_t Co = weight.size(1) * groups; auto kernel_size = weight.sizes().slice(2); int64_t Ho = (H - 1) * stride[0] - 2 * padding[0] + @@ -327,12 +310,7 @@ ctc_loss_npu_output_size( SmallVector negLogLikelihoodSize = {batchSize}; - // tSize = 2*max(target_lengths)+1 - int64_t maxLength = 0; - for(int i = 0; i < targetLengths.size(); i++) { - maxLength = targetLengths[i] > maxLength? targetLengths[i]: maxLength; - } - + int64_t maxLength = targets.size(1); int64_t tSize = 2 * maxLength + 1; SmallVector logAlphaSize = {batchSize, maxInputLength, tSize}; @@ -910,15 +888,6 @@ softmax_cross_entropy_with_logits_impl_npu_output_size(const Tensor& self) { resultSize, backpropSize); } -tuple, SmallVector> std_npu_output_size(const Tensor & self, IntArrayRef dim, bool keepdim) -{ - SmallVector outputSize; - SmallVector meanSize; - outputSize = reduce_ops_npu_output_size(self, dim, keepdim); - meanSize = reduce_ops_npu_output_size(self, dim, keepdim); - return tuple, SmallVector>(outputSize, meanSize); -} - SmallVector sum_npu_output_size( const Tensor& self, IntArrayRef dim, diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h index bf520b1ce13ebb32cc56d6dfcb3a06810ba118df..9290da7ddd91ee55d3e88cf46fc065973ab0a4be 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h @@ -110,12 +110,6 @@ SmallVector cdist_npu_output_size( const Tensor& x1, const Tensor& x2); -SmallVector 
conv_tbc_npu_output_size( - const Tensor& self, - const Tensor& weight, - const Tensor& bias, - int64_t pad); - tuple> conv2d_backward_npu_output_size( const Tensor& input, @@ -352,9 +346,6 @@ SmallVector transpose_npu_output_size( tuple, SmallVector> softmax_cross_entropy_with_logits_impl_npu_output_size(const Tensor& self); -tuple, SmallVector> -std_npu_output_size(const Tensor & self, IntArrayRef dim, bool keepdim); - SmallVector sum_npu_output_size( const Tensor& self, IntArrayRef dim, diff --git a/src/aten/src/ATen/native/GlobalStep.cpp b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp similarity index 32% rename from src/aten/src/ATen/native/GlobalStep.cpp rename to src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp index c757140616817917f7f92c5e305fa676adc4db95..a6a98f7aa32bfea206b0b77b526baf55139b7b91 100644 --- a/src/aten/src/ATen/native/GlobalStep.cpp +++ b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp @@ -14,59 +14,55 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "GlobalStep.h" -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include +#include "NpuProfilingDispatch.h" +#include +#include namespace at { namespace native { +namespace npu { -GlobalStep& GlobalStep::Instance() { - static GlobalStep globalStep(0, 1); - return globalStep; +NpuProfilingDispatch& NpuProfilingDispatch::Instance(){ + static NpuProfilingDispatch npuProfilingDispatch; + return npuProfilingDispatch; } -void GlobalStep::GlobalStepInc() { - GLOBAL_STEP++; +void NpuProfilingDispatch::init(){ + profStepInfo = c10::npu::acl::init_stepinfo(); } -int64_t GlobalStep::GetGlobalStep() const { - return GLOBAL_STEP; +void NpuProfilingDispatch::start(){ + this->init(); + auto stream = c10::npu::getCurrentNPUStream(); + auto ret = c10::npu::acl::start_deliver_op( + profStepInfo, + aclprofStepTag::ACL_STEP_START, + stream); + if(ret != ACL_ERROR_NONE){ + NPU_LOGE("npu profiling start fail, error code: %d", ret); + C10_NPU_SHOW_ERR_MSG(); + } } -void GlobalStep::SetStartFuzzCompileStep(const int64_t step) { - START_FUZZ_COMPILE_STEP = step; +void NpuProfilingDispatch::stop(){ + auto stream = c10::npu::getCurrentNPUStream(); + auto ret = c10::npu::acl::stop_deliver_op( + profStepInfo, + aclprofStepTag::ACL_STEP_END, + stream); + if(ret != ACL_ERROR_NONE){ + NPU_LOGE("npu profiling stop fail, error code: %d", ret); + C10_NPU_SHOW_ERR_MSG(); + } + this->destroy(); } -int64_t GlobalStep::GetStartFuzzCompileStep() const { - return START_FUZZ_COMPILE_STEP; +void NpuProfilingDispatch::destroy(){ + if(profStepInfo != nullptr){ + c10::npu::acl::destroy_stepinfo(profStepInfo); + } } -TORCH_NPU_API bool check_fuzz_enable(){ - int64_t globalstep = GlobalStep::Instance().GetGlobalStep(); - int64_t globalstartstep = GlobalStep::Instance().GetStartFuzzCompileStep(); - - return (globalstep >= globalstartstep); -} - -void global_step_inc() { - #ifdef USE_NPU - GlobalStep::Instance().GlobalStepInc(); - // To invoke the interface only once, check whether the GLOBAL_STEP equal 
to START_FUZZ_COMPILE_STEP is OK. - if(GlobalStep::Instance().GetGlobalStep() == GlobalStep::Instance().GetStartFuzzCompileStep()) { - NPU_LOGD("GLOBAL_STEP = %ld, START_FUZZ_COMPILE_STEP = %ld, start fuzz compile!", - GlobalStep::Instance().GetGlobalStep(), GlobalStep::Instance().GetStartFuzzCompileStep()); - - aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); - } - #endif } - -void set_start_fuzz_compile_step(int64_t step) { - #ifdef USE_NPU - GlobalStep::Instance().SetStartFuzzCompileStep(step); - #endif -} - } } diff --git a/src/aten/src/ATen/native/GlobalStep.h b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h similarity index 56% rename from src/aten/src/ATen/native/GlobalStep.h rename to src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h index ad38c6cf19b0ceff140e85647f08a1198efa1bcf..b455ad3704c5f90bdf8d1ead7d2d0a89768f81f8 100644 --- a/src/aten/src/ATen/native/GlobalStep.h +++ b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h @@ -13,32 +13,32 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#pragma once -#include -#include +#ifndef __NPU_PROFILING_DISPATCH__ +#define __NPU_PROFILING_DISPATCH__ + +#include namespace at { namespace native { +namespace npu { -class GlobalStep +class NpuProfilingDispatch { - public: - static GlobalStep& Instance(); - void GlobalStepInc(); - int64_t GetGlobalStep() const; - void SetStartFuzzCompileStep(const int64_t step); - int64_t GetStartFuzzCompileStep() const; - ~GlobalStep() = default; - - private: - int64_t GLOBAL_STEP; - int64_t START_FUZZ_COMPILE_STEP; - GlobalStep(int64_t globalstep, int64_t startstep) { - GLOBAL_STEP = globalstep; - START_FUZZ_COMPILE_STEP = startstep; - } + public: + static NpuProfilingDispatch& Instance(); + void start(); + void stop(); + private: + aclprofStepInfo* profStepInfo = nullptr; + NpuProfilingDispatch() = default; + ~NpuProfilingDispatch() = default; + void init(); + void destroy(); }; -TORCH_NPU_API bool check_fuzz_enable(); + } -} \ No newline at end of file +} +} + +#endif // __NPU_PROFILING_DISPATCH__ \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp index c8379a959aa4a47fad610ee4795cbe3d92184ad7..daa10e3ab1c6e331dc08102e8d7894a3a2ba8f16 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp @@ -24,43 +24,12 @@ #include "ATen/native/npu/frame/StorageDescHelper.h" #include "KernelNpuOutputSize.h" #include -#include "NpuFuzzyBlacklist.h" -#include "ATen/native/GlobalStep.h" +#include "ATen/native/npu/interface/EnvVariables.h" #include namespace at { namespace native { namespace npu { -namespace{ - std::once_flag CompileOptOnceFlag; -} - -void NpuUtils::SetCompileOptOnce() { - std::call_once(CompileOptOnceFlag, [](){ - static std::map STRING_TYPE_TO_ACL_COMPILE_OPTION_MAP = { - {"ACL_OP_DEBUG_LEVEL", ACL_OP_DEBUG_LEVEL}, - {"ACL_DEBUG_DIR", ACL_DEBUG_DIR}, - {"ACL_OP_COMPILER_CACHE_MODE", ACL_OP_COMPILER_CACHE_MODE}, - 
{"ACL_OP_COMPILER_CACHE_DIR", ACL_OP_COMPILER_CACHE_DIR}, - }; - for (const auto &iter : STRING_TYPE_TO_ACL_COMPILE_OPTION_MAP) { - auto key = iter.second; - auto val = c10::npu::GetOption(iter.first); - if (val.has_value()) { - aclSetCompileopt(key, val.value().c_str()); - } - } - static std::set STRING_COMPILE_OPTION_SET = { - {"NPU_FUZZY_COMPILE_BLACKLIST"}, - }; - for (const auto &iter : STRING_COMPILE_OPTION_SET) { - auto val = c10::npu::GetOption(iter); - if(check_fuzz_enable() && val.has_value()) - FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val.value()); - } - - }); -} void NpuUtils::format_fresh_view( Tensor& x, diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.h b/src/aten/src/ATen/native/npu/utils/NpuUtils.h index dc33b853df3daf2b37d89551fa7ba8d770696c8c..d9797e289977defac21ded7f2ed0793debf6ec5c 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.h +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.h @@ -49,10 +49,7 @@ typedef enum MemoryType{ class NpuUtils { public: - /** - This API is used to set compile option. 
- */ - CAFFE2_API static void SetCompileOptOnce(); + static bool check_match(const Tensor* tensor); static Tensor format_contiguous(const Tensor& src); static Tensor format_contiguous_add_copy_optimize(const Tensor& src); diff --git a/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h b/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h index 8e0fde12e4496af33224ad177d39dbd6183b9316..689143c8ee5ab64579de6a1696bb622b82d85bdd 100644 --- a/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h +++ b/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h @@ -116,8 +116,7 @@ class OpPipeWithMultiOut { OpPreparation::CheckOut( inputs, std::get(this->funcParams), - CalcuOpUtil::get_tensor_npu_format(src), - src.scalar_type(), + src, size); return *this; } diff --git a/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp b/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp index 2c6cd8d666dc2f640d43e6113a3a284384449a7a..0d256094c7c169edacb7851c5f0d23dbeab0f8a0 100644 --- a/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp +++ b/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp @@ -110,6 +110,19 @@ void OpPreparation::CheckOut( dst.sizes()); } +void OpPreparation::CheckOut( + const std::initializer_list& inputs, + Tensor& output, + Tensor dst, + IntArrayRef shape) { + CheckOut( + inputs, + output, + CalcuOpUtil::get_tensor_npu_format(dst), + dst.scalar_type(), + shape); +} + void OpPreparation::CheckOut( const std::initializer_list& input, Tensor& output, diff --git a/src/aten/src/ATen/native/npu/utils/OpPreparation.h b/src/aten/src/ATen/native/npu/utils/OpPreparation.h index 0f11af4270fa19d39839197f106a2f611c511708..02bc66f3f322dc56f066c26cc8d8dc0cd7de9d84 100644 --- a/src/aten/src/ATen/native/npu/utils/OpPreparation.h +++ b/src/aten/src/ATen/native/npu/utils/OpPreparation.h @@ -54,6 +54,10 @@ public: static void CheckOut( const std::initializer_list& inputs, Tensor& output, Tensor dst); + static void CheckOut( + const std::initializer_list& inputs, + 
Tensor& output, Tensor dst, + IntArrayRef shape); static void CheckOut( const std::initializer_list& input, Tensor& output, int64_t format, diff --git a/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp b/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp index a57145af965ed1ac6ac919c745918899bb1b2896..cd80ef4b050b00216d704c3ce228c138f3c3d0bf 100644 --- a/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp +++ b/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp @@ -14,7 +14,7 @@ // limitations under the License. #include "OpTemplate.h" -#include "ATen/native/GlobalStep.h" +#include "ATen/native/npu/interface/EnvVariables.h" #include "ATen/native/npu/frame/OpCmdHelper.h" #include "ATen/native/npu/frame/FormatHelper.h" #include "ATen/native/npu/frame/OpParamMaker.h" @@ -36,7 +36,7 @@ TransDataOpCommand& TransDataOpCommand::AddInputAndOutput(const Tensor& input, c std::tuple in; std::tuple out; - if (!c10::npu::OptionsManager::CheckDynamicEnable() && check_fuzz_enable()) { + if (!c10::npu::OptionsManager::CheckDynamicEnable() && env::CheckFuzzyEnable()) { in = OpCmdHelper::CovertTensorToAclInput(input, c10::nullopt, "", ""); out = OpCmdHelper::CovertTensorToAclInput(output, c10::nullopt, "", ""); } else { diff --git a/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp b/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp index e2c9a5d18620057b793c33c318378194351d54a1..c7846701dfa81788ea3236bdbd90e1942ff3cca4 100644 --- a/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp +++ b/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp @@ -171,13 +171,17 @@ struct HostAllocator { while (!npu_events.empty()) { auto& e = npu_events.front(); aclrtEvent event = e.first; - aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - aclError err = aclrtQueryEvent(event, &status); - if (status == ACL_EVENT_STATUS_NOT_READY) { + c10::npu::acl::aclrtEventWaitStatus waitStatus = c10::npu::acl::ACL_EVENT_WAIT_STATUS_RESERVED; + aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED; + aclError err = 
c10::npu::acl::AclQueryEventStatus(event, &waitStatus, &recordStatus); + if (err != ACL_ERROR_NONE) { + return err; + } + if ((waitStatus != c10::npu::acl::ACL_EVENT_WAIT_STATUS_COMPLETE) && + (recordStatus != ACL_EVENT_STATUS_COMPLETE)) { break; - } else if (err != ACL_ERROR_NONE) { - return err; } + err = aclrtDestroyEvent(event); if (err != ACL_ERROR_NONE) { return err; diff --git a/src/c10/npu/NPUCachingAllocator.cpp b/src/c10/npu/NPUCachingAllocator.cpp index a9f6cc911a4a2a2f554b14127e2019d21c094508..f179b6b23d50fbe7d6e31cc2c8109259305d78e9 100644 --- a/src/c10/npu/NPUCachingAllocator.cpp +++ b/src/c10/npu/NPUCachingAllocator.cpp @@ -891,13 +891,15 @@ struct THNCachingAllocator { aclrtEvent event = e.first; Block* block = e.second; - aclrtEventStatus status; - aclError err = aclrtQueryEvent(event, &status); - if (status == ACL_EVENT_STATUS_NOT_READY) { - // ignore if not ready + acl::aclrtEventWaitStatus waitStatus = acl::ACL_EVENT_WAIT_STATUS_RESERVED; + aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED; + aclError err = acl::AclQueryEventStatus(event, &waitStatus, &recordStatus); + if (err != ACL_ERROR_NONE) { + C10_NPU_CHECK(err); + } + if ((waitStatus != acl::ACL_EVENT_WAIT_STATUS_COMPLETE) && + (recordStatus != ACL_EVENT_STATUS_COMPLETE)) { break; - } else if (err != ACL_ERROR_NONE) { - C10_NPU_CHECK(err); } aclrtDestroyEvent(event); diff --git a/src/c10/npu/NPUEventManager.cpp b/src/c10/npu/NPUEventManager.cpp index a59cb2f95cf332aae1eb52321c789aafce5674e1..d847472eb1a926f90151d23763dd3f3b8d13aba0 100644 --- a/src/c10/npu/NPUEventManager.cpp +++ b/src/c10/npu/NPUEventManager.cpp @@ -28,13 +28,16 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event) { while (!npu_events_.empty()) { aclrtEvent event = npu_events_.front(); - aclrtEventStatus status; - aclError err = aclrtQueryEvent(event, &status); - if (status != ACL_EVENT_STATUS_COMPLETE) { - break; - } else if (err != ACL_ERROR_NONE) { + acl::aclrtEventWaitStatus waitStatus = 
acl::ACL_EVENT_WAIT_STATUS_RESERVED; + aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED; + aclError err = acl::AclQueryEventStatus(event, &waitStatus, &recordStatus); + if (err != ACL_ERROR_NONE) { return err; } + if ((waitStatus != acl::ACL_EVENT_WAIT_STATUS_COMPLETE) && + (recordStatus != ACL_EVENT_STATUS_COMPLETE)) { + break; + } err = aclrtDestroyEvent(event); if (err != ACL_ERROR_NONE) { return err; diff --git a/src/c10/npu/NPUStream.cpp b/src/c10/npu/NPUStream.cpp index baa1ebf6c510ce6883abc5b35227742f18e3606d..d9a54eb713851da95428a96eee2fcbc0897fbbff 100644 --- a/src/c10/npu/NPUStream.cpp +++ b/src/c10/npu/NPUStream.cpp @@ -63,7 +63,7 @@ struct LeakyStreamInternals { // Global stream state and constants static DeviceIndex num_npus = -1; -static constexpr int kStreamsPerPoolBits = 5; +static constexpr int kStreamsPerPoolBits = 3; static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; // static constexpr unsigned int kDefaultFlags = npuStreamNonBlocking; diff --git a/src/c10/npu/OptionsManager.h b/src/c10/npu/OptionsManager.h index 62e3e156290361d9415d4aed10a845125db7fdf4..348f63b4355a5430c4c87684dfdc675e2612587e 100644 --- a/src/c10/npu/OptionsManager.h +++ b/src/c10/npu/OptionsManager.h @@ -38,7 +38,6 @@ class OptionsManager { static bool CheckUseNpuLogEnable(); static bool CheckDynamicOnly(); static std::string CheckDisableDynamicPath(); - private: static int GetBoolTypeOption(const char* env_str); }; diff --git a/src/c10/npu/interface/AclInterface.cpp b/src/c10/npu/interface/AclInterface.cpp index 3121ae3f625a0a0e605392991da8d0b37973d347..84152a66361bf618980524c5825a3ac1016df709 100644 --- a/src/c10/npu/interface/AclInterface.cpp +++ b/src/c10/npu/interface/AclInterface.cpp @@ -17,7 +17,6 @@ #include "AclInterface.h" #include "c10/npu/register/FunctionLoader.h" #include "c10/util/Exception.h" -#include namespace c10 { namespace npu { @@ -32,6 +31,54 @@ namespace acl { REGISTER_LIBRARY(libascendcl) LOAD_FUNCTION(aclGetRecentErrMsg) 
LOAD_FUNCTION(aclrtCreateEventWithFlag) +LOAD_FUNCTION(aclrtQueryEventWaitStatus) +LOAD_FUNCTION(aclprofCreateStepInfo) +LOAD_FUNCTION(aclprofGetStepTimestamp) +LOAD_FUNCTION(aclprofDestroyStepInfo) + +aclprofStepInfoPtr init_stepinfo(){ + typedef aclprofStepInfoPtr(*npdInitFunc)(); + static npdInitFunc func = nullptr; + if(func == nullptr){ + func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofCreateStepInfo"); + auto ret = func(); + return ret; +} + +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo){ + typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); + static npdDestroyFunc func = nullptr; + if(func == nullptr){ + func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyStepInfo"); + auto ret = func(stepInfo); + return ret; +} + +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream){ + typedef NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStartProfiling func = nullptr; + if(func == nullptr){ + func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp"); + auto ret = func(stepInfo, stepTag, stream); + return ret; +} + +NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream){ + typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStopProfiling func = nullptr; + if(func == nullptr){ + func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp"); + auto ret = func(stepInfo, stepTag, stream); + return ret; +} const char *AclGetErrMsg() { @@ -56,6 +103,19 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) { return func(event, flag); } +aclError AclQueryEventStatus(aclrtEvent 
event, aclrtEventWaitStatus *waitStatus, aclrtEventStatus *recordStatus) +{ + typedef aclError (*aclQueryEventWaitStatus)(aclrtEvent event, aclrtEventWaitStatus *status); + static aclQueryEventWaitStatus func = nullptr; + if (func == nullptr) { + func = (aclQueryEventWaitStatus)GET_FUNC(aclrtQueryEventWaitStatus); + } + if (func != nullptr) { + return func(event, waitStatus); + } else { + return aclrtQueryEvent(event, recordStatus); + } +} } // namespace acl } // namespace npu } // namespace c10 diff --git a/src/c10/npu/interface/AclInterface.h b/src/c10/npu/interface/AclInterface.h index 9a8a19549752498a98dffe6c3ff24934bb2daef0..7a9a5a4f532ef36d57ab120fd80d82a15093d4ed 100644 --- a/src/c10/npu/interface/AclInterface.h +++ b/src/c10/npu/interface/AclInterface.h @@ -12,11 +12,47 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#include + +#ifndef __C10_NPU_INTERFACE_ACLINTERFACE__ +#define __C10_NPU_INTERFACE_ACLINTERFACE__ + +#include "third_party/acl/inc/acl/acl_rt.h" +#include namespace c10 { namespace npu { namespace acl { +typedef enum aclrtEventWaitStatus { + ACL_EVENT_WAIT_STATUS_COMPLETE = 0, + ACL_EVENT_WAIT_STATUS_NOT_READY = 1, + ACL_EVENT_WAIT_STATUS_RESERVED = 0xffff, +} aclrtEventWaitStatus; + +/** + aclprofStepInfo is provide by acl, it used to be store dispatch op info. + */ +using aclprofStepInfoPtr = aclprofStepInfo *; +/** + NpdStatus is provide by acl, it used to store the return value. + */ +using NpdStatus = int; + +/** + This Api is used to init npd, it need to be called once at process. + */ +aclprofStepInfoPtr init_stepinfo(); +/** + This Api is used to destroy npd, it need to be called once at process. + */ +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo); +/** + This Api is used to start dispatch op, this operation should be called after init. 
+ */ +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream); +/** + This Api is used to stop dispatch op, this operation should be called after start dispatch op. + */ +NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream); /** This API is used to get error msg @@ -33,6 +69,13 @@ const char *AclGetErrMsg(); * @retval OtherValues Failure */ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag); + +/** + This API is used to query status of event task + */ +aclError AclQueryEventStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus, aclrtEventStatus *recordStatus); } // namespace acl } // namespace npu -} // namespace c10 \ No newline at end of file +} // namespace c10 + +#endif // __C10_NPU_INTERFACE_ACLINTERFACE__ \ No newline at end of file diff --git a/src/c10/npu/register/FunctionLoader.cpp b/src/c10/npu/register/FunctionLoader.cpp index d2dd63ce6122b897489ec0577b530b5e930b8016..0732476ec354ab1dfd5e35fd514e7f69ebb18b9c 100644 --- a/src/c10/npu/register/FunctionLoader.cpp +++ b/src/c10/npu/register/FunctionLoader.cpp @@ -57,7 +57,6 @@ void* FunctionLoader::Get(const std::string& name) { auto func = dlsym(this->handle, name.c_str()); if (func == nullptr) { - AT_ERROR(dlerror()); return nullptr; } this->registry[name] = func; diff --git a/src/third_party/acl/inc/acl/acl.h b/src/third_party/acl/inc/acl/acl.h index 50ebd624e47ddcc67d6d2b2d9edbdb0c5e9b59f8..41db19178ba11228eecf3fc993e7353b9e6fbb1f 100644 --- a/src/third_party/acl/inc/acl/acl.h +++ b/src/third_party/acl/inc/acl/acl.h @@ -60,15 +60,6 @@ ACL_FUNC_VISIBILITY aclError aclFinalize(); */ ACL_FUNC_VISIBILITY aclError aclrtGetVersion(int32_t *majorVersion, int32_t *minorVersion, int32_t *patchVersion); -/** - * @ingroup AscendCL - * @brief get recent error message - * - * @retval null for failed - * @retval OtherValues success -*/ -ACL_FUNC_VISIBILITY const char *aclGetRecentErrMsg(); - #ifdef 
__cplusplus } #endif diff --git a/src/third_party/acl/inc/acl/acl_base.h b/src/third_party/acl/inc/acl/acl_base.h index 7509784c9875ec4792cddc53b9a9680ec1872416..f7a77000f5d0554d6f804fabe5adce3bc64924d4 100644 --- a/src/third_party/acl/inc/acl/acl_base.h +++ b/src/third_party/acl/inc/acl/acl_base.h @@ -31,7 +31,7 @@ typedef int aclError; typedef uint16_t aclFloat16; typedef struct aclDataBuffer aclDataBuffer; typedef struct aclTensorDesc aclTensorDesc; - +typedef struct aclprofStepInfo aclprofStepInfo; static const int ACL_ERROR_NONE = 0; static const int ACL_ERROR_INVALID_PARAM = 100000; @@ -146,6 +146,11 @@ typedef enum { ACL_MEMTYPE_HOST = 1, } aclMemType; +typedef enum { + ACL_STEP_START = 0, + ACL_STEP_END = 1, +} aclprofStepTag; + /** * @ingroup AscendCL * @brief Converts data of type aclFloat16 to data of type float @@ -498,7 +503,6 @@ ACL_FUNC_VISIBILITY void aclAppLog(aclLogLevel logLevel, const char *func, const const char *fmt, ...); - ACL_FUNC_VISIBILITY aclError aclSetTensorPlaceMent(aclTensorDesc *desc, aclMemType type); #define ACL_APP_LOG(level, fmt, ...) 
\ diff --git a/src/third_party/acl/libs/acl.cpp b/src/third_party/acl/libs/acl.cpp index cf652f7e9ae06a82082de37a240f0b8ca95bbcae..e517148e2507aeb0620a061b26a59a2981150a2e 100644 --- a/src/third_party/acl/libs/acl.cpp +++ b/src/third_party/acl/libs/acl.cpp @@ -67,6 +67,4 @@ aclFormat aclGetTensorDescFormat(const aclTensorDesc *desc) {return ACL_FORMAT_N const char *aclGetTensorDescName(aclTensorDesc *desc) {return NULL;} aclError aclSetTensorPlaceMent(aclTensorDesc *desc, aclMemType type) {return 0;}; - -const char *aclGetRecentErrMsg() {return NULL;} } diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml index 90f11a38cf7c465cfcbabad70120225e7c2bda2f..046aad5032c2ef0e38c53ab8b859e2703fd5cf9d 100644 --- a/src/tools/autograd/derivatives.yaml +++ b/src/tools/autograd/derivatives.yaml @@ -1679,9 +1679,9 @@ - name: npu_confusion_transpose(Tensor self, int[] perm, int[] shape, bool transpose_first) -> Tensor self: npu_confusion_transpose_backward(grad, perm, self.sizes(), !transpose_first) -- name: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor - self: grad.npu_bmmV2(mat2.transpose(-2, -1)) - mat2: npu_bmmV2_mat2_backward(grad, self, mat2.sizes()) +- name: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor + self: npu_bmm_v2_mat1_backward(grad, self, mat2, self.sizes()) + mat2: npu_bmm_v2_mat2_backward(grad, self, mat2, mat2.sizes()) - name: npu_deformable_conv2d(Tensor input, Tensor weight, Tensor offset, Tensor? 
bias, int[2] kernel_size, int[] stride, int[] padding, int[] dilation=[1,1,1,1], int groups=1, int deformable_groups=1, bool modulated=True) -> (Tensor, Tensor) input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated) diff --git a/src/torch/csrc/autograd/profiler_npu.cpp b/src/torch/csrc/autograd/profiler_npu.cpp index 150a68fc40d5a09885530783dc8db2d60070f9b5..27a465ad6b9ea1ffa8290d59eaca5fdf53e3b297 100644 --- a/src/torch/csrc/autograd/profiler_npu.cpp +++ b/src/torch/csrc/autograd/profiler_npu.cpp @@ -36,9 +36,12 @@ static inline void npuCheck(aclError result, const char * file, int line) { struct NPUMethods : public CUDAStubs { void npu_destroy_event(aclrtEvent event) { - aclrtEventStatus status; - TORCH_NPU_CHECK(aclrtQueryEvent(event, &status)); - if (status == ACL_EVENT_STATUS_COMPLETE) { + c10::npu::acl::aclrtEventWaitStatus waitStatus = c10::npu::acl::ACL_EVENT_WAIT_STATUS_RESERVED; + aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED; + TORCH_NPU_CHECK(c10::npu::acl::AclQueryEventStatus(event, &waitStatus, &recordStatus)); + + if ((waitStatus == c10::npu::acl::ACL_EVENT_WAIT_STATUS_COMPLETE) || + (recordStatus == ACL_EVENT_STATUS_COMPLETE)) { TORCH_NPU_CHECK(aclrtDestroyEvent(event)); } else { std::cout << "Warning! NPU destroy event error, status is not completed." 
<< std::endl; diff --git a/src/torch/npu/__init__.py b/src/torch/npu/__init__.py index 2a9d5c79f5d6d21240257ec2e86d54c817f1682a..0a494be8c749b378033e716bd684237543e82fee 100644 --- a/src/torch/npu/__init__.py +++ b/src/torch/npu/__init__.py @@ -161,14 +161,6 @@ def is_available(): return False return device_count() > 0 -def set_option(option): - if not isinstance(option, dict): - raise TypeError("npu option must be a dict.") - - for option_name, option_value in option.items(): - option[option_name] = str(option_value) - - torch._C._npu_setOption(option) class device(object): r"""Context-manager that changes the selected device. @@ -288,28 +280,7 @@ if not hasattr(torch._C, '_NPUStreamBase'): torch._C.__dict__['_NPUStreamBase'] = _dummy_type('NPUStreamBase') torch._C.__dict__['_NPUEventBase'] = _dummy_type('NPUEventBase') - -def init_dump(): - _lazy_init() - option = {} - option["mdldumpswitch"] = "init" - torch._C._npu_setOption(option) - -def set_dump(cfg_file): - if not os.path.exists(cfg_file): - raise AssertionError("cfg_file %s path not exists."%(cfg_file)) - cfg_file = os.path.abspath(cfg_file) - _lazy_init() - option = {} - option["mdldumpconfigpath"] = cfg_file - torch._C._npu_setOption(option) - -def finalize_dump(): - _lazy_init() - option = {} - option["mdldumpswitch"] = "finalize" - torch._C._npu_setOption(option) - from .memory import * -from .streams import Stream, Event \ No newline at end of file +from .streams import Stream, Event +from .npu_frontend_enhance import * \ No newline at end of file diff --git a/src/torch/npu/npu_frontend_enhance.py b/src/torch/npu/npu_frontend_enhance.py new file mode 100644 index 0000000000000000000000000000000000000000..b8e410fbe0aeb5ea65572770cad090892c151255 --- /dev/null +++ b/src/torch/npu/npu_frontend_enhance.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch._C +import os +# this file is used to enhance the npu frontend API by set_option or other. + +__all__ = ["set_option", "set_dump", "init_dump", "finalize_dump", "global_step_inc", "set_start_fuzz_compile_step", + "iteration_start", "iteration_end"] + +def set_option(option): + if not isinstance(option, dict): + raise TypeError("npu option must be a dict.") + + for option_name, option_value in option.items(): + option[option_name] = str(option_value) + + torch._C._npu_setOption(option) + +def init_dump(): + option = {"mdldumpswitch":"enable"} + torch._C._npu_setOption(option) + +def set_dump(cfg_file): + if not os.path.exists(cfg_file): + raise AssertionError("cfg_file %s path not exists."%(cfg_file)) + cfg_file = os.path.abspath(cfg_file) + option = {"mdldumpconfigpath": cfg_file} + torch._C._npu_setOption(option) + +def finalize_dump(): + option = {"mdldumpswitch": "disable"} + torch._C._npu_setOption(option) + +def iteration_start(): + option = {"deliverswitch": "enable"} + torch._C._npu_setOption(option) + +def iteration_end(): + option = {"deliverswitch": "disable"} + torch._C._npu_setOption(option) + +_GLOBAL_STEP=0 +_START_FUZZ_COMPILE_STEP=1 +def global_step_inc(): + global _GLOBAL_STEP + _GLOBAL_STEP += 1 + + option = {"fuzzycompileswitch": "enable" if _GLOBAL_STEP >= _START_FUZZ_COMPILE_STEP \ + else "disable"} + torch._C._npu_setOption(option) + +def set_start_fuzz_compile_step(step): + if not 
isinstance(step, int): + raise TypeError("step must be a int, but got ", type(step)) + + global _START_FUZZ_COMPILE_STEP + _START_FUZZ_COMPILE_STEP = step + option = {"fuzzycompileswitch": "disable"} + torch._C._npu_setOption(option) \ No newline at end of file diff --git a/test/test_npu/test_conv_tbc_backward.py b/test/test_npu/test_conv_tbc_backward.py index 8297bb2ea77411139be924e8b66028f1e080f073..3032ad485915862fb74cbb6f136456e562c67bee 100644 --- a/test/test_npu/test_conv_tbc_backward.py +++ b/test/test_npu/test_conv_tbc_backward.py @@ -97,5 +97,4 @@ class TestConvTbcBackward(TestCase): instantiate_device_type_tests(TestConvTbcBackward, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_instance_norm.py b/test/test_npu/test_instance_norm.py deleted file mode 100644 index f31f8f6ef5fe312f600ddb75b9b5e97585c454f2..0000000000000000000000000000000000000000 --- a/test/test_npu/test_instance_norm.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestIn2d(TestCase): - def cpu_op_exec(self, input1, weight, cpu_bias, cpu_running_mean, cpu_running_var, use_input_stats, momentum, epsilon): - output = torch.instance_norm(input1, weight, cpu_bias, cpu_running_mean, cpu_running_var, use_input_stats, momentum, epsilon, cudnn_enabled = False) - return output.numpy() - - def npu_op_exec(self, input1, weight, npu_bias, npu_running_mean, npu_running_var, use_input_stats, momentum, epsilon): - output = torch.instance_norm(input1, weight, npu_bias, npu_running_mean, npu_running_var, use_input_stats, momentum, epsilon, cudnn_enabled = False) - output = output.to("cpu") - return output.numpy() - - def test_instance_norm_shape_format(self, device): - shape_format = [ - [[np.float32, 0, (2, 20, 8, 10)], [np.float32, 0, (20)], [np.float32, 0, (20)], [np.float32, 0, (20)], [np.float32, 0, (20)], False, 0.1, 0.0001], - [[np.float32, 0, (2, 8, 10, 7)], [np.float32, 0, (8)], [np.float32, 0, (8)], [np.float32, 0, (8)], [np.float32, 0, (8)], True, 0.1, 0.0001], - [[np.float32, 0, (2, 10, 20)], [np.float32, -1, (10,)], [np.float32, -1, (10,)],[np.float32, -1, (10,)], [np.float32, -1, (10,)], True, 0.1, 0.0001], - [[np.float32, 3, (6, 20, 2, 3)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], False, 0.1, 0.0001], - [[np.float32, 3, (6, 20, 2, 3)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], True, 0.1, 0.0001], - [[np.float32, 3, (2, 2, 2, 2)], [np.float32, -1, (2,)], [np.float32, -1, (2,)],[np.float32, -1, (2,)], [np.float32, -1, (2,)], True, 0.1, 0.0001] - ] - for item in shape_format: - cpu_input, npu_input = create_common_tensor(item[0], 1, 20) - cpu_input_weight, npu_input_weight = create_common_tensor(item[1], 1, 10) - cpu_bias, 
npu_bias = create_common_tensor(item[2], 1, 10) - cpu_running_mean, npu_running_mean = create_common_tensor(item[3], 1, 10) - cpu_running_var, npu_running_var = create_common_tensor(item[4], 1, 10) - cpu_result = self.cpu_op_exec(cpu_input, cpu_input_weight, cpu_bias, cpu_running_mean, cpu_running_var, item[5], item[6], item[7]) - npu_result = self.npu_op_exec(npu_input, npu_input_weight, npu_bias, npu_running_mean, npu_running_var, item[5], item[6], item[7]) - self.assertRtolEqual(cpu_result, npu_result) - - def test_instance_norm_fp16_shape_format(self, device): - shape_format = [ - [[np.float16, 0, (2, 15, 4, 2)], [np.float16, 0, (15)], [np.float16, 0, (15)], [np.float16, 0, (15)], [np.float16, 0, (15)], False, 0.1, 0.0001], - [[np.float16, 0, (2, 30, 4, 2)], [np.float16, 0, (30)], [np.float16, 0, (30)], [np.float16, 0, (30)], [np.float16, 0, (30)], True, 0.1, 0.0001], - [[np.float16, 0, (2, 10, 20)], [np.float16, -1, (10,)], [np.float16, -1, (10,)],[np.float16, -1, (10,)], [np.float16, -1, (10,)], True, 0.1, 0.0001], - [[np.float16, 3, (6, 20, 2, 3)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], False, 0.1, 0.0001], - [[np.float16, 3, (6, 20, 2, 3)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], True, 0.1, 0.0001], - [[np.float16, 3, (2, 2, 2, 2)], [np.float16, -1, (2,)], [np.float16, -1, (2,)],[np.float16, -1, (2,)], [np.float16, -1, (2,)], True, 0.1, 0.0001] - ] - def cpu_op_fp16_exec(input1, - weight, - cpu_bias, - cpu_running_mean, - cpu_running_var, - use_input_stats, - momentum, - epsilon): - input1 = input1.to(torch.float32) - weight = weight.to(torch.float32) - cpu_bias = cpu_bias.to(torch.float32) - cpu_running_mean = cpu_running_mean.to(torch.float32) - cpu_running_var = cpu_running_var.to(torch.float32) - - output = torch.instance_norm(input1, - weight, - cpu_bias, - cpu_running_mean, - cpu_running_var, - use_input_stats, - momentum, - epsilon, - 
cudnn_enabled = False) - output = output.numpy() - return output.astype(np.float16) - - for item in shape_format: - cpu_input, npu_input = create_common_tensor(item[0], 1, 100) - cpu_input_weight, npu_input_weight = create_common_tensor(item[1], 1, 10) - cpu_bias, npu_bias = create_common_tensor(item[2], 1, 10) - cpu_running_mean, npu_running_mean = create_common_tensor(item[3], 1, 10) - cpu_running_var, npu_running_var = create_common_tensor(item[4], 1, 10) - cpu_result = cpu_op_fp16_exec(cpu_input, cpu_input_weight, cpu_bias, cpu_running_mean, cpu_running_var, item[5], item[6], item[7]) - npu_result = self.npu_op_exec(npu_input, npu_input_weight, npu_bias, npu_running_mean, npu_running_var, item[5], item[6], item[7]) - self.assertRtolEqual(cpu_result, npu_result) - -instantiate_device_type_tests(TestIn2d, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() \ No newline at end of file diff --git a/test/test_npu/test_instancenorm.py b/test/test_npu/test_instancenorm.py deleted file mode 100644 index c1564541c49bb82257554f5633f1be607fb8b42f..0000000000000000000000000000000000000000 --- a/test/test_npu/test_instancenorm.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestInstanceNorm(TestCase): - - def generate_data(self, min, max, shape, dtype): - x = np.random.uniform(min, max, shape).astype(dtype) - w = np.random.uniform(min, max, shape).astype(dtype) - b = np.random.uniform(min, max, shape).astype(dtype) - rm = np.random.uniform(min, max, shape).astype(dtype) - rv = np.random.uniform(min, max, shape).astype(dtype) - - #modify from numpy.ndarray to torch.tensor - npu_x = torch.from_numpy(x) - npu_w = torch.from_numpy(w) - npu_b = torch.from_numpy(b) - npu_rm = torch.from_numpy(rm) - npu_rv = torch.from_numpy(rv) - - return npu_x, npu_w,npu_b,npu_rm,npu_rv - - def generate_single_data(self, min, max, shape, dtype): - input1 = np.random.uniform(min, max, shape).astype(dtype) - npu_input1 = torch.from_numpy(input1) - - return npu_input1 - - - def generate_three_data(self, min, max, shape, dtype): - input1 = np.random.uniform(min, max, shape).astype(dtype) - input2 = np.random.uniform(min, max, shape).astype(dtype) - input3 = np.random.uniform(min, max, shape).astype(dtype) - - #modify from numpy.ndarray to torch.tensor - npu_input1 = torch.from_numpy(input1) - npu_input2 = torch.from_numpy(input2) - npu_input3 = torch.from_numpy(input3) - - return npu_input1, npu_input2, npu_input3 - - - def cpu_op_exec(self, x, w,b,rm,rv,use_input_stats, momentum, eps): - axis = [] - for i in range(2,len(x.shape)): - axis.append(i) - mean = np.mean(x, tuple(axis), keepdims=True) - var = np.var(x, tuple(axis), keepdims=True) - - if input_use ==True: - mean = (mean-momentum*mean) + momentum*rm - var = (var-momentum*var) + momentum*rv - print("11") - y = (x - mean)/np.sqrt(var + eps) - output = w*y + b - else: - y = (x - mean)/np.sqrt(var + eps) - output = w*y + b - output = output.numpy() - return output - - - def 
npu_op_exec(self, x, w,b,rm,rv,use_input_stats, momentum, eps): - x = x.to("npu") - w = w.to("npu") - b = b.to("npu") - rm = rm.to("npu") - rv = rv.to("npu") - axis = [] - for i in range(2,len(x.shape)): - axis.append(i) - mean = np.mean(x, tuple(axis), keepdims=True) - var = np.var(x, tuple(axis), keepdims=True) - - if input_use ==True: - mean = (mean-momentum*mean) + momentum*rm - var = (var-momentum*var) + momentum*rv - print("11") - y = (x - mean)/np.sqrt(var + eps) - output = w*y + b - else: - y = (x - mean)/np.sqrt(var + eps) - output = w*y + b - output = output.to("cpu") - output = output.numpy() - return output - - - def npu_op_exec_scalar(self, input1, input2): - input1 = input1.to("npu") - output = input1 + input2 - output = output.to("cpu") - output = output.numpy() - return output - - - def npu_op_exec_out(self, input1, input2, input3): - input1 = input1.to("npu") - input2 = input2.to("npu") - output = input3.to("npu") - torch.add(input1, input2, out=output) - output = output.to("cpu") - output = output.numpy() - return output - - def test_add_float16(self, device): - npu_x, npu_w,npu_b,npu_rm,npu_rv = self.generate_data(0, 100, (5, 6, 7), np.float16) - cpu_output = self.cpu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) - npu_output = self.npu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_add_float32(self, device): - npu_x, npu_w,npu_b,npu_rm,npu_rv = self.generate_data(0, 100, (5, 6, 7), np.float32) - cpu_output = self.cpu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) - npu_output = self.npu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_add_float32_out(self, device): - npu_input1, npu_input2, npu_input3 = generate_three_data(0, 100, (4,3), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, npu_input2) - npu_output = self.npu_op_exec_out(npu_input1, npu_input2, 
npu_input3) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_add_float32_broadcast(self, device): - npu_input1 = self.generate_single_data(0, 100, (4,3,1), np.float32) - npu_input2 = self.generate_single_data(0, 100, (4,1,5), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, npu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_add_int32(self, device): - npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) - cpu_output = self.cpu_op_exec(npu_input1, npu_input2) - npu_output = self.npu_op_exec(npu_input1, npu_input2) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_add_scalar_float32(self, device): - npu_input1, _= self.generate_data(0, 100, (2,3), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, 1) - npu_output = self.npu_op_exec_scalar(npu_input1, 1) - self.assertRtolEqual(cpu_output, npu_output) - - - def npu_uncontiguous_op_exec_scalar(self, input1, input2): - input1 = input1.to("npu") - input1 = input1.as_strided([2,2], [1,2], 1) - output = torch.add(input1, input2) - output = output.to("cpu") - output = output.numpy() - return output - - def cpu_uncontiguous_op_exec_scalar(self, input1, input2): - input1 = input1.as_strided([2,2], [1,2], 1) - output = torch.add(input1, input2) - output = output.numpy() - return output - - def test_add_uncontiguous_float32_scalar(self, device): - npu_input1, npu_input2 = self.generate_data(0, 100, (4,3), np.float32) - cpu_input1 = copy.deepcopy(npu_input1) - cpu_output = self.cpu_uncontiguous_op_exec_scalar(cpu_input1, 2) - npu_output = self.npu_uncontiguous_op_exec_scalar(npu_input1, 2) - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestInstanceNorm, globals(), except_for='cpu') -if __name__ == '__main__': - torch.npu.set_device("npu:2") - run_tests() - diff --git a/test/test_npu/test_multilabel_margin_loss_backward.py 
b/test/test_npu/test_multilabel_margin_loss_backward.py deleted file mode 100644 index 30f22481f58db57753cf37df8f727cebecd90503..0000000000000000000000000000000000000000 --- a/test/test_npu/test_multilabel_margin_loss_backward.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import numpy as np -import sys -import copy -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor -from itertools import repeat, product - -class TestMultilabelMarginLossGrad(TestCase): - - def generate_data(self, lo, hi, shape, dtype): - grad = np.random.uniform(lo, hi, (shape[0],)).astype(dtype) - predict = np.random.uniform(lo, hi, shape).astype(dtype) - npu_grad = torch.from_numpy(grad) - npu_predict = torch.from_numpy(predict) - return npu_grad, npu_predict - - def generate_target(self, lo, hi, shape, dtype): - target = np.random.randint(lo, hi, shape).astype(dtype) - npu_target = torch.from_numpy(target) - return npu_target - - def cpu_op_grad_exec(self, grad_output, predict, target, reduction): - predict.requires_grad = True - target = target.to(torch.int64) - out = torch.nn.functional.multilabel_margin_loss(input=predict, target=target, reduction=reduction) - if reduction == "none": - out.backward(grad_output) - else: - out.backward() - output = 
predict.grad.to(torch.float32).numpy() - return output - - def npu_op_grad_exec(self, grad_output, predict, target, reduction): - grad_output = grad_output.to("npu") - predict = predict.to("npu") - target = target.to("npu") - predict.requires_grad = True - out = torch.nn.functional.multilabel_margin_loss(input=predict, target=target, reduction=reduction) - if reduction == "none": - out.backward(grad_output) - else: - out.backward() - output = predict.grad.to("cpu").to(torch.float32).numpy() - return output - - def test_multilabel_margin_loss_1(self, device): - for reduction in ["none", "mean", "sum"]: - grad, data = self.generate_data(-2, 2, (2, 4), np.float32) - target = self.generate_target(-1, 3, (2, 4), np.int32) - - data.requires_grad = False - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - self.assertRtolEqual(cpu_output, npu_output) - - def test_multilabel_margin_loss_2(self, device): - for reduction in ["mean", "none", "sum"]: - grad, data = self.generate_data(-2, 2, (2, 9), np.float32) - target = self.generate_target(-1, 8, (2, 9), np.int32) - - data.requires_grad = False - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - self.assertRtolEqual(cpu_output, npu_output) - - def test_multilabel_margin_loss_3(self, device): - for reduction in ["mean", "none", "sum"]: - grad, data = self.generate_data(-2, 2, (64, 147), np.float32) - target = self.generate_target(-1, 146, (64, 147), np.int32) - - data.requires_grad = False - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - self.assertRtolEqual(cpu_output, npu_output) - - def test_multilabel_margin_loss_float16_1(self, device): - for reduction in ["mean", "none", 
"sum"]: - grad, data = self.generate_data(-2, 2, (2, 4), np.float16) - target = self.generate_target(-1, 3, (2, 4), np.int32) - - data.requires_grad = False - grad = grad.to(torch.float32) - data = data.to(torch.float32) - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - grad = grad.to(torch.float16) - data = data.to(torch.float16) - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - cpu_output = cpu_output.astype(np.float16) - npu_output = npu_output.astype(np.float16) - - self.assertRtolEqual(cpu_output, npu_output) - - def test_multilabel_margin_loss_float16_2(self, device): - for reduction in ["mean", "none", "sum"]: - grad, data = self.generate_data(-2, 2, (2, 9), np.float16) - target = self.generate_target(-1, 8, (2, 9), np.int32) - - data.requires_grad = False - grad = grad.to(torch.float32) - data = data.to(torch.float32) - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - grad = grad.to(torch.float16) - data = data.to(torch.float16) - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - cpu_output = cpu_output.astype(np.float16) - npu_output = npu_output.astype(np.float16) - - self.assertRtolEqual(cpu_output, npu_output) - - def test_multilabel_margin_loss_float16_3(self, device): - for reduction in ["mean", "none", "sum"]: - grad, data = self.generate_data(-2, 2, (1, 79), np.float16) - target = self.generate_target(-1, 50, (1, 79), np.int32) - - data.requires_grad = False - grad = grad.to(torch.float32) - data = data.to(torch.float32) - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - grad = grad.to(torch.float16) - data = data.to(torch.float16) - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - cpu_output = cpu_output.astype(np.float16) - npu_output = npu_output.astype(np.float16) - - self.assertRtolEqual(cpu_output, npu_output) - - def 
test_multilabel_margin_loss_float16_4(self, device): - for reduction in ["none", "sum", "mean"]: - grad, data = self.generate_data(-2, 2, (64, 147), np.float16) - target = self.generate_target(-1, 146, (64, 147), np.int32) - - data.requires_grad = False - grad = grad.to(torch.float32) - data = data.to(torch.float32) - cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction) - data.requires_grad = False - grad = grad.to(torch.float16) - data = data.to(torch.float16) - npu_output = self.npu_op_grad_exec(grad, data, target, reduction) - - cpu_output = cpu_output.astype(np.float16) - npu_output = npu_output.astype(np.float16) - - self.assertRtolEqual(cpu_output, npu_output) - -instantiate_device_type_tests(TestMultilabelMarginLossGrad, globals(), except_for="cpu") -if __name__ == "__main__": - run_tests() diff --git a/test/test_npu/test_network_ops/run_tests.py b/test/test_npu/test_network_ops/run_tests.py index 58f1a6454e431f514dc7cd67af76c8b443ef4f3a..69a7867d667ac0ac998e599d8cc201590f2bc404 100644 --- a/test/test_npu/test_network_ops/run_tests.py +++ b/test/test_npu/test_network_ops/run_tests.py @@ -42,18 +42,24 @@ def run_tests(): import HTMLTestRunner with open(htmlFileName, "wb") as report_file: runner=HTMLTestRunner.HTMLTestRunner(stream=report_file, title='AllTest', description='all npu test case', verbosity=2) - runner.run(load_local_case(test_case_path)) + result = runner.run(load_local_case(test_case_path)) + if not result.wasSuccessful(): + raise RuntimeError("Some cases of HTML unittest testset failed") print('report files path', htmlFileName) elif ENABLE_HTML_MX: print('start pytorch Multi HTML unittest testset...') import HtmlTestRunner runner=HtmlTestRunner.HTMLTESTRunner(output=test_report_path, verbosity=2) - runner=run(load_local_case(test_case_path)) + result=runner.run(load_local_case(test_case_path)) + if not result.wasSuccessful(): + raise RuntimeError("Some cases of Multi HTML unittest testset failed") else: print('start pytorch TEXT 
unittest testset...') with open(txtFileName, "a") as report_file: runner=unittest.TextTestRunner(stream=report_file, verbosity=2) - runner.run(load_local_case(test_case_path)) + result=runner.run(load_local_case(test_case_path)) + if not result.wasSuccessful(): + raise RuntimeError("Some cases TEXT unittest failed") print('report files path', txtFileName) if __name__=="__main__": diff --git a/test/test_npu/test__Ixor__.py b/test/test_npu/test_network_ops/test__Ixor__.py similarity index 100% rename from test/test_npu/test__Ixor__.py rename to test/test_npu/test_network_ops/test__Ixor__.py diff --git a/test/test_npu/test___iand__.py b/test/test_npu/test_network_ops/test___iand__.py similarity index 99% rename from test/test_npu/test___iand__.py rename to test/test_npu/test_network_ops/test___iand__.py index d16107c8be5e949f02d39c4df45e382f367e4d6f..1270faeab0582fd9865cfefd88020d54725a8868 100644 --- a/test/test_npu/test___iand__.py +++ b/test/test_npu/test_network_ops/test___iand__.py @@ -134,5 +134,4 @@ class Test__Iand__(TestCase): instantiate_device_type_tests(Test__Iand__, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() \ No newline at end of file diff --git a/test/test_npu/test__nnpack_spatial_convolution.py b/test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py similarity index 99% rename from test/test_npu/test__nnpack_spatial_convolution.py rename to test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py index daaed945793c43aa428931942e48aaf7e23e7abd..a89c9724848b1a55de65c325a11fad7adf835545 100644 --- a/test/test_npu/test__nnpack_spatial_convolution.py +++ b/test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py @@ -138,7 +138,6 @@ class TestNnpackSpatialConvolution(TestCase): instantiate_device_type_tests(TestNnpackSpatialConvolution, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git 
a/test/test_npu/test_acos.py b/test/test_npu/test_network_ops/test_acos.py similarity index 98% rename from test/test_npu/test_acos.py rename to test/test_npu/test_network_ops/test_acos.py index 97bad337923a690f98a0beb328eaaf7a448e6f1b..bd03b4be44afd71929ec196694ce838c9e151e6f 100644 --- a/test/test_npu/test_acos.py +++ b/test/test_npu/test_network_ops/test_acos.py @@ -66,6 +66,5 @@ class TestAcos(TestCase): instantiate_device_type_tests(TestAcos, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_adaptive_avg_pool2d_backward.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool2d_backward.py similarity index 100% rename from test/test_npu/test_adaptive_avg_pool2d_backward.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool2d_backward.py diff --git a/test/test_npu/test_adaptive_max_pool2d_backward.py b/test/test_npu/test_network_ops/test_adaptive_max_pool2d_backward.py similarity index 100% rename from test/test_npu/test_adaptive_max_pool2d_backward.py rename to test/test_npu/test_network_ops/test_adaptive_max_pool2d_backward.py diff --git a/test/test_npu/test_addbmm.py b/test/test_npu/test_network_ops/test_addbmm.py similarity index 100% rename from test/test_npu/test_addbmm.py rename to test/test_npu/test_network_ops/test_addbmm.py diff --git a/test/test_npu/test_addcdiv.py b/test/test_npu/test_network_ops/test_addcdiv.py similarity index 100% rename from test/test_npu/test_addcdiv.py rename to test/test_npu/test_network_ops/test_addcdiv.py diff --git a/test/test_npu/test_addmv.py b/test/test_npu/test_network_ops/test_addmv.py similarity index 100% rename from test/test_npu/test_addmv.py rename to test/test_npu/test_network_ops/test_addmv.py diff --git a/test/test_npu/test_addr.py b/test/test_npu/test_network_ops/test_addr.py similarity index 100% rename from test/test_npu/test_addr.py rename to test/test_npu/test_network_ops/test_addr.py diff --git 
a/test/test_npu/test_affine_grid_generator_backward.py b/test/test_npu/test_network_ops/test_affine_grid_generator_backward.py similarity index 100% rename from test/test_npu/test_affine_grid_generator_backward.py rename to test/test_npu/test_network_ops/test_affine_grid_generator_backward.py diff --git a/test/test_npu/test_asin.py b/test/test_npu/test_network_ops/test_asin.py similarity index 98% rename from test/test_npu/test_asin.py rename to test/test_npu/test_network_ops/test_asin.py index 54e32964b870ed52dc84ca4d629d458df8d610fb..537bbc12bc8f55719798de68a8a1d3c093dc1459 100644 --- a/test/test_npu/test_asin.py +++ b/test/test_npu/test_network_ops/test_asin.py @@ -62,5 +62,4 @@ class TestAsin(TestCase): instantiate_device_type_tests(TestAsin, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_bartlett_window.py b/test/test_npu/test_network_ops/test_bartlett_window.py similarity index 98% rename from test/test_npu/test_bartlett_window.py rename to test/test_npu/test_network_ops/test_bartlett_window.py index 2cfa2aefb345e048a6be4ba3233e826ecbf3ddea..4a9be1452610e1de63358d9996da3dcd89f74b09 100644 --- a/test/test_npu/test_bartlett_window.py +++ b/test/test_npu/test_network_ops/test_bartlett_window.py @@ -78,5 +78,4 @@ class TestBartlettWindow(TestCase): instantiate_device_type_tests(TestBartlettWindow, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() diff --git a/test/test_npu/test_batch_norm.py b/test/test_npu/test_network_ops/test_batch_norm.py similarity index 100% rename from test/test_npu/test_batch_norm.py rename to test/test_npu/test_network_ops/test_batch_norm.py diff --git a/test/test_npu/test_binary_cross_entropy_with_logits.py b/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits.py similarity index 100% rename from test/test_npu/test_binary_cross_entropy_with_logits.py rename to 
test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits.py diff --git a/test/test_npu/test_network_ops/test_bitwise_not.py b/test/test_npu/test_network_ops/test_bitwise_not.py index b83234feb6ae4a97337082f8d55e108be006bea3..a10801063b1b47b54932dac4080ed43f9c79a6a5 100644 --- a/test/test_npu/test_network_ops/test_bitwise_not.py +++ b/test/test_npu/test_network_ops/test_bitwise_not.py @@ -101,5 +101,4 @@ class Test_Bitwise_Not(TestCase): instantiate_device_type_tests(Test_Bitwise_Not, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_blackman_window.py b/test/test_npu/test_network_ops/test_blackman_window.py similarity index 98% rename from test/test_npu/test_blackman_window.py rename to test/test_npu/test_network_ops/test_blackman_window.py index 8a600bb0805ac1229cf9f7dad8ac6434e804cb2e..10b7dd27924eccd7f91ba4394d458d51dc5d5c30 100644 --- a/test/test_npu/test_blackman_window.py +++ b/test/test_npu/test_network_ops/test_blackman_window.py @@ -92,5 +92,4 @@ class TestBlackmanWindow(TestCase): instantiate_device_type_tests(TestBlackmanWindow, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_network_ops/test_bmm_v2.py b/test/test_npu/test_network_ops/test_bmm_v2.py index 6ea92d3e4fba286eca8c3f43220bf3993bc94aa5..3590e3ecbd712854f5f1fe79e215c032b9f29886 100644 --- a/test/test_npu/test_network_ops/test_bmm_v2.py +++ b/test/test_npu/test_network_ops/test_bmm_v2.py @@ -25,7 +25,7 @@ class TestBatchMatMulV2(TestCase): return output def npu_op_exec(self, input1, input2): - output = torch.npu_bmmV2(input1, input2) + output = torch.npu_bmmV2(input1, input2, []) output = output.to("cpu") output = output.numpy() return output diff --git a/test/test_npu/test_cast_Byte.py b/test/test_npu/test_network_ops/test_cast_Byte.py similarity index 97% rename from test/test_npu/test_cast_Byte.py rename to 
test/test_npu/test_network_ops/test_cast_Byte.py index c06faec158068025b17af25a9da53a50e4f54d5b..1393faf3234192127a35e94e07df13f2c7e77cd2 100644 --- a/test/test_npu/test_cast_Byte.py +++ b/test/test_npu/test_network_ops/test_cast_Byte.py @@ -20,7 +20,7 @@ import sys import copy from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor,compare_res_new +from util_test import create_common_tensor class TestCastByte(TestCase): diff --git a/test/test_npu/test_cast_Char.py b/test/test_npu/test_network_ops/test_cast_Char.py similarity index 100% rename from test/test_npu/test_cast_Char.py rename to test/test_npu/test_network_ops/test_cast_Char.py diff --git a/test/test_npu/test_cast_Float.py b/test/test_npu/test_network_ops/test_cast_Float.py similarity index 100% rename from test/test_npu/test_cast_Float.py rename to test/test_npu/test_network_ops/test_cast_Float.py diff --git a/test/test_npu/test_cast_Half.py b/test/test_npu/test_network_ops/test_cast_Half.py similarity index 100% rename from test/test_npu/test_cast_Half.py rename to test/test_npu/test_network_ops/test_cast_Half.py diff --git a/test/test_npu/test_cast_Int.py b/test/test_npu/test_network_ops/test_cast_Int.py similarity index 100% rename from test/test_npu/test_cast_Int.py rename to test/test_npu/test_network_ops/test_cast_Int.py diff --git a/test/test_npu/test_cast_Long.py b/test/test_npu/test_network_ops/test_cast_Long.py similarity index 100% rename from test/test_npu/test_cast_Long.py rename to test/test_npu/test_network_ops/test_cast_Long.py diff --git a/test/test_npu/test_cast_Short.py b/test/test_npu/test_network_ops/test_cast_Short.py similarity index 100% rename from test/test_npu/test_cast_Short.py rename to test/test_npu/test_network_ops/test_cast_Short.py diff --git a/test/test_npu/test_cdist.py b/test/test_npu/test_network_ops/test_cdist.py similarity index 99% rename from 
test/test_npu/test_cdist.py rename to test/test_npu/test_network_ops/test_cdist.py index b7b0fd03b83d5e41331c302b1d3f78028c568ade..fa6553d8428219658417b6dc6e87f04db546d2cc 100644 --- a/test/test_npu/test_cdist.py +++ b/test/test_npu/test_network_ops/test_cdist.py @@ -190,5 +190,4 @@ class Testcdist(TestCase): instantiate_device_type_tests(Testcdist, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() diff --git a/test/test_npu/test_cdist_backward.py b/test/test_npu/test_network_ops/test_cdist_backward.py similarity index 99% rename from test/test_npu/test_cdist_backward.py rename to test/test_npu/test_network_ops/test_cdist_backward.py index d0ee34ce400a9198e4307d69978e6b897f4d0af4..a61f69d70e980c70ecec68a2637dcf9a5ec513da 100644 --- a/test/test_npu/test_cdist_backward.py +++ b/test/test_npu/test_network_ops/test_cdist_backward.py @@ -115,5 +115,4 @@ class Testcdist(TestCase): instantiate_device_type_tests(Testcdist, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() diff --git a/test/test_npu/test_celu.py b/test/test_npu/test_network_ops/test_celu.py similarity index 64% rename from test/test_npu/test_celu.py rename to test/test_npu/test_network_ops/test_celu.py index 1dc6a7c49bf27e4a706077f5613617cae9d39611..1ef1fe9f19cb4d27bb3741b8201f2a717d73f84b 100644 --- a/test/test_npu/test_celu.py +++ b/test/test_npu/test_network_ops/test_celu.py @@ -28,27 +28,49 @@ class TestCelu(TestCase): npu_input = torch.from_numpy(input_x) return npu_input - def cpu_op_exec(self, input1, alpha): + def cpu_op_exec_functional(self, input1, alpha): output = torch.nn.functional.celu(input1, alpha=alpha) output = output.numpy() return output - def npu_op_exec(self, input1, alpha): + def npu_op_exec_functional(self, input1, alpha): output = torch.nn.functional.celu(input1, alpha=alpha) output = output.to("cpu") output = output.numpy() return output + + def cpu_op_exec(self, input1, alpha): + 
output = torch.celu(input1, alpha=alpha) + output = output.numpy() + return output - def cpu_op_inplace_exec(self, input1, alpha): + def npu_op_exec(self, input1, alpha): + output = torch.celu(input1, alpha=alpha) + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_inplace_exec_functional(self, input1, alpha): output = torch.nn.functional.celu_(input1, alpha=alpha) output = output.numpy() return output - def npu_op_inplace_exec(self, input1, alpha): + def npu_op_inplace_exec_functional(self, input1, alpha): output = torch.nn.functional.celu_(input1, alpha=alpha) output = output.to("cpu") output = output.numpy() return output + + def cpu_op_inplace_exec(self, input1, alpha): + output = torch.celu_(input1, alpha=alpha) + output = output.numpy() + return output + + def npu_op_inplace_exec(self, input1, alpha): + output = torch.celu_(input1, alpha=alpha) + output = output.to("cpu") + output = output.numpy() + return output def test_celu_3_3_float32_alpha1(self, device): input_x1 = self.generate_data(-1, 1, (3, 3), np.float32) @@ -157,8 +179,58 @@ class TestCelu(TestCase): cpu_output = self.cpu_op_inplace_exec(cpu_input1, 2.0) npu_output = self.npu_op_inplace_exec(npu_input1, 2.0) self.assertRtolEqual(cpu_output, npu_output) + + def test_celu_inplace_shape_format_alpha_range(self, device): + shape_format_alpha_range = [ + # [[dtype, format, shape], alpha, min, max] + [[np.float16, 2, (16, 5, 7, 11)], 5.6, -2, 2], + [[np.float32, 2, (16, 5, 7, 11)], 0.5, -2, 2], + [[np.float32, 2, (16, 5, 7, 11)], 0.7, -2, 2], + [[np.float32, 2, (16, 5, 7, 11)], 2.6, -2, 2], + [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.0078125, 0.0078125], + [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.0078125, 0.0078125], + [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.01, 0.01], + [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 0.5, -2, 2], + [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 5.4, -2, 2], + [[np.float16, 2, (23, 5, 11, 50, 26, 13, 1, 23)], 0.5, -2, 2], + [[np.float16, 2, 
(2560, 17)], 0.5, -2, 2], + [[np.float16, 2, (2560, 17)], 5.4, -2, 2] + ] + for item in shape_format_alpha_range: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[2], item[3]) + alpha = item[1] + npu_output = self.npu_op_inplace_exec(npu_input1, alpha) + if item[0][0] == np.float16: + cpu_output = self.cpu_op_inplace_exec(cpu_input1.float(), alpha).astype(np.float16) + else: + cpu_output = self.cpu_op_inplace_exec(cpu_input1, alpha) + self.assertRtolEqual(cpu_output, npu_output) + + def test_celu_inplace_shape_format_alpha_range(self, device): + shape_format_alpha_range = [ + # [[dtype, format, shape], alpha, min, max] + [[np.float32, 2, (16, 5, 7, 11)], 0.5, -2, 2], + [[np.float32, 2, (16, 5, 7, 11)], 0.7, -2, 2], + [[np.float32, 2, (16, 5, 7, 11)], 2.6, -2, 2], + [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.0078125, 0.0078125], + [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.0078125, 0.0078125], + [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.01, 0.01], + [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.01, 0.01], + [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 0.5, -2, 2], + [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 5.4, -2, 2], + [[np.float16, 2, (2560, 17)], 0.5, -2, 2], + [[np.float16, 2, (2560, 17)], 5.4, -2, 2] + ] + for item in shape_format_alpha_range: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[2], item[3]) + alpha = item[1] + npu_output = self.npu_op_exec(npu_input1, alpha) + if item[0][0] == np.float16: + cpu_output = self.cpu_op_exec(cpu_input1.float(), alpha).astype(np.float16) + else: + cpu_output = self.cpu_op_exec(cpu_input1, alpha) + self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestCelu, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") - run_tests() \ No newline at end of file + run_tests() diff --git a/test/test_npu/test_conv_tbc.py b/test/test_npu/test_network_ops/test_conv_tbc.py similarity index 90% rename from test/test_npu/test_conv_tbc.py rename to 
test/test_npu/test_network_ops/test_conv_tbc.py index aeb8eca4a2500760ae6bc1781a7e0956ffec9d9e..47d799e33ad3c0b215a7bc41a0380faf90812fe8 100644 --- a/test/test_npu/test_conv_tbc.py +++ b/test/test_npu/test_network_ops/test_conv_tbc.py @@ -27,8 +27,6 @@ class TestConvTbc(TestCase): def op_exec_cpu(self, input1, weight, bias, pad): cpu_output = torch.conv_tbc(input1, weight, bias, pad) cpu_output = cpu_output.numpy().astype('float16') - print("===cpu_output===") - print(cpu_output) return cpu_output def op_exec_npu(self, input1, weight, bias, pad): @@ -38,8 +36,6 @@ class TestConvTbc(TestCase): npu_output = torch.conv_tbc(input1, weight, bias, pad) npu_output = npu_output.to("cpu") npu_output = npu_output.numpy().astype('float16') - print("===npu_output===") - print(npu_output) return npu_output def test_conv_tbc_shape_format(self, device): @@ -55,11 +51,8 @@ class TestConvTbc(TestCase): pad = 1 cpu_output = self.op_exec_cpu(cpu_input, cpu_weight, cpu_bias, pad) npu_output = self.op_exec_npu(npu_input, npu_weight, npu_bias, pad) - res = abs((cpu_output - npu_output)/cpu_output) - print(res) self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestConvTbc, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_conv_transpose2d.py b/test/test_npu/test_network_ops/test_conv_transpose2d.py similarity index 63% rename from test/test_npu/test_conv_transpose2d.py rename to test/test_npu/test_network_ops/test_conv_transpose2d.py index e62981ef9af99b89a3c48d03905bc694d0095571..6e9b72b96802b44e25f7dcaaa6dafc7d0fff9bd6 100644 --- a/test/test_npu/test_conv_transpose2d.py +++ b/test/test_npu/test_network_ops/test_conv_transpose2d.py @@ -23,50 +23,62 @@ from util_test import create_common_tensor class TestConvTranspose2d(TestCase): - def cpu_op_exec(self, input, weight): - cpu_output = torch.nn.functional.conv_transpose2d(input, weight,bias=None, stride=1, padding=0, 
output_padding=0, groups=1, dilation=1) + def cpu_op_exec(self, input, weight, groups): + cpu_output = torch.nn.functional.conv_transpose2d(input, weight,bias=None, + stride=1, padding=0, output_padding=0, groups=groups, dilation=1) cpu_output = cpu_output.numpy() return cpu_output - def cpu_op_exec_fp16(self, input, weight): + def cpu_op_exec_fp16(self, input, weight, groups): input = input.to(torch.float32) weight = weight.to(torch.float32) - cpu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) + cpu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, + stride=1, padding=0, output_padding=0, groups=groups, dilation=1) cpu_output = cpu_output.numpy() cpu_output = cpu_output.astype(np.float16) return cpu_output - def npu_op_exec(self, input, weight): + def npu_op_exec(self, input, weight, groups): input = input.to("npu") weight = weight.to("npu") - npu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1) + npu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, + stride=1, padding=0, output_padding=0, groups=groups, dilation=1) npu_output = npu_output.to("cpu").numpy() return npu_output def test_conv_transpose2d(self, device): - shape_format = [ # input, weight - [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]]], - [[np.float16, 3, [1024, 58, 28, 28]], [np.float16, 3, [58, 58, 1, 1]]], - [[np.float16, 4, [1024, 3, 224, 224]], [np.float16, 4, [3, 3, 3, 3]]], - [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]]], - [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]]], - [[np.float16, 4, [1024, 58, 28, 28]], [np.float16, 4, [58, 58, 1, 1]]], - [[np.float16, 0, [1024, 24, 56, 56]], [np.float16, 4, [24, 24, 1, 1]]], - [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 4, [128, 128, 3, 3]]], - [[np.float32, 
4, [256, 3, 224, 224]], [np.float32, 4, [3, 3, 7, 7]]], - [[np.float32, 3, [2, 3, 3, 3]], [np.float32, 4, [3, 1, 3, 3]]], - [[np.float32, 3, [1024, 232, 7, 7]], [np.float32, 4, [232, 232, 1, 1]]], + shape_format = [ + # input, weight + [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 1], + [[np.float16, 3, [1024, 58, 28, 28]], [np.float16, 3, [58, 58, 1, 1]], 1], + [[np.float16, 4, [1024, 3, 224, 224]], [np.float16, 4, [3, 3, 3, 3]], 1], + [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 1], + [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]], 1], + [[np.float16, 4, [1024, 58, 28, 28]], [np.float16, 4, [58, 58, 1, 1]], 1], + [[np.float16, 0, [1024, 24, 56, 56]], [np.float16, 4, [24, 24, 1, 1]], 1], + [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 4, [128, 128, 3, 3]], 1], + [[np.float32, 4, [256, 3, 224, 224]], [np.float32, 4, [3, 3, 7, 7]], 1], + [[np.float32, 3, [2, 3, 3, 3]], [np.float32, 4, [3, 1, 3, 3]], 1], + [[np.float32, 3, [1024, 232, 7, 7]], [np.float32, 4, [232, 232, 1, 1]], 1], + [[np.float16, 3, [1024, 116*3, 14, 14]], [np.float16, 4, [116*3, 150//3, 1, 1]], 3], + [[np.float16, 3, [1024, 58*2, 28, 28]], [np.float16, 3, [58*2, 58//2, 1, 1]], 2], + [[np.float16, 0, [1, 3*3, 224, 224]], [np.float16, 0, [3*3, 1, 3, 3]], 3], + [[np.float16, 0, [1024, 116*4, 14, 14]], [np.float16, 4, [116*4, 116//4, 1, 1]], 4], + [[np.float32, 3, [1024, 116*3, 14, 14]], [np.float32, 4, [116*3, 150//3, 1, 1]], 3], + [[np.float32, 3, [1024, 58*2, 28, 28]], [np.float32, 3, [58*2, 58//2, 1, 1]], 2], + [[np.float32, 0, [1, 3*3, 224, 224]], [np.float32, 0, [3*3, 1, 3, 3]], 3], + [[np.float32, 0, [1024, 116*4, 14, 14]], [np.float32, 4, [116*4, 116//4, 1, 1]], 4], ] for item in shape_format: input_cpu, input_npu = create_common_tensor(item[0], 0, 10) weight_cpu, weight_npu = create_common_tensor(item[1], 0, 10) if input_cpu.dtype == torch.float16: - cpu_output = self.cpu_op_exec_fp16(input_cpu, weight_cpu) + 
cpu_output = self.cpu_op_exec_fp16(input_cpu, weight_cpu, item[-1]) else: - cpu_output = self.cpu_op_exec(input_cpu, weight_cpu) - npu_output = self.npu_op_exec(input_npu, weight_npu) + cpu_output = self.cpu_op_exec(input_cpu, weight_cpu, item[-1]) + npu_output = self.npu_op_exec(input_npu, weight_npu, item[-1]) # fp32精度不足,放宽对其精度要求 self.assertRtolEqual(cpu_output, npu_output, prec=1.e-1) diff --git a/test/test_npu/test_convolution_backward_weight.py b/test/test_npu/test_network_ops/test_convolution_backward_weight.py similarity index 95% rename from test/test_npu/test_convolution_backward_weight.py rename to test/test_npu/test_network_ops/test_convolution_backward_weight.py index de421a9552067d7bf36b3fa07342b4202ecbf83f..beaf25c285e64c0f8c1c81f48ab61f10e3a7f369 100644 --- a/test/test_npu/test_convolution_backward_weight.py +++ b/test/test_npu/test_network_ops/test_convolution_backward_weight.py @@ -17,7 +17,7 @@ import numpy as np import sys import copy from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests +from common_device_type import instantiate_device_type_tests from util_test import create_common_tensor @@ -98,13 +98,12 @@ class TestCudnnConvolutionBackwardWeight(TestCase): item[3], item[4], item[5]) cpu_output = cpu_output.astype(npu_output.dtype) cpu_dweight = cpu_dweight.to(npu_dweight.dtype) - self.assertRtolEqual(cpu_output, npu_output) - self.assertRtolEqual(cpu_dweight, npu_dweight) + self.assertRtolEqual(cpu_output, npu_output, 0.007) + self.assertRtolEqual(cpu_dweight, npu_dweight, 0.003) instantiate_device_type_tests(TestCudnnConvolutionBackwardWeight, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_convolution_transpose_backward_weight.py b/test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py similarity index 99% rename from 
test/test_npu/test_convolution_transpose_backward_weight.py rename to test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py index 76fc807c7166a17b2b23aba6a75a439b3156b93f..99f99300873518835324658c9ef154339769921e 100644 --- a/test/test_npu/test_convolution_transpose_backward_weight.py +++ b/test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py @@ -110,5 +110,4 @@ instantiate_device_type_tests(TestCudnnConvolutionTransposeBackwardWeight, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_copy.py b/test/test_npu/test_network_ops/test_copy.py similarity index 99% rename from test/test_npu/test_copy.py rename to test/test_npu/test_network_ops/test_copy.py index 6b8e062e93f1a14ab549666bb84c6af8e5d72414..cf98bd4150d4c46173990ab6110bf3808883df42 100644 --- a/test/test_npu/test_copy.py +++ b/test/test_npu/test_network_ops/test_copy.py @@ -137,5 +137,4 @@ class TestCopy(TestCase): instantiate_device_type_tests(TestCopy, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() diff --git a/test/test_npu/test_cos.py b/test/test_npu/test_network_ops/test_cos.py similarity index 98% rename from test/test_npu/test_cos.py rename to test/test_npu/test_network_ops/test_cos.py index 6756d47f57c90f7d2484bf5656958c02180acc00..b247cd0a0f2d8de61375e800d1b7b724a82d0f17 100644 --- a/test/test_npu/test_cos.py +++ b/test/test_npu/test_network_ops/test_cos.py @@ -62,5 +62,4 @@ class TestCos(TestCase): instantiate_device_type_tests(TestCos, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_cosh.py b/test/test_npu/test_network_ops/test_cosh.py similarity index 99% rename from test/test_npu/test_cosh.py rename to test/test_npu/test_network_ops/test_cosh.py index 
1ba58569b7543cd3b0cacb7508dec7a4f629e378..8ac14a077f5e7f7be6f3be5347d6f822e2aa3167 100644 --- a/test/test_npu/test_cosh.py +++ b/test/test_npu/test_network_ops/test_cosh.py @@ -145,5 +145,4 @@ class TestCosh(TestCase): instantiate_device_type_tests(TestCosh, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_cosinesimilarity.py b/test/test_npu/test_network_ops/test_cosinesimilarity.py similarity index 100% rename from test/test_npu/test_cosinesimilarity.py rename to test/test_npu/test_network_ops/test_cosinesimilarity.py diff --git a/test/test_npu/test_cross.py b/test/test_npu/test_network_ops/test_cross.py similarity index 100% rename from test/test_npu/test_cross.py rename to test/test_npu/test_network_ops/test_cross.py diff --git a/test/test_npu/test_cudnn_convolution_backward_bias.py b/test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py similarity index 99% rename from test/test_npu/test_cudnn_convolution_backward_bias.py rename to test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py index 6e274874701e6cbb40cf14d7515fd5941a6b6c57..51168f2b716310cc0bedddcf20bd0d3a4dd97ddf 100644 --- a/test/test_npu/test_cudnn_convolution_backward_bias.py +++ b/test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py @@ -91,5 +91,4 @@ class TestCudnnConvolutionBackwardBias(TestCase): instantiate_device_type_tests(TestCudnnConvolutionBackwardBias, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() diff --git a/test/test_npu/test_cudnn_convolution_transpose_backward_bias.py b/test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py similarity index 99% rename from test/test_npu/test_cudnn_convolution_transpose_backward_bias.py rename to test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py index 
f5271d4197ac72fb5834481e3f74d22e90b78a29..aa912c86bb3329319ac0f7acb613a6ab382a9d60 100644 --- a/test/test_npu/test_cudnn_convolution_transpose_backward_bias.py +++ b/test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py @@ -91,5 +91,4 @@ class TestCudnnConvolutionTransposeBackwardBias(TestCase): instantiate_device_type_tests(TestCudnnConvolutionTransposeBackwardBias, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() diff --git a/test/test_npu/test_cumprod.py b/test/test_npu/test_network_ops/test_cumprod.py similarity index 100% rename from test/test_npu/test_cumprod.py rename to test/test_npu/test_network_ops/test_cumprod.py diff --git a/test/test_npu/test_cumsum.py b/test/test_npu/test_network_ops/test_cumsum.py similarity index 100% rename from test/test_npu/test_cumsum.py rename to test/test_npu/test_network_ops/test_cumsum.py diff --git a/test/test_npu/test_dim_arange.py b/test/test_npu/test_network_ops/test_dim_arange.py similarity index 100% rename from test/test_npu/test_dim_arange.py rename to test/test_npu/test_network_ops/test_dim_arange.py diff --git a/test/test_npu/test_diml.py b/test/test_npu/test_network_ops/test_diml.py similarity index 100% rename from test/test_npu/test_diml.py rename to test/test_npu/test_network_ops/test_diml.py diff --git a/test/test_npu/test_dirichlet_grad.py b/test/test_npu/test_network_ops/test_dirichlet_grad.py similarity index 100% rename from test/test_npu/test_dirichlet_grad.py rename to test/test_npu/test_network_ops/test_dirichlet_grad.py diff --git a/test/test_npu/test_dot.py b/test/test_npu/test_network_ops/test_dot.py similarity index 99% rename from test/test_npu/test_dot.py rename to test/test_npu/test_network_ops/test_dot.py index 74edec125353e8555763d876fcd60f98c492f668..44b17ab5f7a62e43521623789d0ccc51c3d74d4e 100644 --- a/test/test_npu/test_dot.py +++ b/test/test_npu/test_network_ops/test_dot.py @@ -96,6 +96,5 @@ class 
TestDot(TestCase): instantiate_device_type_tests(TestDot, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() diff --git a/test/test_npu/test_elu.py b/test/test_npu/test_network_ops/test_elu.py similarity index 100% rename from test/test_npu/test_elu.py rename to test/test_npu/test_network_ops/test_elu.py diff --git a/test/test_npu/test_embedding.py b/test/test_npu/test_network_ops/test_embedding.py similarity index 100% rename from test/test_npu/test_embedding.py rename to test/test_npu/test_network_ops/test_embedding.py diff --git a/test/test_npu/test_equal.py b/test/test_npu/test_network_ops/test_equal.py similarity index 100% rename from test/test_npu/test_equal.py rename to test/test_npu/test_network_ops/test_equal.py diff --git a/test/test_npu/test_erf.py b/test/test_npu/test_network_ops/test_erf.py similarity index 99% rename from test/test_npu/test_erf.py rename to test/test_npu/test_network_ops/test_erf.py index 6cc76f3f5861885cc8a58244adf1f0fd8b30bb96..e35bdb796394b60351e05325de5839f3530341c0 100644 --- a/test/test_npu/test_erf.py +++ b/test/test_npu/test_network_ops/test_erf.py @@ -113,5 +113,4 @@ class TestErf(TestCase): instantiate_device_type_tests(TestErf, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_erfinv.py b/test/test_npu/test_network_ops/test_erfinv.py similarity index 99% rename from test/test_npu/test_erfinv.py rename to test/test_npu/test_network_ops/test_erfinv.py index 8eb7e68bfdd6ad87b91f6bc5cd16ecd7b0a8ecf3..4e033a4e85bdb57ad410ebc0c9e659fbe5bab059 100644 --- a/test/test_npu/test_erfinv.py +++ b/test/test_npu/test_network_ops/test_erfinv.py @@ -120,5 +120,4 @@ class TestErfinv(TestCase): instantiate_device_type_tests(TestErfinv, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_expm1.py 
b/test/test_npu/test_network_ops/test_expm1.py similarity index 99% rename from test/test_npu/test_expm1.py rename to test/test_npu/test_network_ops/test_expm1.py index 52899245f82e934699b5cd9c513aa3bb9a6b5d8e..6b8cdafc746d2b10566a203323fc7a394bab27d6 100644 --- a/test/test_npu/test_expm1.py +++ b/test/test_npu/test_network_ops/test_expm1.py @@ -165,5 +165,4 @@ class TestExpm1(TestCase): instantiate_device_type_tests(TestExpm1, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_eye.py b/test/test_npu/test_network_ops/test_eye.py similarity index 99% rename from test/test_npu/test_eye.py rename to test/test_npu/test_network_ops/test_eye.py index e642baaa30063e78ca77bee5b26e2dc35c1c36df..03f4021cdae673a40b5ea13cda2bd435d6463523 100644 --- a/test/test_npu/test_eye.py +++ b/test/test_npu/test_network_ops/test_eye.py @@ -141,5 +141,4 @@ class TestEye(TestCase): instantiate_device_type_tests(TestEye, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_floordivide.py b/test/test_npu/test_network_ops/test_floordivide.py similarity index 100% rename from test/test_npu/test_floordivide.py rename to test/test_npu/test_network_ops/test_floordivide.py diff --git a/test/test_npu/test_frac.py b/test/test_npu/test_network_ops/test_frac.py similarity index 99% rename from test/test_npu/test_frac.py rename to test/test_npu/test_network_ops/test_frac.py index dcb781a8d36ba235fc2383921d6b1121c28bc71e..4929dba2892ebec58db5037c0c191c8451983870 100644 --- a/test/test_npu/test_frac.py +++ b/test/test_npu/test_network_ops/test_frac.py @@ -147,5 +147,4 @@ class TestFrac(TestCase): instantiate_device_type_tests(TestFrac, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_frobenius_norm.py 
b/test/test_npu/test_network_ops/test_frobenius_norm.py similarity index 99% rename from test/test_npu/test_frobenius_norm.py rename to test/test_npu/test_network_ops/test_frobenius_norm.py index 202974470b3381bf1816f8f87b1815cc64fea973..bd383adb49469aaee56e5706ce41700657b29e06 100644 --- a/test/test_npu/test_frobenius_norm.py +++ b/test/test_npu/test_network_ops/test_frobenius_norm.py @@ -172,5 +172,4 @@ class TestFrobenius_norm(TestCase): self.assertRtolEqual(cpu_output, npu_output) instantiate_device_type_tests(TestFrobenius_norm, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:7") run_tests() diff --git a/test/test_npu/test_full_like.py b/test/test_npu/test_network_ops/test_full_like.py similarity index 99% rename from test/test_npu/test_full_like.py rename to test/test_npu/test_network_ops/test_full_like.py index 36d5f6378f13c8b320d2afc2b826afdcb2e16d14..26b62c6ee121db582d3a23c28b8c1404682f306c 100644 --- a/test/test_npu/test_full_like.py +++ b/test/test_npu/test_network_ops/test_full_like.py @@ -84,5 +84,4 @@ class TestFullLike(TestCase): instantiate_device_type_tests(TestFullLike, globals(), except_for='cpu') if __name__ == '__main__': - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_gelu.py b/test/test_npu/test_network_ops/test_gelu.py similarity index 100% rename from test/test_npu/test_gelu.py rename to test/test_npu/test_network_ops/test_gelu.py diff --git a/test/test_npu/test_glu.py b/test/test_npu/test_network_ops/test_glu.py similarity index 98% rename from test/test_npu/test_glu.py rename to test/test_npu/test_network_ops/test_glu.py index 85167f0ab8d97ee3196b61c044bc2926775359fc..073ad2deb620f52127eb81f6143004a359a2beb0 100644 --- a/test/test_npu/test_glu.py +++ b/test/test_npu/test_network_ops/test_glu.py @@ -77,5 +77,4 @@ class TestGlu(TestCase): instantiate_device_type_tests(TestGlu, globals(), except_for='cpu') if __name__ == "__main__": - 
torch.npu.set_device("npu:1") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_glugrad.py b/test/test_npu/test_network_ops/test_glugrad.py similarity index 99% rename from test/test_npu/test_glugrad.py rename to test/test_npu/test_network_ops/test_glugrad.py index c2e546bd28907330c7c04ee0234845b91d94dd16..4050bca25c8ee89bc1be24555f2f4566170b9c11 100644 --- a/test/test_npu/test_glugrad.py +++ b/test/test_npu/test_network_ops/test_glugrad.py @@ -87,5 +87,4 @@ class TestGluGrad(TestCase): instantiate_device_type_tests(TestGluGrad, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_grid_sampler_2d.py b/test/test_npu/test_network_ops/test_grid_sampler_2d.py similarity index 98% rename from test/test_npu/test_grid_sampler_2d.py rename to test/test_npu/test_network_ops/test_grid_sampler_2d.py index 655f548aedc5496ac84c7d8b5bf2f77d0561df75..50d6ed9bf0e3628718b06058e7790d2fb77516cf 100644 --- a/test/test_npu/test_grid_sampler_2d.py +++ b/test/test_npu/test_network_ops/test_grid_sampler_2d.py @@ -70,5 +70,4 @@ class TestGridSampler2D(TestCase): instantiate_device_type_tests(TestGridSampler2D, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_grid_sampler_2d_backward.py b/test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py similarity index 98% rename from test/test_npu/test_grid_sampler_2d_backward.py rename to test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py index f5ca5d00307c39de699376b92b643639ff59ba97..0cbfb2721e358a4313abdbcbf7c70f6ac0aca2cc 100644 --- a/test/test_npu/test_grid_sampler_2d_backward.py +++ b/test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py @@ -74,5 +74,4 @@ class TestGridSampler2dBackward(TestCase): instantiate_device_type_tests(TestGridSampler2dBackward, globals(), except_for="cpu") if __name__ == "__main__": - 
torch.npu.set_device("npu:4") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_group_norm.py b/test/test_npu/test_network_ops/test_group_norm.py similarity index 99% rename from test/test_npu/test_group_norm.py rename to test/test_npu/test_network_ops/test_group_norm.py index 3a326b779bba49b2dad7aa7ea0c4ad2983726203..d6c1bd1029bc1e0175303f2473851e90e21ee86c 100644 --- a/test/test_npu/test_group_norm.py +++ b/test/test_npu/test_network_ops/test_group_norm.py @@ -129,5 +129,4 @@ class TestGroupNormExt(TestCase): instantiate_device_type_tests(TestGroupNormExt, globals(), except_for='cpu') if __name__ == '__main__': - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_hamming_window.py b/test/test_npu/test_network_ops/test_hamming_window.py similarity index 99% rename from test/test_npu/test_hamming_window.py rename to test/test_npu/test_network_ops/test_hamming_window.py index 490cf878cbf4d371ee973cc69a02dc9fb1eba8a5..da429c640ec0357c4e477c8ca1c4f9f96b1acf73 100644 --- a/test/test_npu/test_hamming_window.py +++ b/test/test_npu/test_network_ops/test_hamming_window.py @@ -130,5 +130,4 @@ class TestHammingWindow(TestCase): instantiate_device_type_tests(TestHammingWindow, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_hammingwindow.py b/test/test_npu/test_network_ops/test_hammingwindow.py similarity index 100% rename from test/test_npu/test_hammingwindow.py rename to test/test_npu/test_network_ops/test_hammingwindow.py diff --git a/test/test_npu/test_hanning_window.py b/test/test_npu/test_network_ops/test_hanning_window.py similarity index 98% rename from test/test_npu/test_hanning_window.py rename to test/test_npu/test_network_ops/test_hanning_window.py index 30fe1d86a03c78980c96b1d6c6da07df572f8736..16b15caa8687c244ebfb41e45f00119980276ad8 100644 --- a/test/test_npu/test_hanning_window.py +++ 
b/test/test_npu/test_network_ops/test_hanning_window.py @@ -83,5 +83,4 @@ class TestHannWindow(TestCase): instantiate_device_type_tests(TestHannWindow, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_hard_sigmoid_backward.py b/test/test_npu/test_network_ops/test_hard_sigmoid_backward.py similarity index 100% rename from test/test_npu/test_hard_sigmoid_backward.py rename to test/test_npu/test_network_ops/test_hard_sigmoid_backward.py diff --git a/test/test_npu/test_hardshrink.py b/test/test_npu/test_network_ops/test_hardshrink.py similarity index 100% rename from test/test_npu/test_hardshrink.py rename to test/test_npu/test_network_ops/test_hardshrink.py diff --git a/test/test_npu/test_hardshrink_backward.py b/test/test_npu/test_network_ops/test_hardshrink_backward.py similarity index 100% rename from test/test_npu/test_hardshrink_backward.py rename to test/test_npu/test_network_ops/test_hardshrink_backward.py diff --git a/test/test_npu/test_hardsigmoid.py b/test/test_npu/test_network_ops/test_hardsigmoid.py similarity index 100% rename from test/test_npu/test_hardsigmoid.py rename to test/test_npu/test_network_ops/test_hardsigmoid.py diff --git a/test/test_npu/test_hinge_embedding_loss.py b/test/test_npu/test_network_ops/test_hinge_embedding_loss.py similarity index 100% rename from test/test_npu/test_hinge_embedding_loss.py rename to test/test_npu/test_network_ops/test_hinge_embedding_loss.py diff --git a/test/test_npu/test_im2col.py b/test/test_npu/test_network_ops/test_im2col.py similarity index 100% rename from test/test_npu/test_im2col.py rename to test/test_npu/test_network_ops/test_im2col.py diff --git a/test/test_npu/test_index_fill_d.py b/test/test_npu/test_network_ops/test_index_fill_d.py similarity index 100% rename from test/test_npu/test_index_fill_d.py rename to test/test_npu/test_network_ops/test_index_fill_d.py diff --git a/test/test_npu/test_index_select.py 
b/test/test_npu/test_network_ops/test_index_select.py similarity index 100% rename from test/test_npu/test_index_select.py rename to test/test_npu/test_network_ops/test_index_select.py diff --git a/test/test_npu/test_isclose.py b/test/test_npu/test_network_ops/test_isclose.py similarity index 100% rename from test/test_npu/test_isclose.py rename to test/test_npu/test_network_ops/test_isclose.py diff --git a/test/test_npu/test_kthvalue.py b/test/test_npu/test_network_ops/test_kthvalue.py similarity index 99% rename from test/test_npu/test_kthvalue.py rename to test/test_npu/test_network_ops/test_kthvalue.py index 56841fc05d473e0e843204fa4bace92aeca69ac1..35a4a4a9c53731c99cf837b8fcd54819f7607e95 100644 --- a/test/test_npu/test_kthvalue.py +++ b/test/test_npu/test_network_ops/test_kthvalue.py @@ -177,7 +177,6 @@ class TestKthvalues(TestCase): instantiate_device_type_tests(TestKthvalues, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() diff --git a/test/test_npu/test_l1_loss.py b/test/test_npu/test_network_ops/test_l1_loss.py similarity index 100% rename from test/test_npu/test_l1_loss.py rename to test/test_npu/test_network_ops/test_l1_loss.py diff --git a/test/test_npu/test_l1_loss_backward.py b/test/test_npu/test_network_ops/test_l1_loss_backward.py similarity index 100% rename from test/test_npu/test_l1_loss_backward.py rename to test/test_npu/test_network_ops/test_l1_loss_backward.py diff --git a/test/test_npu/test_leaky_relu.py b/test/test_npu/test_network_ops/test_leaky_relu.py similarity index 99% rename from test/test_npu/test_leaky_relu.py rename to test/test_npu/test_network_ops/test_leaky_relu.py index 78de88510ea2ccdacfa82a73d31810cba8b4cf94..67f2bc7fd4434312f676369416c805e85f7b11c4 100644 --- a/test/test_npu/test_leaky_relu.py +++ b/test/test_npu/test_network_ops/test_leaky_relu.py @@ -109,7 +109,6 @@ class TestLeakRelu(TestCase): instantiate_device_type_tests(TestLeakRelu, globals(), except_for="cpu") if 
__name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_network_ops/test_log10.py b/test/test_npu/test_network_ops/test_log10.py new file mode 100644 index 0000000000000000000000000000000000000000..c3cc4226ae48d2e5aeaecbdb5e3ad7ddfd28804c --- /dev/null +++ b/test/test_npu/test_network_ops/test_log10.py @@ -0,0 +1,183 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import numpy as np +import sys +import copy +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +class TestLog10(TestCase): + def cpu_op_exec(self, input1): + output = torch.log10(input1) + output = output.numpy() + return output + + def npu_op_exec(self, input1): + output = torch.log10(input1) + output = output.to("cpu").numpy() + return output + + def npu_op_exec_out(self, input1, input2): + torch.log10(input1, out=input2) + output = input2.to("cpu").numpy() + return output + + def cpu_inp_op_exec(self, input1): + output = torch.log10_(input1) + output = output.numpy() + return output + + def npu_inp_op_exec(self, input1): + torch.log10_(input1) + output = input1.to("cpu").numpy() + return output + + def cpu_inp_uncon_op_exec(self, input1): + input1 = input1.as_strided([2, 2], [1, 2], 2) + output = torch.log10_(input1) + output = output.numpy() + return output + + def 
npu_inp_uncon_op_exec(self, input1): + input1 = input1.as_strided([2, 2], [1, 2], 2) + torch.log10_(input1) + output = input1.to("cpu").numpy() + return output + + def test_log10_shape_format_fp32(self, device): + format_list = [3] + shape_list = [(4, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_shape_format_fp16(self, device): + format_list = [3] + shape_list = [(4, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input1) + npu_output = self.npu_op_exec(npu_input1) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_inp_shape_format_fp32(self, device): + format_list = [3] + shape_list = [(4, 4)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_output = self.cpu_inp_op_exec(cpu_input1) + npu_output = self.npu_inp_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_inp_shape_format_fp16(self, device): + format_list = [3] + shape_list = [(4, 4)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = self.cpu_inp_op_exec(cpu_input1) + npu_output = self.npu_inp_op_exec(npu_input1) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def 
test_log10_inp_uncon_shape_format_fp32(self, device): + format_list = [3] + shape_list = [(8, 6)] + shape_format = [ + [np.float32, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1) + npu_output = self.npu_inp_uncon_op_exec(npu_input1) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_inp_uncon_shape_format_fp16(self, device): + format_list = [3] + shape_list = [(8, 6)] + shape_format = [ + [np.float16, i, j] for i in format_list for j in shape_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1) + npu_output = self.npu_inp_uncon_op_exec(npu_input1) + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_out_float32_shape_format(self, device): + shape_format = [ + [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]], + [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]], + [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]], + [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]], + [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]], + [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]], + [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]], + [[np.float32, 3, [1024]], [np.float32, 3, [1024]]], + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) + cpu_output, npu_output = create_common_tensor(item[1], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec_out(npu_input, npu_output) + self.assertRtolEqual(cpu_output, npu_output) + + def test_log10_out_float16_shape_format(self, device): + shape_format = [ + [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 
7]]], + [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]], + [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]], + [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]], + [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]], + [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]], + [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]], + [[np.float16, 3, [1024]], [np.float16, 3, [1024]]], + ] + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) + cpu_output, npu_output = create_common_tensor(item[1], 0, 100) + if item[0][0] == np.float16: + cpu_input = cpu_input.to(torch.float32) + cpu_output = cpu_output.to(torch.float32) + cpu_output = self.cpu_op_exec(cpu_input) + npu_output = self.npu_op_exec_out(npu_input, npu_output) + if item[0][0] == np.float16: + cpu_output = cpu_output.astype(np.float16) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestLog10, globals(), except_for="cpu") +if __name__ == '__main__': + run_tests() diff --git a/test/test_npu/test_network_ops/test_log1p.py b/test/test_npu/test_network_ops/test_log1p.py index 40486f60d97352ad66283198d742bf017dae3730..75f4c7ac88fbe4e40ebb2cfe8212a189df908c4b 100644 --- a/test/test_npu/test_network_ops/test_log1p.py +++ b/test/test_npu/test_network_ops/test_log1p.py @@ -92,6 +92,5 @@ class TestLog1p(TestCase): instantiate_device_type_tests(TestLog1p, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_log_sigmoid.py b/test/test_npu/test_network_ops/test_log_sigmoid.py similarity index 98% rename from test/test_npu/test_log_sigmoid.py rename to test/test_npu/test_network_ops/test_log_sigmoid.py index b039f77522aeb20df58e0097b9e023bb6e31e445..e2c0ffdfeac1a7472508a4c792a594543b81935f 100644 --- a/test/test_npu/test_log_sigmoid.py +++ b/test/test_npu/test_network_ops/test_log_sigmoid.py @@ -70,5 +70,4 @@ class 
TestLogsigmoid(TestCase): instantiate_device_type_tests(TestLogsigmoid, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_log_sigmoid_backward.py b/test/test_npu/test_network_ops/test_log_sigmoid_backward.py similarity index 98% rename from test/test_npu/test_log_sigmoid_backward.py rename to test/test_npu/test_network_ops/test_log_sigmoid_backward.py index b94e607a2eb2f750166e174cf000974583b61d30..63ae01b4ff0cdde70005886ca5199a724e2f1c75 100644 --- a/test/test_npu/test_log_sigmoid_backward.py +++ b/test/test_npu/test_network_ops/test_log_sigmoid_backward.py @@ -92,5 +92,4 @@ class TestLogSigmoidBackward(TestCase): instantiate_device_type_tests( TestLogSigmoidBackward, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_logical_not.py b/test/test_npu/test_network_ops/test_logical_not.py similarity index 98% rename from test/test_npu/test_logical_not.py rename to test/test_npu/test_network_ops/test_logical_not.py index 865cdf073a66a280cff6159a4af8e1036f81b4ea..031e206dd2494e900a1341b88404aeacaced5183 100644 --- a/test/test_npu/test_logical_not.py +++ b/test/test_npu/test_network_ops/test_logical_not.py @@ -63,6 +63,5 @@ class TestLogicalNot(TestCase): instantiate_device_type_tests(TestLogicalNot, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_logsigmoid.py b/test/test_npu/test_network_ops/test_logsigmoid.py similarity index 98% rename from test/test_npu/test_logsigmoid.py rename to test/test_npu/test_network_ops/test_logsigmoid.py index d7732766935feb421513a54b5b8b95db61d9245a..6c02c6fa756b136771672e82b93a0ef35118184a 100644 --- a/test/test_npu/test_logsigmoid.py +++ b/test/test_npu/test_network_ops/test_logsigmoid.py @@ -53,5 +53,4 @@ class TestLogsigmoid(TestCase): instantiate_device_type_tests(TestLogsigmoid, globals(), 
except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_logsigmoidforward.py b/test/test_npu/test_network_ops/test_logsigmoidforward.py similarity index 98% rename from test/test_npu/test_logsigmoidforward.py rename to test/test_npu/test_network_ops/test_logsigmoidforward.py index ae072d622a2371fa3d7c32ac3f296556aa2311fc..ae1bd6ade46748a1311ee5d28188dd5293283ae1 100644 --- a/test/test_npu/test_logsigmoidforward.py +++ b/test/test_npu/test_network_ops/test_logsigmoidforward.py @@ -67,5 +67,4 @@ class TestLogsigmoidForward(TestCase): instantiate_device_type_tests(TestLogsigmoidForward, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_logsumexp.py b/test/test_npu/test_network_ops/test_logsumexp.py similarity index 100% rename from test/test_npu/test_logsumexp.py rename to test/test_npu/test_network_ops/test_logsumexp.py diff --git a/test/test_npu/test_network_ops/test_masked_fill_range.py b/test/test_npu/test_network_ops/test_masked_fill_range.py new file mode 100644 index 0000000000000000000000000000000000000000..69cd04dc1ff98263d0f1da7df4a3920b4463564d --- /dev/null +++ b/test/test_npu/test_network_ops/test_masked_fill_range.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from torch.cuda import device +import torch +import numpy as np +import copy +import sys +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +class TestMaskedFillRange(TestCase): + def cpu_op_exec(self, input1, start, end, value, axis, dim): + out = input1.clone() + start_shape = start.shape + if dim == 1: + for i in range(0, start_shape[0]): + for j in range(0, start_shape[1]): + for k in range(start[i, j], end[i, j]): + out[k] = value[i] + if dim == 2: + for i in range(0, start_shape[0]): + for j in range(0, start_shape[1]): + for k in range(start[i, j], end[i, j]): + if axis == 0: + out[k, :] = value[i] + else: + out[j, k] = value[i] + if dim == 3: + for i in range(0, start_shape[0]): + for j in range(0, start_shape[1]): + for k in range(start[i, j], end[i, j]): + if axis == 0: + out[k, :, :] = value[i] + elif axis == 1: + out[:, k, :] = value[i] + else: + out[j, :, k] = value[i] + return out + + def npu_op_exec(self, input1, start, end, value, axis): + out = torch.npu_masked_fill_range(input1, start, end, value, axis) + out = out.to("cpu") + return out.detach().numpy() + + def test_normalize_batch(self, device): + # TODO(ascend): 该算子还存在泛化问题, 目前保证模型场景没问题 + # Note: 以下为模型用例:测试通过 + shape_format = [ + [[np.float32, -1, [32, 64, 1688]], + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], + [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.float32], 2], + [[np.float16, -1, [32, 64, 1688]], + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], + [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.float16], 2], + [[np.int32, -1, [32, 
64, 1688]], + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], + [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.int32], 2], + [[np.int8, -1, [32, 64, 1688]], + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]], + [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.int8], 2], + ] + for item in shape_format: + axis = item[-1] + cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100) + shape = item[0][-1] + cpu_start = torch.tensor(item[1], dtype=torch.int32) + npu_start = cpu_start.npu() + cpu_end = torch.tensor(item[2], dtype=torch.int32) + npu_end = cpu_end.npu() + cpu_value = torch.tensor(item[3][0], dtype=item[3][1]) + npu_value = cpu_value.npu() + cpu_output = self.cpu_op_exec(cpu_input1, cpu_start, cpu_end, cpu_value, axis, len(shape)) + npu_output = self.npu_op_exec(npu_input1, npu_start, npu_end, npu_value, axis) + cpu_output = cpu_output.numpy() + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestMaskedFillRange, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_max_pool2d_backward.py b/test/test_npu/test_network_ops/test_max_pool2d_backward.py similarity index 100% rename from test/test_npu/test_max_pool2d_backward.py rename to test/test_npu/test_network_ops/test_max_pool2d_backward.py diff --git a/test/test_npu/test_miopen_batch_norm.py b/test/test_npu/test_network_ops/test_miopen_batch_norm.py similarity index 100% rename from test/test_npu/test_miopen_batch_norm.py rename to test/test_npu/test_network_ops/test_miopen_batch_norm.py diff --git 
a/test/test_npu/test_miopen_batch_norm_backward.py b/test/test_npu/test_network_ops/test_miopen_batch_norm_backward.py similarity index 100% rename from test/test_npu/test_miopen_batch_norm_backward.py rename to test/test_npu/test_network_ops/test_miopen_batch_norm_backward.py diff --git a/test/test_npu/test_miopen_convolution.py b/test/test_npu/test_network_ops/test_miopen_convolution.py similarity index 99% rename from test/test_npu/test_miopen_convolution.py rename to test/test_npu/test_network_ops/test_miopen_convolution.py index 8583f61d49fd4ab3d414a4392b4c0114a76b8c79..0e8267c72ead13e376dc62a6fa6c0b0f004b2872 100644 --- a/test/test_npu/test_miopen_convolution.py +++ b/test/test_npu/test_network_ops/test_miopen_convolution.py @@ -77,5 +77,4 @@ class TestMiopenConvolution(TestCase): instantiate_device_type_tests(TestMiopenConvolution, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_miopen_convolution_backward.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward.py similarity index 99% rename from test/test_npu/test_miopen_convolution_backward.py rename to test/test_npu/test_network_ops/test_miopen_convolution_backward.py index 0aaa54c061bf2e3fd3d4d38412674c1432ea7b1b..07a4d88d2ef2efc3e4635743fcf9cf8160ba8e63 100644 --- a/test/test_npu/test_miopen_convolution_backward.py +++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward.py @@ -120,5 +120,4 @@ class TestMiopenConvolutionBackward(TestCase): instantiate_device_type_tests(TestMiopenConvolutionBackward, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_miopen_convolution_backward_bias.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py similarity index 99% rename from test/test_npu/test_miopen_convolution_backward_bias.py rename to 
test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py index 00259de601a33c79e18a527dc1c503c3950844d6..8d92d5f9b6d0559f46aa500e799ff216087d8a3b 100644 --- a/test/test_npu/test_miopen_convolution_backward_bias.py +++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py @@ -114,5 +114,4 @@ class TestMiopenConvolutionBackwardBias(TestCase): instantiate_device_type_tests(TestMiopenConvolutionBackwardBias, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_miopen_convolution_backward_input.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py similarity index 99% rename from test/test_npu/test_miopen_convolution_backward_input.py rename to test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py index 63e1282dc7d48f0ef944129637c947bfc0fb70a2..94da01163d9a15ae4e2294ac92c9f5e451f72d55 100644 --- a/test/test_npu/test_miopen_convolution_backward_input.py +++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py @@ -114,5 +114,4 @@ class TestMiopenConvolutionBackwardInput(TestCase): instantiate_device_type_tests(TestMiopenConvolutionBackwardInput, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_miopen_convolution_backward_weight.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py similarity index 99% rename from test/test_npu/test_miopen_convolution_backward_weight.py rename to test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py index 64dca8cbff986c77839ed5f14726bfcf44cc0f37..4943a6efe47472bdf17da761ae69050785ecba2f 100644 --- a/test/test_npu/test_miopen_convolution_backward_weight.py +++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py @@ -115,6 +115,5 @@ class TestMiopenConvolutionBackwardWeight(TestCase): 
instantiate_device_type_tests(TestMiopenConvolutionBackwardWeight, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_mkldnn_adaptive_avg_pool2d.py b/test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py similarity index 98% rename from test/test_npu/test_mkldnn_adaptive_avg_pool2d.py rename to test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py index 554f8ba38c182e3eb69a705aa957493f310a35a9..c0edc4c37db7c50a9dc1322944d0f880b7b09de0 100644 --- a/test/test_npu/test_mkldnn_adaptive_avg_pool2d.py +++ b/test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py @@ -77,5 +77,4 @@ class TestMkldnnAdaptiveAvgPool2d(TestCase): instantiate_device_type_tests(TestMkldnnAdaptiveAvgPool2d, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_mkldnn_convolution_backward.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py similarity index 99% rename from test/test_npu/test_mkldnn_convolution_backward.py rename to test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py index 8de7467f58f72343da91e45ee262eb460d63e7c6..f298d09fd973677689b0b0068fa8802ad23aafd0 100644 --- a/test/test_npu/test_mkldnn_convolution_backward.py +++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py @@ -159,5 +159,4 @@ class TestMkldnnConvolutionBackward(TestCase): instantiate_device_type_tests(TestMkldnnConvolutionBackward, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_mkldnn_convolution_backward_input.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py similarity index 99% rename from test/test_npu/test_mkldnn_convolution_backward_input.py rename to 
test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py index 7a90b52bc48ff601bbc6b33097469142d8df1104..7ee1961f5e1c958bad399c1de92c9cf38e8d6f1f 100644 --- a/test/test_npu/test_mkldnn_convolution_backward_input.py +++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py @@ -150,5 +150,4 @@ class TestMkldnnConvolutionBackwardInput(TestCase): instantiate_device_type_tests(TestMkldnnConvolutionBackwardInput, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() diff --git a/test/test_npu/test_mkldnn_convolution_backward_weights.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py similarity index 99% rename from test/test_npu/test_mkldnn_convolution_backward_weights.py rename to test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py index 5bf471a52c59994e0667e4416771282d0129a405..65e0bf8abda4b4a2fc9b659eefb115eb0cd66183 100644 --- a/test/test_npu/test_mkldnn_convolution_backward_weights.py +++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py @@ -157,5 +157,4 @@ class TestMkldnnConvolutionBackwardWeights(TestCase): instantiate_device_type_tests(TestMkldnnConvolutionBackwardWeights, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_multilabel_margin_loss.py b/test/test_npu/test_network_ops/test_multilabel_margin_loss.py similarity index 100% rename from test/test_npu/test_multilabel_margin_loss.py rename to test/test_npu/test_network_ops/test_multilabel_margin_loss.py diff --git a/test/test_npu/test_multinomial.py b/test/test_npu/test_network_ops/test_multinomial.py similarity index 100% rename from test/test_npu/test_multinomial.py rename to test/test_npu/test_network_ops/test_multinomial.py diff --git a/test/test_npu/test_narrow_copy.py b/test/test_npu/test_network_ops/test_narrow_copy.py similarity 
index 100% rename from test/test_npu/test_narrow_copy.py rename to test/test_npu/test_network_ops/test_narrow_copy.py diff --git a/test/test_npu/test_nllloss2d.py b/test/test_npu/test_network_ops/test_nllloss2d.py similarity index 100% rename from test/test_npu/test_nllloss2d.py rename to test/test_npu/test_network_ops/test_nllloss2d.py diff --git a/test/test_npu/test_norm_except_dim.py b/test/test_npu/test_network_ops/test_norm_except_dim.py similarity index 96% rename from test/test_npu/test_norm_except_dim.py rename to test/test_npu/test_network_ops/test_norm_except_dim.py index c1555ee23a99d961b756563b8f23a0320296c34d..f75c7cc6ef0dbe6b68f12f1c5d1db311a4cff1d4 100644 --- a/test/test_npu/test_norm_except_dim.py +++ b/test/test_npu/test_network_ops/test_norm_except_dim.py @@ -17,7 +17,7 @@ import numpy as np import math import random from torch._six import nan -from common_utils import TestCase, iter_indices, run_tests +from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests @@ -73,5 +73,4 @@ class TestNormExceptDim(TestCase): instantiate_device_type_tests(TestNormExceptDim, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_norm_ext.py b/test/test_npu/test_network_ops/test_norm_ext.py similarity index 96% rename from test/test_npu/test_norm_ext.py rename to test/test_npu/test_network_ops/test_norm_ext.py index bf3aac19f9f8c1ab8e7882d3733448f75296582e..8e5e51224cf3493e38b3b1e0fb40300874c678eb 100644 --- a/test/test_npu/test_norm_ext.py +++ b/test/test_npu/test_network_ops/test_norm_ext.py @@ -73,7 +73,7 @@ class TestNorm(TestCase): def test_norm_shape_format_2(self, device): shape_format = [ - [[np.float16, 0, (12, 33)]], + # [[np.float16, 0, (12, 33)]], # result error [[np.float32, 0, (12, 33)]], ] for item in shape_format: @@ -82,12 +82,12 @@ class TestNorm(TestCase): cpu_input = 
cpu_input.to(torch.float32) cpu_output = self.cpu_out_exec(cpu_input, 2, [0], False, torch.float) npu_output = self.npu_out_exec(npu_input, 2, [0], False, torch.float) - cpu_output = cpu_output.to(npu_output.dtype) + npu_output = npu_output.to(cpu_output.dtype) self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy()) def test_norm_shape_format_3(self, device): shape_format = [ - [[np.float16, 0, (10, 24, 56, 2048)]], + # [[np.float16, 0, (10, 24, 56, 2048)]], # result error [[np.float32, 0, (10, 24, 56, 2048)]], ] for item in shape_format: @@ -127,5 +127,4 @@ class TestNorm(TestCase): instantiate_device_type_tests(TestNorm, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_one_hot.py b/test/test_npu/test_network_ops/test_one_hot.py similarity index 98% rename from test/test_npu/test_one_hot.py rename to test/test_npu/test_network_ops/test_one_hot.py index f9d69381841b95c917f9e0d48e7930aa9d7231ce..ce72ccb38b2333f5501d65681e7cacdc5897bce5 100644 --- a/test/test_npu/test_one_hot.py +++ b/test/test_npu/test_network_ops/test_one_hot.py @@ -89,6 +89,5 @@ class TestOneHot(TestCase): instantiate_device_type_tests(TestOneHot, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() diff --git a/test/test_npu/test_ones.py b/test/test_npu/test_network_ops/test_ones.py similarity index 100% rename from test/test_npu/test_ones.py rename to test/test_npu/test_network_ops/test_ones.py diff --git a/test/test_npu/test_pdist.py b/test/test_npu/test_network_ops/test_pdist.py similarity index 99% rename from test/test_npu/test_pdist.py rename to test/test_npu/test_network_ops/test_pdist.py index 6fc2dd65a601d05dfe48a57507179fa6ef4c6a19..03e2fbfe1daafd1461fc0c669bbad4eccc173db8 100644 --- a/test/test_npu/test_pdist.py +++ b/test/test_npu/test_network_ops/test_pdist.py @@ -139,5 +139,4 @@ class TestPdist(TestCase): 
instantiate_device_type_tests(TestPdist, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_pixel_shuffle.py b/test/test_npu/test_network_ops/test_pixel_shuffle.py similarity index 99% rename from test/test_npu/test_pixel_shuffle.py rename to test/test_npu/test_network_ops/test_pixel_shuffle.py index fa35bae0802c0fa438ff28b1c59f9a2bf5cec410..281f10e3c6cac75b9d4344bff34d4e0c980a0e9e 100644 --- a/test/test_npu/test_pixel_shuffle.py +++ b/test/test_npu/test_network_ops/test_pixel_shuffle.py @@ -91,5 +91,4 @@ class TestPixel_shuffle(TestCase): instantiate_device_type_tests(TestPixel_shuffle, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:7") run_tests() diff --git a/test/test_npu/test_network_ops/test_pooling.py b/test/test_npu/test_network_ops/test_pooling.py index fd8cd7d0daf17cb6c0cc75ef09e81de8db1ea5dc..9aa54b3839f98e9ef37acdbf3e72599be44fa8ba 100644 --- a/test/test_npu/test_network_ops/test_pooling.py +++ b/test/test_npu/test_network_ops/test_pooling.py @@ -53,5 +53,4 @@ class TestPooling(TestCase): instantiate_device_type_tests(TestPooling, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() diff --git a/test/test_npu/test_prelu.py b/test/test_npu/test_network_ops/test_prelu.py similarity index 98% rename from test/test_npu/test_prelu.py rename to test/test_npu/test_network_ops/test_prelu.py index 9b4079dd87edf26e113352347e47cc3945414008..da643db7ad860f440952f2d55f63ccd5da4827d0 100644 --- a/test/test_npu/test_prelu.py +++ b/test/test_npu/test_network_ops/test_prelu.py @@ -57,5 +57,4 @@ class TestPrelu(TestCase): instantiate_device_type_tests(TestPrelu, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_prelu_backward.py 
b/test/test_npu/test_network_ops/test_prelu_backward.py similarity index 99% rename from test/test_npu/test_prelu_backward.py rename to test/test_npu/test_network_ops/test_prelu_backward.py index d058a0616587b197ffb7cfd023332325cedcc7ed..07a5e9d64859b7d1412fffc38c6a86451d8c08d2 100644 --- a/test/test_npu/test_prelu_backward.py +++ b/test/test_npu/test_network_ops/test_prelu_backward.py @@ -88,5 +88,4 @@ class TestPreluBackward(TestCase): instantiate_device_type_tests(TestPreluBackward, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_quantize_per_channel.py b/test/test_npu/test_network_ops/test_quantize_per_channel.py similarity index 100% rename from test/test_npu/test_quantize_per_channel.py rename to test/test_npu/test_network_ops/test_quantize_per_channel.py diff --git a/test/test_npu/test_quantize_per_tensor.py b/test/test_npu/test_network_ops/test_quantize_per_tensor.py similarity index 100% rename from test/test_npu/test_quantize_per_tensor.py rename to test/test_npu/test_network_ops/test_quantize_per_tensor.py diff --git a/test/test_npu/test_real.py b/test/test_npu/test_network_ops/test_real.py similarity index 100% rename from test/test_npu/test_real.py rename to test/test_npu/test_network_ops/test_real.py diff --git a/test/test_npu/test_renorm.py b/test/test_npu/test_network_ops/test_renorm.py similarity index 99% rename from test/test_npu/test_renorm.py rename to test/test_npu/test_network_ops/test_renorm.py index a1c258f913ab5b59e839a20f7cbcfcf9d92f73d7..13cedf07d8effbd73a16410582b2e0bae1bfe8f9 100644 --- a/test/test_npu/test_renorm.py +++ b/test/test_npu/test_network_ops/test_renorm.py @@ -269,5 +269,4 @@ class TestRenorm(TestCase): instantiate_device_type_tests(TestRenorm, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:0") run_tests() diff --git a/test/test_npu/test_repeat_interleave.py 
b/test/test_npu/test_network_ops/test_repeat_interleave.py similarity index 100% rename from test/test_npu/test_repeat_interleave.py rename to test/test_npu/test_network_ops/test_repeat_interleave.py diff --git a/test/test_npu/test_network_ops/test_resize_.py b/test/test_npu/test_network_ops/test_resize_.py index b525ead0055c2fca80e9587ee003dfc18397cb04..bafaff5300e2777698a8fbc57f1e899c1f305bd9 100644 --- a/test/test_npu/test_network_ops/test_resize_.py +++ b/test/test_npu/test_network_ops/test_resize_.py @@ -66,5 +66,4 @@ class TestResize(TestCase): instantiate_device_type_tests(TestResize, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_roll.py b/test/test_npu/test_network_ops/test_roll.py similarity index 100% rename from test/test_npu/test_roll.py rename to test/test_npu/test_network_ops/test_roll.py diff --git a/test/test_npu/test_selu.py b/test/test_npu/test_network_ops/test_selu.py similarity index 100% rename from test/test_npu/test_selu.py rename to test/test_npu/test_network_ops/test_selu.py diff --git a/test/test_npu/test_sinh.py b/test/test_npu/test_network_ops/test_sinh.py similarity index 100% rename from test/test_npu/test_sinh.py rename to test/test_npu/test_network_ops/test_sinh.py diff --git a/test/test_npu/test_slow_conv_dilated2d.py b/test/test_npu/test_network_ops/test_slow_conv_dilated2d.py similarity index 100% rename from test/test_npu/test_slow_conv_dilated2d.py rename to test/test_npu/test_network_ops/test_slow_conv_dilated2d.py diff --git a/test/test_npu/test_slow_conv_dilated2d_backward.py b/test/test_npu/test_network_ops/test_slow_conv_dilated2d_backward.py similarity index 100% rename from test/test_npu/test_slow_conv_dilated2d_backward.py rename to test/test_npu/test_network_ops/test_slow_conv_dilated2d_backward.py diff --git a/test/test_npu/test_slow_conv_transpose2d.py 
b/test/test_npu/test_network_ops/test_slow_conv_transpose2d.py similarity index 100% rename from test/test_npu/test_slow_conv_transpose2d.py rename to test/test_npu/test_network_ops/test_slow_conv_transpose2d.py diff --git a/test/test_npu/test_slow_conv_transpose2d_backward.py b/test/test_npu/test_network_ops/test_slow_conv_transpose2d_backward.py similarity index 100% rename from test/test_npu/test_slow_conv_transpose2d_backward.py rename to test/test_npu/test_network_ops/test_slow_conv_transpose2d_backward.py diff --git a/test/test_npu/test_slow_conv_transpose3d.py b/test/test_npu/test_network_ops/test_slow_conv_transpose3d.py similarity index 98% rename from test/test_npu/test_slow_conv_transpose3d.py rename to test/test_npu/test_network_ops/test_slow_conv_transpose3d.py index ca8bf35b8c3a2ad45dd5db340e6ed6ffcd66d648..be25249f9bba4b7985f7171f29f3bc74f62031d9 100644 --- a/test/test_npu/test_slow_conv_transpose3d.py +++ b/test/test_npu/test_network_ops/test_slow_conv_transpose3d.py @@ -56,5 +56,4 @@ class TestSlowConvTranspose3d(TestCase): instantiate_device_type_tests(TestSlowConvTranspose3d, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_soft_margin_loss.py b/test/test_npu/test_network_ops/test_soft_margin_loss.py similarity index 96% rename from test/test_npu/test_soft_margin_loss.py rename to test/test_npu/test_network_ops/test_soft_margin_loss.py index fabfe9147f61d737dd5c5f1b994c87f928d4a4d8..a83172e56db3bd4c6b6247a2b622b1f03bd277c0 100644 --- a/test/test_npu/test_soft_margin_loss.py +++ b/test/test_npu/test_network_ops/test_soft_margin_loss.py @@ -111,8 +111,8 @@ class TestSoftMarginLoss(TestCase): def test_soft_margin_loss_float32_none(self, device): npu_input1, npu_input2 = self.generate_data(-2, 2, (25, 25, 25), (25, 1, 25), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum") - npu_output = self.npu_op_exec(npu_input1, 
npu_input2, "sum") + cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none") + npu_output = self.npu_op_exec(npu_input1, npu_input2, "none") self.assertRtolEqual(cpu_output, npu_output) def test_soft_margin_loss_float32_sum(self, device): @@ -123,5 +123,4 @@ class TestSoftMarginLoss(TestCase): instantiate_device_type_tests(TestSoftMarginLoss, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") - run_tests() \ No newline at end of file + run_tests() diff --git a/test/test_npu/test_softmax_backward.py b/test/test_npu/test_network_ops/test_softmax_backward.py similarity index 100% rename from test/test_npu/test_softmax_backward.py rename to test/test_npu/test_network_ops/test_softmax_backward.py diff --git a/test/test_npu/test_softshrink.py b/test/test_npu/test_network_ops/test_softshrink.py similarity index 99% rename from test/test_npu/test_softshrink.py rename to test/test_npu/test_network_ops/test_softshrink.py index 7bbb839f9ea5b317fd9090b6e211006891a7b21b..601bad8486f94b8ca8bfe0d4d736ab27ad96cbfe 100644 --- a/test/test_npu/test_softshrink.py +++ b/test/test_npu/test_network_ops/test_softshrink.py @@ -100,5 +100,4 @@ class TestSoftShrink(TestCase): instantiate_device_type_tests(TestSoftShrink, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_softshrink_backward.py b/test/test_npu/test_network_ops/test_softshrink_backward.py similarity index 98% rename from test/test_npu/test_softshrink_backward.py rename to test/test_npu/test_network_ops/test_softshrink_backward.py index 0681f8658f24c4d7c98ba838d656619c51e6ec3b..5abd88823ac72648f427f0b54ac76dd00450c1ff 100644 --- a/test/test_npu/test_softshrink_backward.py +++ b/test/test_npu/test_network_ops/test_softshrink_backward.py @@ -74,5 +74,4 @@ class TestSoftShrinkBackward(TestCase): instantiate_device_type_tests(TestSoftShrinkBackward, globals(), except_for='cpu') if 
__name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_split_with_sizes.py b/test/test_npu/test_network_ops/test_split_with_sizes.py similarity index 99% rename from test/test_npu/test_split_with_sizes.py rename to test/test_npu/test_network_ops/test_split_with_sizes.py index 6cae3f107c80eab6a42c54b692629b1c4637fddb..d5226c97411f131e717abc22e12c0b3d226b34e1 100644 --- a/test/test_npu/test_split_with_sizes.py +++ b/test/test_npu/test_network_ops/test_split_with_sizes.py @@ -79,5 +79,4 @@ class Test_split_with_sizes(TestCase): instantiate_device_type_tests(Test_split_with_sizes, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:5") run_tests() diff --git a/test/test_npu/test_square.py b/test/test_npu/test_network_ops/test_square.py similarity index 100% rename from test/test_npu/test_square.py rename to test/test_npu/test_network_ops/test_square.py diff --git a/test/test_npu/test_network_ops/test_std.py b/test/test_npu/test_network_ops/test_std.py index 2d5b70442d1559697662568f1bdf44eb87f0c806..dc04ae778a9185bad8d796f9ba01a3cf49767af7 100644 --- a/test/test_npu/test_network_ops/test_std.py +++ b/test/test_npu/test_network_ops/test_std.py @@ -16,7 +16,7 @@ import torch import numpy as np - +import random from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor @@ -159,6 +159,17 @@ class TestStd(TestCase): cpu_output1 = cpu_output1.astype(np.float16) self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002) + random_outputshape = [random.randint(1, 100)] + cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape,item[1],item[0]) + if item[0] == np.float16: + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output = cpu_output.to(torch.float32) + cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5]) + 
npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5]) + if item[0] == np.float16: + cpu_output1 = cpu_output1.astype(np.float16) + self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002) + def test_std_dim_out_shape_format_fp32(self, device): format_list = [0] shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]] @@ -177,6 +188,12 @@ class TestStd(TestCase): npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5]) self.assertRtolEqual(cpu_output1, npu_output1) + random_outputshape = [random.randint(1, 100)] + cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape, item[1], item[0]) + cpu_output2 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output.clone(), item[4], item[5]) + npu_output2 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5]) + self.assertRtolEqual(cpu_output2, npu_output2) + def test_std_dim_name_fp16(self, device): shape = (1024, 8, 32) cpu_input = torch.rand(shape, dtype=torch.float32) @@ -229,6 +246,40 @@ class TestStd(TestCase): npu_output = torch.std(npu_input, dim=dim,out=npu_output) self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy()) + def test_std_n_dim_shape_format_fp16(self, device): + format_list = [0] + shape_list = [[128, 32, 8, 1023]] + dim_list = [(3, 1)] + unbiased_list = [True, False] + keepdim_list = [True, False] + shape_format = [ + [np.float16, i, j, k, l, m] for i in format_list for j in shape_list + for k in dim_list for l in unbiased_list for m in keepdim_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_input1 = cpu_input1.to(torch.float32) + cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5]) + cpu_output1 = cpu_output1.astype(np.float16) + npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5]) + self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.003) + + def 
test_std_n_dim_shape_format_fp32(self, device): + format_list = [0] + shape_list = [[128, 32, 8, 1023]] + dim_list = [(3, 1)] + unbiased_list = [True, False] + keepdim_list = [True, False] + shape_format = [ + [np.float32, i, j, k, l, m] for i in format_list for j in shape_list + for k in dim_list for l in unbiased_list for m in keepdim_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item, 0, 100) + cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5]) + npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5]) + self.assertRtolEqual(cpu_output1, npu_output1) + instantiate_device_type_tests(TestStd, globals(), except_for="cpu") if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_sum_to_size.py b/test/test_npu/test_network_ops/test_sum_to_size.py similarity index 98% rename from test/test_npu/test_sum_to_size.py rename to test/test_npu/test_network_ops/test_sum_to_size.py index 1820b9d95962034f273cd50a307018c4701c8374..b7316671611250cc4b845047f00f4825fda60034 100644 --- a/test/test_npu/test_sum_to_size.py +++ b/test/test_npu/test_network_ops/test_sum_to_size.py @@ -64,5 +64,4 @@ class TestSumToSize(TestCase): instantiate_device_type_tests(TestSumToSize, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() diff --git a/test/test_npu/test_take.py b/test/test_npu/test_network_ops/test_take.py similarity index 100% rename from test/test_npu/test_take.py rename to test/test_npu/test_network_ops/test_take.py diff --git a/test/test_npu/test_tan.py b/test/test_npu/test_network_ops/test_tan.py similarity index 100% rename from test/test_npu/test_tan.py rename to test/test_npu/test_network_ops/test_tan.py diff --git a/test/test_npu/test_tensor_npu.py b/test/test_npu/test_network_ops/test_tensor_npu.py similarity index 84% rename from test/test_npu/test_tensor_npu.py rename to test/test_npu/test_network_ops/test_tensor_npu.py index 
abe6ce9b020af4fbfa8c26ab6322e8608a86b172..e0fdf11d358bf61bc29cd61188829017c8155b7f 100644 --- a/test/test_npu/test_tensor_npu.py +++ b/test/test_npu/test_network_ops/test_tensor_npu.py @@ -16,8 +16,8 @@ import torch import numpy as np -from torch.testing._internal.common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests +from common_utils import TestCase, run_tests +from common_device_type import instantiate_device_type_tests from util_test import create_common_tensor class TestTensorNpu(TestCase): @@ -27,7 +27,7 @@ class TestTensorNpu(TestCase): return output def npu_op_exec(self, input): - output = torch.npu() + output = input.npu() output = output.to("cpu") return output @@ -37,11 +37,11 @@ class TestTensorNpu(TestCase): return output def npu_type_exec(self, input): - output = torch.npu() + output = input.npu() output = output.is_npu return output - def test_tensor_npu_shape_format(self): + def test_tensor_npu_shape_format(self, device): shape_format = [ [np.float32, 0, 1], [np.float32, 0, (64, 10)], @@ -53,9 +53,9 @@ class TestTensorNpu(TestCase): cpu_input, npu_input = create_common_tensor(item, 1, 100) cpu_output = self.cpu_op_exec(cpu_input) npu_output = self.npu_op_exec(npu_input) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output.cpu()) - def test_is_npu_shape_format(self): + def test_is_npu_shape_format(self, device): shape_format = [ [np.float32, 0, 1], [np.float32, 0, (64, 10)], @@ -70,5 +70,6 @@ class TestTensorNpu(TestCase): self.assertEqual(cpu_output, False) self.assertEqual(npu_output, True) +instantiate_device_type_tests(TestTensorNpu, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_thnn_conv_depthwise2d_backward.py b/test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_backward.py similarity index 100% rename from test/test_npu/test_thnn_conv_depthwise2d_backward.py rename to 
test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_backward.py diff --git a/test/test_npu/test_thnn_conv_depthwise2d_forward.py b/test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_forward.py similarity index 100% rename from test/test_npu/test_thnn_conv_depthwise2d_forward.py rename to test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_forward.py diff --git a/test/test_npu/test_threshold_grad_v2_d.py b/test/test_npu/test_network_ops/test_threshold_grad_v2_d.py similarity index 99% rename from test/test_npu/test_threshold_grad_v2_d.py rename to test/test_npu/test_network_ops/test_threshold_grad_v2_d.py index f4307075bb0e9bc0b1915515fea7a4ab6f7523a6..d5baa926e98c387e1207dd934692c400b990084f 100644 --- a/test/test_npu/test_threshold_grad_v2_d.py +++ b/test/test_npu/test_network_ops/test_threshold_grad_v2_d.py @@ -79,5 +79,4 @@ class TestThresholdGradV2DBackward(TestCase): instantiate_device_type_tests(TestThresholdGradV2DBackward, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() diff --git a/test/test_npu/test_threshold_v2_d.py b/test/test_npu/test_network_ops/test_threshold_v2_d.py similarity index 100% rename from test/test_npu/test_threshold_v2_d.py rename to test/test_npu/test_network_ops/test_threshold_v2_d.py diff --git a/test/test_npu/test_trapz_dx.py b/test/test_npu/test_network_ops/test_trapz_dx.py similarity index 96% rename from test/test_npu/test_trapz_dx.py rename to test/test_npu/test_network_ops/test_trapz_dx.py index 900d890c4c4848a464ec32ef4787ff32a8a9db9f..c2e2d4aa9c941cf0a2c0ff491728e1e22076da3e 100644 --- a/test/test_npu/test_trapz_dx.py +++ b/test/test_npu/test_network_ops/test_trapz_dx.py @@ -20,7 +20,7 @@ import sys import copy from common_utils import TestCase, run_tests from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor,compare_res_new +from util_test import create_common_tensor class TestTrapzDx(TestCase): @@ 
-76,5 +76,4 @@ class TestTrapzDx(TestCase): instantiate_device_type_tests(TestTrapzDx, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_trapz_x.py b/test/test_npu/test_network_ops/test_trapz_x.py similarity index 99% rename from test/test_npu/test_trapz_x.py rename to test/test_npu/test_network_ops/test_trapz_x.py index 2be857a74fe85de99f0f7a828636fea9cd3457cf..84e563bb56c0a174a25293d76ee6640886a9efa4 100644 --- a/test/test_npu/test_trapz_x.py +++ b/test/test_npu/test_network_ops/test_trapz_x.py @@ -88,5 +88,4 @@ class TestTrapzX(TestCase): instantiate_device_type_tests(TestTrapzX, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:2") run_tests() diff --git a/test/test_npu/test_true_divide.py b/test/test_npu/test_network_ops/test_true_divide.py similarity index 99% rename from test/test_npu/test_true_divide.py rename to test/test_npu/test_network_ops/test_true_divide.py index 4beaf0cd357a3e8fc2a3d1bebd75b506ad2c47de..24f06fe4b555d5b73cfdff84cb3ac1890354a18e 100644 --- a/test/test_npu/test_true_divide.py +++ b/test/test_npu/test_network_ops/test_true_divide.py @@ -128,5 +128,4 @@ class TestTrueDivide(TestCase): instantiate_device_type_tests(TestTrueDivide, globals() , except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:7") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_trunc.py b/test/test_npu/test_network_ops/test_trunc.py similarity index 98% rename from test/test_npu/test_trunc.py rename to test/test_npu/test_network_ops/test_trunc.py index 806c9f6aa1dbaec3dae7348abb105cdd545da940..953a850bc57647c714e725ad08b03c7600f20da5 100644 --- a/test/test_npu/test_trunc.py +++ b/test/test_npu/test_network_ops/test_trunc.py @@ -79,7 +79,6 @@ class TestTrunc(TestCase): instantiate_device_type_tests(TestTrunc, globals(), except_for='cpu') if __name__=="__main__": - 
torch.npu.set_device("npu:7") run_tests() diff --git a/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py b/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py index 8b344664401627b46ea28057c1633fd8b5844c3d..0493dc0f6664b6e803f288f59693ccb25e28c968 100644 --- a/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py +++ b/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py @@ -61,7 +61,6 @@ class TestUnbindToContiguous(TestCase): cpu_time += cpu_end - cpu_start npu_time += npu_end - npu_start self.assertRtolEqual(cpu_output, npu_output) - self.assertTrue(npu_time < 15, f"execute time:{npu_time:.2f}s should be less than 15s") print(f"unbind to contiguous use: {cpu_time:.5f} s (CPU)") print(f"unbind to contiguous use: {npu_time:.5f} s (NPU)") print(f"TBE Ops used: Slice") diff --git a/test/test_npu/test_upsample_bilinear2d.py b/test/test_npu/test_network_ops/test_upsample_bilinear2d.py similarity index 98% rename from test/test_npu/test_upsample_bilinear2d.py rename to test/test_npu/test_network_ops/test_upsample_bilinear2d.py index 22c597f833c681003728e2db93c768130ae06ec2..c3e30ede9d18d4a653586a5ad180efc83a439ad9 100644 --- a/test/test_npu/test_upsample_bilinear2d.py +++ b/test/test_npu/test_network_ops/test_upsample_bilinear2d.py @@ -54,5 +54,4 @@ class TestUpsampleBilinear2d(TestCase): instantiate_device_type_tests(TestUpsampleBilinear2d, globals(), except_for='cpu') if __name__ == "__main__": - torch.npu.set_device("npu:7") run_tests() \ No newline at end of file diff --git a/test/test_npu/test_upsample_linear1d.py b/test/test_npu/test_network_ops/test_upsample_linear1d.py similarity index 99% rename from test/test_npu/test_upsample_linear1d.py rename to test/test_npu/test_network_ops/test_upsample_linear1d.py index 982b1a6eeb5e6285e6156493446e5d600b9e38c7..e2e7478b7c79b9cff3ec029fe64f4b65047e8200 100644 --- a/test/test_npu/test_upsample_linear1d.py +++ b/test/test_npu/test_network_ops/test_upsample_linear1d.py @@ -106,5 
+106,4 @@ class TestUpsampleLinear1D(TestCase): instantiate_device_type_tests(TestUpsampleLinear1D, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:3") run_tests() diff --git a/test/test_npu/test_upsample_nearest2d.py b/test/test_npu/test_network_ops/test_upsample_nearest2d.py similarity index 100% rename from test/test_npu/test_upsample_nearest2d.py rename to test/test_npu/test_network_ops/test_upsample_nearest2d.py diff --git a/test/test_npu/test_upsample_nearest2d_backward.py b/test/test_npu/test_network_ops/test_upsample_nearest2d_backward.py similarity index 100% rename from test/test_npu/test_upsample_nearest2d_backward.py rename to test/test_npu/test_network_ops/test_upsample_nearest2d_backward.py diff --git a/test/test_npu/test_network_ops/test_upsample_nearest3d.py b/test/test_npu/test_network_ops/test_upsample_nearest3d.py new file mode 100644 index 0000000000000000000000000000000000000000..6aae87d2d9c245c18e84ba2ef25ddf29d298bb7d --- /dev/null +++ b/test/test_npu/test_network_ops/test_upsample_nearest3d.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import torch.nn.functional as F +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +# 3d need input's dim is 5 +class TestUpsamleNearest3D(TestCase): + def cpu_op_exec(self, input, size): + output = torch.nn.functional.interpolate(input, size, mode="nearest") + output = output.numpy() + return output + + def npu_op_exec(self, input, size): + output = torch.nn.functional.interpolate(input, size, mode="nearest") + output = output.to("cpu") + output = output.numpy() + return output + + def cpu_op_scale_exec(self, input, size): + output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest") + output = output.numpy() + return output + + def npu_op_scale_exec(self, input, size): + output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest") + output = output.to("cpu") + output = output.numpy() + return output + + def test_upsample_nearest3d_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]], + [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]], + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 50) + if cpu_input == torch.float16: + cpu_input = cpu_input.to(torch.float32) + + size = item[1] + cpu_output = self.cpu_op_exec(cpu_input, size) + npu_output = self.npu_op_exec(npu_input, size) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_upsample_nearest3d_shape_format_scale(self, device): + shape_format = [ + [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]], + [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]], + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 50) + if cpu_input == torch.float16: + cpu_input = cpu_input.to(torch.float32) + + size = item[1] + cpu_output = 
self.cpu_op_scale_exec(cpu_input, size) + npu_output = self.npu_op_scale_exec(npu_input, size) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestUpsamleNearest3D, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py b/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..3d303af74f5b24509c0993923da75f1d272aecaf --- /dev/null +++ b/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import torch.nn.functional as F +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +# 3d need input's dim is 5 +class TestUpsamleNearest3DBackward(TestCase): + def cpu_op_exec(self, input, size): + input.requires_grad_(True) + output = torch.nn.functional.interpolate(input, size, mode="nearest") + output.sum().backward() + output = input.grad.numpy() + return output + + def npu_op_exec(self, input, size): + input.requires_grad_(True) + output = torch.nn.functional.interpolate(input, size, mode="nearest") + output.sum().backward() + output = input.grad.to("cpu") + output = output.numpy() + return output + + def cpu_op_scale_exec(self, input, size): + input.requires_grad_(True) + output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest") + output.sum().backward() + output = input.grad.numpy() + return output + + def npu_op_scale_exec(self, input, size): + input.requires_grad_(True) + output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest") + output.sum().backward() + output = input.grad.to("cpu") + output = output.numpy() + return output + + def test_upsample_nearest3d_backward_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]], + [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]], + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 50) + if cpu_input == torch.float16: + cpu_input = cpu_input.to(torch.float32) + + size = item[1] + cpu_output = self.cpu_op_exec(cpu_input, size) + npu_output = self.npu_op_exec(npu_input, size) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + def test_upsample_nearest3d_backward_shape_format_scale(self, device): + shape_format = [ + [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]], + [[np.float32, -1, 
(2, 3, 6, 2, 4)], [10, 10, 10]], + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 50) + if cpu_input == torch.float16: + cpu_input = cpu_input.to(torch.float32) + + size = item[1] + cpu_output = self.cpu_op_scale_exec(cpu_input, size) + npu_output = self.npu_op_scale_exec(npu_input, size) + cpu_output = cpu_output.astype(npu_output.dtype) + self.assertRtolEqual(cpu_output, npu_output) + + +instantiate_device_type_tests(TestUpsamleNearest3DBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_network_ops/test_upsample_trilinear3d.py b/test/test_npu/test_network_ops/test_upsample_trilinear3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdb4a5907dac56c2d6c58ec8777e7981ad3458c --- /dev/null +++ b/test/test_npu/test_network_ops/test_upsample_trilinear3d.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# The 3D upsample ops need a 5-D input tensor; these tests compare CPU and NPU forward results.
+class TestUpsamleTrilinear3D(TestCase):
+    def cpu_op_exec(self, input, size):  # trilinear interpolation to `size`, returned as numpy
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, size):  # same computation as cpu_op_exec, on the tensor passed in
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output = output.to("cpu")  # bring the result to host before .numpy()
+        output = output.numpy()
+        return output
+
+    def cpu_op_scale_exec(self, input, size):  # here `size` is forwarded as scale_factor
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output = output.numpy()
+        return output
+
+    def npu_op_scale_exec(self, input, size):  # here `size` is forwarded as scale_factor
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output = output.to("cpu")  # bring the result to host before .numpy()
+        output = output.numpy()
+        return output
+
+    def test_upsample_trilinear3d_shape_format(self, device):
+        shape_format = [  # each item: [tensor spec passed to create_common_tensor, target size]
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            if cpu_input.dtype == torch.float16:  # compare the dtype, not the tensor itself
+                cpu_input = cpu_input.to(torch.float32)
+
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)  # compare in the NPU result's dtype
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_trilinear3d_shape_format_scale(self, device):
+        shape_format = [  # each item: [tensor spec passed to create_common_tensor, scale factors]
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            if cpu_input.dtype == torch.float16:  # compare the dtype, not the tensor itself
+                cpu_input = cpu_input.to(torch.float32)
+
+            size = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, size)
+            npu_output = self.npu_op_scale_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)  # compare in the NPU result's dtype
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsamleTrilinear3D, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py b/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b9c92b2cfdfd15c448a82ed7550f3e6124aeec
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# The 3D upsample ops need a 5-D input tensor; these tests compare CPU and NPU input gradients.
+class TestUpsamleTrilinear3DBackward(TestCase):
+    def cpu_op_exec(self, input, size):  # gradient of sum(interpolate(input, size)) w.r.t. input, as numpy
+        input.requires_grad_(True)  # so that backward() populates input.grad
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output.sum().backward()
+        output = input.grad.numpy()
+        return output
+
+    def npu_op_exec(self, input, size):  # same computation as cpu_op_exec, on the tensor passed in
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output.sum().backward()
+        output = input.grad.to("cpu")  # bring the gradient to host before .numpy()
+        output = output.numpy()
+        return output
+
+    def cpu_op_scale_exec(self, input, size):  # here `size` is forwarded as scale_factor
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output.sum().backward()
+        output = input.grad.numpy()
+        return output
+
+    def npu_op_scale_exec(self, input, size):  # here `size` is forwarded as scale_factor
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output.sum().backward()
+        output = input.grad.to("cpu")  # bring the gradient to host before .numpy()
+        output = output.numpy()
+        return output
+
+    def test_upsample_trilinear3d_backward_shape_format(self, device):
+        shape_format = [  # each item: [tensor spec passed to create_common_tensor, target size]
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            if cpu_input.dtype == torch.float16:  # compare the dtype, not the tensor itself
+                cpu_input = cpu_input.to(torch.float32)
+
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)  # compare in the NPU result's dtype
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_trilinear3d_backward_shape_format_scale(self, device):
+        shape_format = [  # each item: [tensor spec passed to create_common_tensor, scale factors]
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            if cpu_input.dtype == torch.float16:  # compare the dtype, not the tensor itself
+                cpu_input = cpu_input.to(torch.float32)
+
+            size = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, size)
+            npu_output = self.npu_op_scale_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)  # compare in the NPU result's dtype
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestUpsamleTrilinear3DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_trace.py b/test/test_npu/test_trace.py
deleted file mode 100644
index d4cf66f10a1ef3f51057d0095170fe20917b4a2c..0000000000000000000000000000000000000000
--- a/test/test_npu/test_trace.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import torch -import copy -import torch.nn as nn -import numpy as np -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -LOWER = 0 -UPPER = 2 -INT_UPPER = 5 - - -class TestTrace(TestCase): - - def generate_one_input(self, lower, upper, shape, dtype): - input1 = np.random.uniform(lower, upper, shape).astype(dtype) - npu_input1 = torch.from_numpy(input1) - return npu_input1 - - - def cpu_op_exec(self, input1): - res = torch.trace(input1) - return res.numpy() - - - def cpu_op_exec_half(self, input1): - res = torch.trace(input1) - return res.type(torch.float16).numpy() - - - def npu_op_exec(self, input1): - input1 = input1.to("npu") - res = torch.trace(input1) - res = res.to("cpu") - return res.numpy() - - - def test_trace_float32(self, device): - for shape in [(10, 10), (10, 11), (11, 10)]: - input1 = generate_one_input(LOWER, UPPER, shape, np.float32) - cpu_output = cpu_op_exec(input1) - npu_output = npu_op_exec(input1) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_trace_float16(self, device): - shape = (10, 10) - input1 = generate_one_input(LOWER, UPPER, shape, np.float16) - cpu_output = cpu_op_exec_half(input1.type(torch.float32)) - npu_output = npu_op_exec(input1) - self.assertRtolEqual(cpu_output, npu_output) - - - def test_trace_int(self, device): - for shape, dtype in [ - ((10, 10), np.uint8), - ((10, 10), np.int8), - ((10, 10), np.int32) - ]: - input1 = np.random.randint(LOWER, INT_UPPER, shape, dtype) - input1 = torch.from_numpy(input1) - cpu_output = torch.trace(input1).numpy().astype(np.int32) - input_npu = input1.to("npu") - npu_output = torch.trace(input_npu) - npu_output = npu_output.to("cpu").numpy().astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - -instantiate_device_type_tests(TestTrace, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:3") - run_tests()