diff --git a/patch/npu.patch b/patch/npu.patch index a46a27034ab9d7433a4e60c9d68d86898a340dab..4e01faf27cccb57db1074605f20a5c9883360123 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,6 +1,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt --- pytorch-v1.5.0/aten/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 ++++ pytorch-develop/aten/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 @@ -22,8 +22,10 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -51,7 +51,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 ++++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 @@ -67,6 +67,9 @@ FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") @@ -129,7 +129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-05 14:59:26.424336365 +0800 ++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-09 17:16:47.794790202 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -170,7 +170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-05 14:59:26.432336426 +0800 ++++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-09 17:16:47.802790488 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -355,7 +355,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for option in declaration['options']: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py --- pytorch-v1.5.0/aten/src/ATen/gen.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/gen.py 2021-07-05 14:59:26.432336426 +0800 ++++ pytorch-develop/aten/src/ATen/gen.py 2021-07-09 17:16:47.802790488 +0800 @@ -1,3 +1,18 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -513,7 +513,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= generate_outputs() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-09 17:16:47.814790918 +0800 @@ -339,20 +339,20 @@ void hardsigmoid_backward_kernel(TensorIterator& iter) { @@ -541,7 +541,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-05 14:59:26.440336488 +0800 ++++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-09 17:16:47.806790632 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -596,7 +596,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= detail::computeStorageSize(self.sizes(), self.strides()), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-05 14:59:26.460336640 +0800 ++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-09 17:16:47.830791493 +0800 @@ -1,6 +1,5 @@ # See README.md in this directory for more guidance @@ -2324,16 +2324,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor requires_tensor: True -@@ -1801,6 +2302,8 @@ - requires_tensor: True - dispatch: - QuantizedCPU: quantized_max_pool2d -+ npu_dispatch: -+ NPU: quantized_max_pool2d_npu - - - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - supports_named_tensor: True -@@ -1814,6 +2317,8 @@ +@@ -1814,6 +2315,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2342,7 +2333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method -@@ -1822,6 +2327,8 @@ +@@ -1822,6 +2325,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2351,7 +2342,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -1829,47 +2336,73 @@ +@@ -1829,47 +2334,73 @@ CPU: mean_out_cpu_gpu CUDA: mean_out_cpu_gpu QuantizedCPU: quantized_mean_out_cpu @@ -2425,7 +2416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor -@@ -1958,6 +2491,8 @@ +@@ -1958,6 +2489,8 @@ CUDA: legacy::cuda::_th_mm SparseCPU: _sparse_mm SparseCUDA: _sparse_mm @@ -2434,7 +2425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) -@@ -1966,6 +2501,8 @@ +@@ -1966,6 +2499,8 @@ CUDA: legacy::cuda::_th_mm_out SparseCPU: _sparse_mm_out SparseCUDA: _sparse_mm_out @@ -2443,21 +2434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor -@@ -1974,9 +2511,13 @@ - - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - supports_named_tensor: True - variants: function, method -+ npu_dispatch: -+ NPU: mode_npu - - - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) - supports_named_tensor: True -+ npu_dispatch: -+ NPU: mode_out_npu - - - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - variants: function, method -@@ -1994,6 +2535,8 @@ +@@ -1994,6 +2529,8 @@ SparseCPU: mul_sparse SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul @@ -2466,7 +2443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2004,6 +2547,8 @@ +@@ -2004,6 +2541,8 @@ SparseCPU: mul_sparse_ SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ @@ -2475,7 +2452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -@@ -2013,15 +2558,21 @@ +@@ -2013,15 +2552,21 @@ SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -2497,7 +2474,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full -@@ -2030,12 +2581,16 @@ +@@ -2030,12 +2575,16 @@ CPU: mv_cpu CUDA: legacy::cuda::_th_mv supports_named_tensor: True @@ -2514,7 +2491,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mvlgamma(Tensor self, int p) -> Tensor use_c10_dispatcher: full -@@ -2052,6 +2607,8 @@ +@@ -2052,6 +2601,8 @@ CUDA: narrow_copy_dense SparseCPU: narrow_copy_sparse SparseCUDA: narrow_copy_sparse @@ -2523,7 +2500,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) variants: function, method -@@ -2068,6 +2625,8 @@ +@@ -2068,6 +2619,8 @@ CPU: batch_norm_cpu CUDA: batch_norm_cuda MkldnnCPU: mkldnn_batch_norm @@ -2532,7 +2509,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: -@@ -2098,6 +2657,8 @@ +@@ -2098,6 +2651,8 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda @@ -2541,7 +2518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: -@@ -2117,6 +2678,8 @@ +@@ -2117,6 +2672,8 @@ - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor variants: function @@ -2550,7 +2527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) variants: function -@@ -2129,42 +2692,60 @@ +@@ -2129,42 +2686,60 @@ - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor device_guard: False @@ -2613,7 +2590,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. -@@ -2253,54 +2834,82 @@ +@@ -2253,54 +2828,82 @@ supports_named_tensor: True - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2697,7 +2674,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor use_c10_dispatcher: full -@@ -2316,6 +2925,8 @@ +@@ -2316,6 +2919,8 @@ - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2706,7 +2683,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reshape(Tensor self, int[] shape) -> Tensor variants: function, method -@@ -2337,16 +2948,22 @@ +@@ -2337,16 +2942,22 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2729,7 +2706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor -@@ -2360,6 +2977,8 @@ +@@ -2360,6 +2971,8 @@ CUDA: relu MkldnnCPU: mkldnn_relu QuantizedCPU: quantized_relu @@ -2738,7 +2715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: relu_(Tensor(a!) self) -> Tensor(a!) -@@ -2370,6 +2989,8 @@ +@@ -2370,6 +2983,8 @@ CUDA: relu_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: quantized_relu_ @@ -2747,7 +2724,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -2377,12 +2998,16 @@ +@@ -2377,12 +2992,16 @@ dispatch: CPU: prelu_cpu CUDA: prelu_cuda @@ -2764,7 +2741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2390,6 +3015,8 @@ +@@ -2390,6 +3009,8 @@ dispatch: CPU: gelu_cpu CUDA: gelu_cuda @@ -2773,7 +2750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu_backward(Tensor grad, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2397,29 +3024,41 @@ +@@ -2397,29 +3018,41 @@ dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda @@ -2815,7 +2792,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method -@@ -2433,14 +3072,21 @@ +@@ -2433,14 +3066,21 @@ - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2838,7 +2815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2451,6 +3097,8 @@ +@@ -2451,6 +3091,8 @@ CUDA: sigmoid QuantizedCPU: quantized_sigmoid MkldnnCPU: mkldnn_sigmoid @@ -2847,7 +2824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) 
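-> Tensor(a!)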
supports_named_tensor: True -@@ -2459,36 +3107,52 @@ +@@ -2459,36 +3101,52 @@ CPU: sigmoid_ CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ @@ -2900,7 +2877,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. -@@ -2533,6 +3197,8 @@ +@@ -2533,6 +3191,8 @@ - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) variants: function, method @@ -2909,7 +2886,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: smm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full -@@ -2542,10 +3208,14 @@ +@@ -2542,10 +3202,14 @@ - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2924,7 +2901,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor use_c10_dispatcher: full -@@ -2553,12 +3223,16 @@ +@@ -2553,12 +3217,16 @@ CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax @@ -2941,7 +2918,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] variants: function, method -@@ -2609,8 +3283,12 @@ +@@ -2609,8 +3277,12 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor @@ -2954,7 +2931,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at -@@ -2633,20 +3311,30 @@ +@@ -2633,20 +3305,30 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2985,7 +2962,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sum_to_size(Tensor self, int[] size) -> Tensor variants: method -@@ -2656,13 +3344,19 @@ +@@ -2656,13 +3338,19 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3005,7 +2982,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: square(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2677,51 +3371,81 @@ +@@ -2677,51 +3365,81 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3088,7 +3065,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: t(Tensor(a) self) -> Tensor(a) device_guard: False -@@ -2736,6 +3460,8 @@ +@@ -2736,6 +3454,8 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3097,7 +3074,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tan_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2743,12 +3469,16 @@ +@@ -2743,12 +3463,16 @@ dispatch: CPU: _tan__cpu CUDA: _tan__cuda @@ -3114,7 +3091,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2758,6 +3488,8 @@ +@@ -2758,6 +3482,8 @@ CPU: tanh CUDA: tanh QuantizedCPU: quantized_tanh @@ -3123,7 +3100,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh_(Tensor(a!) self) -> Tensor(a!) 
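-> Tensor(a!)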
supports_named_tensor: True -@@ -2765,12 +3497,16 @@ +@@ -2765,12 +3491,16 @@ dispatch: CPU: _tanh__cpu CUDA: _tanh__cuda @@ -3140,7 +3117,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function -@@ -2783,6 +3519,8 @@ +@@ -2783,6 +3513,8 @@ dispatch: CPU: threshold CUDA: threshold_cuda @@ -3149,7 +3126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) variants: function -@@ -2790,12 +3528,16 @@ +@@ -2790,12 +3522,16 @@ dispatch: CPU: threshold_ CUDA: threshold__cuda @@ -3166,7 +3143,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor use_c10_dispatcher: full -@@ -2803,6 +3545,8 @@ +@@ -2803,6 +3539,8 @@ dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda @@ -3175,7 +3152,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) variants: function, method -@@ -2835,18 +3579,24 @@ +@@ -2835,18 +3573,24 @@ use_c10_dispatcher: full python_module: nn variants: function @@ -3200,7 +3177,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args -@@ -2872,6 +3622,8 @@ +@@ -2872,6 +3616,8 @@ CUDA: true_divide SparseCPU: true_divide_sparse SparseCUDA: true_divide_sparse @@ -3209,7 +3186,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2881,6 +3633,8 @@ +@@ -2881,6 +3627,8 @@ CUDA: true_divide_ SparseCPU: true_divide_sparse_ SparseCUDA: true_divide_sparse_ @@ -3218,7 +3195,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
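-> Tensor(a!)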
-@@ -2889,31 +3643,43 @@ +@@ -2889,31 +3637,43 @@ CUDA: true_divide_out SparseCPU: true_divide_out_sparse_zerodim SparseCUDA: true_divide_out_sparse_zerodim @@ -3262,7 +3239,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -2956,6 +3722,8 @@ +@@ -2956,6 +3716,8 @@ dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda @@ -3271,7 +3248,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _unsafe_view(Tensor self, int[] size) -> Tensor -@@ -2971,32 +3739,48 @@ +@@ -2971,32 +3733,48 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3320,7 +3297,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: view_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -3009,13 +3793,19 @@ +@@ -3009,13 +3787,19 @@ - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method @@ -3340,7 +3317,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function -@@ -3041,13 +3831,21 @@ +@@ -3041,13 +3825,21 @@ - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -3362,7 +3339,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full -@@ -3100,25 +3898,37 @@ +@@ -3100,25 +3892,37 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor dispatch: @@ -3402,7 +3379,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method -@@ -3162,12 +3972,16 @@ +@@ -3162,12 +3966,16 @@ SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone QuantizedCPU: quantized_clone @@ -3419,7 +3396,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -3176,6 +3990,8 @@ +@@ -3176,6 +3984,8 @@ CUDA: pow_out SparseCPU: pow_out_sparse_scalar SparseCUDA: pow_out_sparse_scalar @@ -3428,7 +3405,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full -@@ -3186,6 +4002,8 @@ +@@ -3186,6 +3996,8 @@ CUDA: pow SparseCPU: pow_sparse_scalar SparseCUDA: pow_sparse_scalar @@ -3437,7 +3414,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: zero_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -3196,6 +4014,14 @@ +@@ -3196,6 +4008,14 @@ SparseCPU: zero_sparse_ SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ @@ -3452,7 +3429,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
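-> Tensor(a!)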
dispatch: -@@ -3204,6 +4030,8 @@ +@@ -3204,6 +4024,8 @@ SparseCPU: sub_out_sparse SparseCUDA: sub_out_sparse supports_named_tensor: True @@ -3461,7 +3438,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -3213,6 +4041,8 @@ +@@ -3213,6 +4035,8 @@ CUDA: sub SparseCPU: sub_sparse SparseCUDA: sub_sparse @@ -3470,7 +3447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) -@@ -3222,6 +4052,8 @@ +@@ -3222,6 +4046,8 @@ CUDA: sub_ SparseCPU: sub_sparse_ SparseCUDA: sub_sparse_ @@ -3479,7 +3456,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True # For C++ only, until we have conversion from C++ numbers to Tensor -@@ -3229,21 +4061,29 @@ +@@ -3229,21 +4055,29 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3509,7 +3486,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. -@@ -3257,6 +4097,8 @@ +@@ -3257,6 +4091,8 @@ CUDA: legacy::cuda::_th_addmm_out SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda @@ -3518,7 +3495,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor -@@ -3267,6 +4109,8 @@ +@@ -3267,6 +4103,8 @@ CUDA: legacy::cuda::_th_addmm SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda @@ -3527,7 +3504,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) -@@ -3278,9 +4122,10 @@ +@@ -3278,9 +4116,10 @@ # broadcasting SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ @@ -3539,7 +3516,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # NOTE [ Sparse: autograd and API ] # # -@@ -3396,7 +4241,6 @@ +@@ -3396,7 +4235,6 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. @@ -3547,7 +3524,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. 
- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor -@@ -3433,7 +4277,6 @@ +@@ -3433,7 +4271,6 @@ SparseCUDA: sparse_resize_and_clear_ requires_tensor: True @@ -3555,7 +3532,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full variants: method -@@ -3442,7 +4285,6 @@ +@@ -3442,7 +4279,6 @@ SparseCUDA: sparse_mask_cuda requires_tensor: True @@ -3563,7 +3540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3474,7 +4316,6 @@ +@@ -3474,7 +4310,6 @@ requires_tensor: True device_guard: False @@ -3571,7 +3548,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3494,7 +4335,6 @@ +@@ -3494,7 +4329,6 @@ requires_tensor: True device_guard: False @@ -3579,7 +3556,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3504,7 +4344,6 @@ +@@ -3504,7 +4338,6 @@ requires_tensor: True device_guard: False @@ -3587,7 +3564,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: coalesce(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3513,7 +4352,6 @@ +@@ -3513,7 +4346,6 @@ SparseCUDA: coalesce_sparse_cuda requires_tensor: True @@ -3595,7 +3572,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method -@@ -3524,7 +4362,6 @@ +@@ -3524,7 +4356,6 @@ device_guard: False supports_named_tensor: True @@ -3603,7 +3580,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: -@@ -3568,7 +4405,6 @@ +@@ -3568,7 +4399,6 @@ requires_tensor: True device_guard: False @@ -3611,7 +3588,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
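-> Tensor(a!)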
dispatch: SparseCPU: hspmm_out_sparse_cpu -@@ -3630,11 +4466,15 @@ +@@ -3630,11 +4460,15 @@ variants: function dispatch: CPU: quantize_per_tensor_cpu @@ -3627,7 +3604,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dequantize(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -3713,20 +4553,28 @@ +@@ -3713,20 +4547,28 @@ variants: method device_guard: False supports_named_tensor: True @@ -3656,7 +3633,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: meshgrid(Tensor[] tensors) -> Tensor[] -@@ -3765,6 +4613,8 @@ +@@ -3765,6 +4607,8 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -3665,7 +3642,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= variants: function supports_named_tensor: True -@@ -3791,10 +4641,16 @@ +@@ -3791,10 +4635,16 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3682,7 +3659,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -@@ -3839,10 +4695,14 @@ +@@ -3839,10 +4689,14 @@ # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) @@ -3697,7 +3674,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # wrappers for legacy TH methods -@@ -3852,6 +4712,8 @@ +@@ -3852,6 +4706,8 @@ dispatch: CPU: set_ CUDA: set_ @@ -3706,7 +3683,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method -@@ -3860,6 +4722,8 @@ +@@ -3860,6 +4716,8 @@ CPU: legacy::cpu::_th_set_ CUDA: legacy::cuda::_th_set_ QuantizedCPU: set_storage @@ -3715,7 +3692,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method -@@ -3867,12 +4731,16 @@ +@@ -3867,12 +4725,16 @@ dispatch: CPU: set_tensor_ CUDA: set_tensor_ @@ -3732,7 +3709,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) 
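-> Tensor(a!)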
variants: method -@@ -3892,6 +4760,8 @@ +@@ -3892,6 +4754,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3741,7 +3718,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor -@@ -3904,6 +4774,8 @@ +@@ -3904,6 +4768,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3750,7 +3727,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor -@@ -3916,6 +4788,8 @@ +@@ -3916,6 +4782,8 @@ dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda @@ -3759,7 +3736,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor use_c10_dispatcher: full -@@ -3929,25 +4803,35 @@ +@@ -3929,25 +4797,35 @@ CUDA: view MkldnnCPU: mkldnn_view QuantizedCPU: view @@ -3795,7 +3772,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3955,11 +4839,15 @@ +@@ -3955,11 +4833,15 @@ dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ @@ -3811,7 +3788,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) variants: method -@@ -3967,11 +4855,15 @@ +@@ -3967,11 +4849,15 @@ CPU: index_fill_ CUDA: index_fill_ supports_named_tensor: True @@ -3827,7 +3804,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3994,6 +4886,8 @@ +@@ -3994,6 +4880,8 @@ dispatch: CPU: scatter_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3836,7 +3813,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full -@@ -4004,6 +4898,8 @@ +@@ -4004,6 +4892,8 @@ dispatch: CPU: scatter_fill_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3845,7 +3822,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full -@@ -4020,81 +4916,127 @@ +@@ -4020,81 +4910,127 @@ dispatch: CPU: scatter_add_cpu_ CUDA: legacy::cuda::_th_scatter_add_ @@ -3973,7 +3950,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4107,70 +5049,106 @@ +@@ -4107,70 +5043,106 @@ dispatch: CPU: bitwise_or_out CUDA: bitwise_or_out @@ -4080,7 +4057,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4240,18 +5218,24 @@ +@@ -4240,18 +5212,24 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) supports_named_tensor: True variants: method @@ -4105,7 +4082,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: digamma_(Tensor(a!) self) -> Tensor(a!) 
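-> Tensor(a!)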
supports_named_tensor: True -@@ -4266,6 +5250,8 @@ +@@ -4266,6 +5244,8 @@ dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ @@ -4114,7 +4091,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4273,6 +5259,8 @@ +@@ -4273,6 +5253,8 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4123,7 +4100,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4280,53 +5268,71 @@ +@@ -4280,53 +5262,71 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4195,7 +4172,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -4334,28 +5340,40 @@ +@@ -4334,28 +5334,40 @@ dispatch: CPU: legacy::cpu::_th_addbmm CUDA: legacy::cuda::_th_addbmm @@ -4236,7 +4213,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) -@@ -4380,6 +5398,8 @@ +@@ -4380,6 +5392,8 @@ dispatch: CPU: legacy::cpu::_th_diag_out CUDA: legacy::cuda::_th_diag_out @@ -4245,7 +4222,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: diag(Tensor self, int diagonal=0) -> Tensor use_c10_dispatcher: full -@@ -4387,30 +5407,44 @@ +@@ -4387,30 +5401,44 @@ dispatch: CPU: legacy::cpu::_th_diag CUDA: legacy::cuda::_th_diag @@ -4290,7 +4267,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: -@@ -4435,6 +5469,8 @@ +@@ -4435,6 +5463,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4299,7 +4276,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4444,6 +5480,8 @@ +@@ -4444,6 +5474,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4308,7 +4285,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4451,6 +5489,8 @@ +@@ -4451,6 +5483,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4317,7 +4294,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4460,6 +5500,8 @@ +@@ -4460,6 +5494,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4326,7 +4303,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
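-> Tensor(a!)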
supports_named_tensor: True -@@ -4467,6 +5509,8 @@ +@@ -4467,6 +5503,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4335,7 +4312,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4476,6 +5520,8 @@ +@@ -4476,6 +5514,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4344,7 +4321,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4483,6 +5529,8 @@ +@@ -4483,6 +5523,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4353,7 +4330,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4492,6 +5540,8 @@ +@@ -4492,6 +5534,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4362,7 +4339,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4499,6 +5549,8 @@ +@@ -4499,6 +5543,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4371,7 +4348,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4508,6 +5560,8 @@ +@@ -4508,6 +5554,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4380,7 +4357,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4515,6 +5569,8 @@ +@@ -4515,6 +5563,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4389,7 +4366,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4524,6 +5580,8 @@ +@@ -4524,6 +5574,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4398,7 +4375,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4531,6 +5589,8 @@ +@@ -4531,6 +5583,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4407,7 +4384,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4540,6 +5600,8 @@ +@@ -4540,6 +5594,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4416,7 +4393,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
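-> Tensor(a!)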
supports_named_tensor: True -@@ -4547,6 +5609,8 @@ +@@ -4547,6 +5603,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4425,7 +4402,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4556,6 +5620,8 @@ +@@ -4556,6 +5614,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4434,7 +4411,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4563,6 +5629,8 @@ +@@ -4563,6 +5623,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4443,7 +4420,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4572,6 +5640,8 @@ +@@ -4572,6 +5634,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4452,7 +4429,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4579,6 +5649,8 @@ +@@ -4579,6 +5643,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4461,7 +4438,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4588,6 +5660,8 @@ +@@ -4588,6 +5654,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4470,7 +4447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4595,6 +5669,8 @@ +@@ -4595,6 +5663,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4479,7 +4456,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4604,6 +5680,8 @@ +@@ -4604,6 +5674,8 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4488,7 +4465,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
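-> Tensor(a!)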
supports_named_tensor: True -@@ -4611,6 +5689,8 @@ +@@ -4611,6 +5683,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4497,7 +4474,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4620,11 +5700,16 @@ +@@ -4620,11 +5694,16 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4514,7 +4491,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4632,11 +5717,16 @@ +@@ -4632,11 +5711,16 @@ dispatch: CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take @@ -4531,7 +4508,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4646,17 +5736,25 @@ +@@ -4646,17 +5730,25 @@ CUDA: legacy::cuda::_th_index_select SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4557,7 +4534,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_select(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full -@@ -4665,11 +5763,15 @@ +@@ -4665,11 +5757,15 @@ CPU: masked_select_cpu CUDA: masked_select_cuda supports_named_tensor: True @@ -4573,7 +4550,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -4677,6 +5779,8 @@ +@@ -4677,6 +5773,8 @@ dispatch: CPU: legacy::cpu::_th_nonzero CUDA: legacy::cuda::_th_nonzero @@ -4582,7 +4559,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function -@@ -4685,6 +5789,8 @@ +@@ -4685,6 +5783,8 @@ dispatch: CPU: gather_out_cpu CUDA: gather_out_cuda @@ -4591,7 +4568,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor use_c10_dispatcher: full -@@ -4692,34 +5798,50 @@ +@@ -4692,34 +5792,50 @@ dispatch: CPU: gather_cpu CUDA: gather_cuda @@ -4642,7 +4619,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: -@@ -4826,9 +5948,13 @@ +@@ -4826,9 +5942,13 @@ CUDA: legacy::cuda::_th_potri - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) @@ -4656,7 +4633,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor) variants: function -@@ -4891,12 +6017,16 @@ +@@ -4891,12 +6011,16 @@ dispatch: CPU: multinomial_out CUDA: multinomial_out @@ -4673,7 +4650,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) variants: function -@@ -4947,6 +6077,8 @@ +@@ -4947,6 +6071,8 @@ dispatch: CPU: erfinv CUDA: erfinv @@ -4682,7 +4659,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: erfinv_(Tensor(a!) self) -> Tensor(a!) 
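-> Tensor(a!)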
supports_named_tensor: True -@@ -4954,26 +6086,36 @@ +@@ -4954,26 +6080,36 @@ dispatch: CPU: _erfinv__cpu CUDA: _erfinv__cuda @@ -4719,7 +4696,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full -@@ -4981,21 +6123,29 @@ +@@ -4981,21 +6117,29 @@ - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True @@ -4749,7 +4726,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor use_c10_dispatcher: full -@@ -5003,6 +6153,8 @@ +@@ -5003,6 +6147,8 @@ dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar @@ -4758,7 +4735,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -5010,11 +6162,15 @@ +@@ -5010,6 +6156,8 @@ dispatch: CPU: lerp_cpu_tensor CUDA: lerp_cuda_tensor @@ -4767,21 +4744,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_histc_out - CUDA: _histc_out_cuda -+ npu_dispatch: -+ NPU: histc_out_npu - - - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full -@@ -5022,11 +6178,15 @@ - dispatch: - CPU: legacy::cpu::_th_histc - CUDA: _histc_cuda -+ npu_dispatch: -+ NPU: histc_npu - - - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +@@ -5027,6 +6175,8 @@ dispatch: CPU: fmod_out CUDA: legacy::cuda::_th_fmod_out @@ -4790,7 +4753,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5034,11 +6194,15 @@ +@@ -5034,11 +6184,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4806,7 +4769,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5046,11 +6210,15 @@ +@@ -5046,11 +6200,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4822,7 +4785,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5058,11 +6226,15 @@ +@@ -5058,11 +6216,15 @@ dispatch: CPU: remainder CUDA: remainder @@ -4838,7 +4801,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5070,12 +6242,18 @@ +@@ -5070,12 +6232,18 @@ dispatch: CPU: remainder CUDA: remainder @@ -4857,7 +4820,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: min(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5084,13 +6262,19 @@ +@@ -5084,13 +6252,19 @@ CPU: min CUDA: legacy::cuda::_th_min QuantizedCPU: min_quant @@ -4877,7 +4840,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5099,6 +6283,8 @@ +@@ -5099,6 +6273,8 @@ CPU: max CUDA: legacy::cuda::_th_max QuantizedCPU: max_quant @@ -4886,7 +4849,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= supports_named_tensor: True - func: median(Tensor self) -> Tensor -@@ -5107,12 +6293,16 @@ +@@ -5107,12 +6283,16 @@ dispatch: CPU: median_cpu CUDA: median_cuda @@ -4903,7 +4866,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) variants: method, function -@@ -5120,23 +6310,45 @@ +@@ -5120,23 +6300,45 @@ CPU: legacy::cpu::_th_sort CUDA: legacy::cuda::_th_sort QuantizedCPU: sort_quant @@ -4949,7 +4912,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function -@@ -5144,11 +6356,15 @@ +@@ -5144,11 +6346,15 @@ CPU: topk CUDA: topk QuantizedCPU: quantized_topk_cpu @@ -4965,7 +4928,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: any(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5159,11 +6375,15 @@ +@@ -5159,11 +6365,15 @@ CUDA: any SparseCPU: any_sparse SparseCUDA: any_sparse @@ -4981,7 +4944,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor use_c10_dispatcher: full -@@ -5171,6 +6391,8 @@ +@@ -5171,6 +6381,8 @@ dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm @@ -4990,7 +4953,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method -@@ -5178,6 +6400,8 @@ +@@ -5178,6 +6390,8 @@ dispatch: CPU: unfold CUDA: unfold @@ -4999,7 +4962,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full -@@ -5186,6 +6410,8 @@ +@@ -5186,6 +6400,8 @@ CPU: legacy::cpu::_th_equal CUDA: legacy::cuda::_th_equal QuantizedCPU: quantized_equal @@ -5008,7 +4971,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) -@@ -5193,6 +6419,8 @@ +@@ -5193,6 +6409,8 @@ dispatch: CPU: pow_out CUDA: pow_out @@ -5017,7 +4980,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5201,12 +6429,16 @@ +@@ -5201,12 +6419,16 @@ dispatch: CPU: pow CUDA: pow @@ -5034,7 +4997,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5214,6 +6446,8 @@ +@@ -5214,6 +6436,8 @@ dispatch: CPU: pow CUDA: pow @@ -5043,7 +5006,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) 
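-> Tensor(a!)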
variants: method -@@ -5221,40 +6455,58 @@ +@@ -5221,40 +6445,58 @@ CPU: normal_cpu_ CUDA: normal_cuda_ supports_named_tensor: True @@ -5102,7 +5065,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function -@@ -5265,16 +6517,22 @@ +@@ -5265,43 +6507,59 @@ dispatch: CPU: legacy::cpu::_th_addr CUDA: legacy::cuda::_th_addr @@ -5125,7 +5088,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) dispatch: -@@ -5286,22 +6544,30 @@ + CPU: legacy::cpu::_th_index_copy_ + CUDA: legacy::cuda::_th_index_copy_ +- ++ npu_dispatch: ++ NPU: index_copy_npu_ ++ + - func: _cumsum(Tensor self, int dim) -> Tensor + use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: legacy::cuda::_th_cumsum @@ -5156,7 +5126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5309,6 +6575,8 @@ +@@ -5309,6 +6567,8 @@ CPU: legacy::cpu::_th_var CUDA: legacy::cuda::_th_var supports_named_tensor: True @@ -5165,7 +5135,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5321,6 +6589,8 @@ +@@ -5321,6 +6581,8 @@ variants: function dispatch: CUDA: _amp_non_finite_check_and_unscale_cuda_ @@ -5174,7 +5144,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor variants: function -@@ -5332,12 +6602,16 @@ +@@ -5332,12 +6594,16 @@ CPU: _cat_cpu CUDA: cat_cuda QuantizedCPU: quantized_cat @@ -5191,7 +5161,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) dispatch: -@@ -5353,36 +6627,50 @@ +@@ -5353,36 +6619,50 @@ dispatch: CPU: legacy::cpu::_th_max CUDA: legacy::cuda::_th_max @@ -5242,7 +5212,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full -@@ -5390,23 +6678,33 @@ +@@ -5390,23 +6670,33 @@ dispatch: CPU: mse_loss_backward CUDA: mse_loss_backward @@ -5276,7 +5246,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5434,22 +6732,30 @@ +@@ -5434,22 +6724,30 @@ - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5307,7 +5277,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -5466,97 +6772,137 @@ +@@ -5466,97 +6764,137 @@ - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) 
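-> Tensor(a!)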
python_module: nn @@ -5445,7 +5415,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5564,6 +6910,8 @@ +@@ -5564,6 +6902,8 @@ CPU: elu_out CUDA: elu_out QuantizedCPU: quantized_elu_out @@ -5454,7 +5424,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full -@@ -5572,16 +6920,22 @@ +@@ -5572,16 +6912,22 @@ CPU: elu CUDA: elu QuantizedCPU: quantized_elu @@ -5477,7 +5447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) python_module: nn -@@ -5589,12 +6943,16 @@ +@@ -5589,12 +6935,16 @@ CPU: elu_ CUDA: elu_ QuantizedCPU: quantized_elu_ @@ -5494,7 +5464,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu(Tensor self, int dim=-1) -> Tensor use_c10_dispatcher: full -@@ -5602,12 +6960,16 @@ +@@ -5602,12 +6952,16 @@ dispatch: CPU: glu CUDA: legacy::cuda::_thnn_glu_forward @@ -5511,7 +5481,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor use_c10_dispatcher: full -@@ -5615,20 +6977,30 @@ +@@ -5615,20 +6969,30 @@ dispatch: CPU: glu_backward CUDA: legacy::cuda::_thnn_glu_backward @@ -5542,7 +5512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5636,6 +7008,8 @@ +@@ -5636,6 +7000,8 @@ CPU: hardtanh_out CUDA: hardtanh_out QuantizedCPU: quantized_hardtanh_out @@ -5551,7 +5521,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full -@@ -5644,16 +7018,22 @@ +@@ -5644,16 +7010,22 @@ CPU: hardtanh CUDA: hardtanh QuantizedCPU: quantized_hardtanh @@ -5574,7 +5544,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) python_module: nn -@@ -5661,6 +7041,8 @@ +@@ -5661,6 +7033,8 @@ CPU: hardtanh_ CUDA: hardtanh_ QuantizedCPU: quantized_hardtanh_ @@ -5583,7 +5553,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5668,6 +7050,8 @@ +@@ -5668,6 +7042,8 @@ CPU: leaky_relu_out CUDA: leaky_relu_out QuantizedCPU: quantized_leaky_relu_out @@ -5592,7 +5562,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full -@@ -5676,10 +7060,14 @@ +@@ -5676,10 +7052,14 @@ CPU: leaky_relu CUDA: leaky_relu QuantizedCPU: quantized_leaky_relu @@ -5607,7 +5577,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) 
python_module: nn -@@ -5687,31 +7075,44 @@ +@@ -5687,31 +7067,44 @@ CPU: leaky_relu_ CUDA: leaky_relu_ QuantizedCPU: quantized_leaky_relu_ @@ -5652,7 +5622,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor use_c10_dispatcher: full -@@ -5719,6 +7120,8 @@ +@@ -5719,6 +7112,8 @@ dispatch: CPU: log_sigmoid_backward_cpu CUDA: legacy::cuda::_thnn_log_sigmoid_backward @@ -5661,7 +5631,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5744,37 +7147,53 @@ +@@ -5744,37 +7139,53 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5715,7 +5685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5782,9 +7201,13 @@ +@@ -5782,9 +7193,13 @@ CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda MkldnnCPU: mkldnn_adaptive_avg_pool2d_out @@ -5729,7 +5699,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: -@@ -5796,6 +7219,8 @@ +@@ -5796,6 +7211,8 @@ CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: quantized_adaptive_avg_pool2d @@ -5738,7 +5708,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5803,24 +7228,32 @@ +@@ -5803,24 +7220,32 @@ dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda @@ -5771,7 +5741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5828,6 +7261,8 @@ +@@ -5828,6 +7253,8 @@ dispatch: CPU: adaptive_avg_pool3d_backward_cpu CUDA: adaptive_avg_pool3d_backward_cuda @@ -5780,7 +5750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) -@@ -5835,6 +7270,8 @@ +@@ -5835,6 +7262,8 @@ dispatch: CPU: adaptive_max_pool2d_out_cpu CUDA: adaptive_max_pool2d_out_cuda @@ -5789,7 +5759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) -@@ -5842,12 +7279,16 @@ +@@ -5842,12 +7271,16 @@ dispatch: CPU: adaptive_max_pool2d_cpu CUDA: adaptive_max_pool2d_cuda @@ -5806,7 +5776,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor use_c10_dispatcher: full -@@ -5855,6 +7296,8 @@ +@@ -5855,6 +7288,8 @@ dispatch: CPU: adaptive_max_pool2d_backward_cpu CUDA: adaptive_max_pool2d_backward_cuda @@ -5815,7 +5785,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5889,6 +7332,8 @@ +@@ -5889,6 +7324,8 @@ CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda MkldnnCPU: mkldnn_avg_pool2d_out @@ -5824,7 +5794,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5897,24 +7342,32 @@ +@@ -5897,24 +7334,32 @@ CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d QuantizedCPU: quantized_avg_pool2d @@ -5857,7 +5827,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5922,18 +7375,24 @@ +@@ -5922,18 +7367,24 @@ CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda QuantizedCPU: quantized_avg_pool3d @@ -5882,7 +5852,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5993,6 +7452,8 @@ +@@ -5993,6 +7444,8 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda @@ -5891,7 +5861,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6000,6 +7461,8 @@ +@@ -6000,6 +7453,8 @@ dispatch: CPU: max_pool2d_with_indices_cpu CUDA: max_pool2d_with_indices_cuda @@ -5900,7 +5870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
-@@ -6007,12 +7470,16 @@ +@@ -6007,12 +7462,16 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda @@ -5917,7 +5887,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -6020,6 +7487,8 @@ +@@ -6020,6 +7479,8 @@ dispatch: CPU: max_pool3d_with_indices_out_cpu CUDA: max_pool3d_with_indices_out_cuda @@ -5926,7 +5896,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6027,6 +7496,8 @@ +@@ -6027,6 +7488,8 @@ dispatch: CPU: max_pool3d_with_indices_cpu CUDA: max_pool3d_with_indices_cuda @@ -5935,7 +5905,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) -@@ -6034,12 +7505,17 @@ +@@ -6034,12 +7497,17 @@ dispatch: CPU: max_pool3d_with_indices_backward_out_cpu CUDA: max_pool3d_with_indices_backward_out_cuda @@ -5953,7 +5923,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6118,12 +7594,16 @@ +@@ -6118,12 +7586,16 @@ dispatch: CPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda @@ -5970,7 +5940,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6166,12 +7646,16 @@ +@@ -6166,12 +7638,16 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda @@ -5987,7 +5957,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6214,12 +7698,16 @@ +@@ -6214,12 +7690,16 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda @@ -6004,7 +5974,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6232,12 +7720,16 @@ +@@ -6232,12 +7712,16 @@ dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda @@ -6021,7 +5991,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn -@@ -6245,96 +7737,128 @@ +@@ -6245,96 +7729,128 @@ CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda QuantizedCPU: quantized_upsample_bilinear2d_cpu @@ -6150,7 +6120,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6342,24 +7866,32 @@ +@@ -6342,24 +7858,32 @@ CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda QuantizedCPU: quantized_upsample_nearest2d_cpu @@ -6183,7 +6153,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6367,38 +7899,52 @@ +@@ -6367,38 +7891,52 @@ CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda QuantizedCPU: quantized_upsample_nearest3d_cpu @@ -6236,7 +6206,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # What's a thnn_conv_ versus a slow_conv_? # -@@ -6423,24 +7969,32 @@ +@@ -6423,24 +7961,32 @@ dispatch: CPU: slow_conv_transpose2d_out_cpu CUDA: slow_conv_transpose2d_out_cuda @@ -6269,7 +6239,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6468,21 +8022,29 @@ +@@ -6468,21 +8014,29 @@ - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6299,7 +6269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn -@@ -6495,32 +8057,46 @@ +@@ -6495,32 +8049,46 @@ dispatch: CPU: slow_conv2d_backward_cpu CUDA: legacy::cuda::_thnn_conv2d_backward @@ -6346,7 +6316,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6553,12 +8129,16 @@ +@@ -6553,12 +8121,16 @@ dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda @@ -6363,7 +6333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn -@@ -6577,57 +8157,393 @@ +@@ -6577,57 +8149,401 @@ dispatch: CPU: col2im_out_cpu CUDA: col2im_out_cuda @@ -6548,7 +6518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_dispatch_only: + NPU: nms_v4_npu + -+- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) ++- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + variants: function + npu_dispatch_only: + NPU: lstm_npu @@ -6757,10 +6727,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + variants: function, method + npu_dispatch_only: + NPU: masked_fill_range_npu ++ ++- func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor ++ npu_dispatch_only: ++ NPU: linear_npu ++ ++- func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) ++ npu_dispatch_only: ++ NPU: linear_backward_npu \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-05 14:59:26.496336915 +0800 ++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-09 17:16:47.866792783 +0800 @@ -659,14 +659,14 @@ SUB x1, x1, 4 @@ -6786,7 +6764,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CMP x1, 2 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-05 14:59:26.440336488 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -64,7 +64,7 @@ Tensor isinf(const Tensor &self) { @@ -6798,7 +6776,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ 
pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6843,7 +6821,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -87,6 +87,7 @@ if (self.is_contiguous(memory_format)) { return self; @@ -6854,7 +6832,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= "preserve memory format is unsupported by the contiguous operator"); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-09 17:16:47.814790918 +0800 @@ -26,7 +26,7 @@ const scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; @@ -6866,7 +6844,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= out += output_width * output_height; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py --- pytorch-v1.5.0/aten/src/ATen/native_parse.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-09 17:16:47.878793213 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -6904,7 +6882,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= msg = '''Exception raised in processing function: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6936,7 +6914,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6969,7 +6947,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7003,7 +6981,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-05 14:59:26.516337067 +0800 ++++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-09 17:16:47.886793500 +0800 @@ -48,6 +48,11 @@ ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) @@ -7018,7 +6996,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-05 14:59:26.520337098 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-09 17:16:47.886793500 +0800 @@ -1,9 +1,32 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7127,7 +7105,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-05 14:59:26.520337098 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-09 17:16:47.886793500 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7166,7 +7144,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt --- pytorch-v1.5.0/c10/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/CMakeLists.txt 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/CMakeLists.txt 2021-07-09 17:16:47.902794074 +0800 @@ -63,6 +63,14 @@ message(STATUS "don't use NUMA") endif() @@ -7195,7 +7173,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # not checked in diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h --- pytorch-v1.5.0/c10/core/Backend.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Backend.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Backend.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7290,7 +7268,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp --- pytorch-v1.5.0/c10/core/Device.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Device.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7330,7 +7308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= types.begin(), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h --- pytorch-v1.5.0/c10/core/Device.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Device.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7365,7 +7343,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return type_ == DeviceType::CPU; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp --- pytorch-v1.5.0/c10/core/DeviceType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7405,7 +7383,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return false; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h --- pytorch-v1.5.0/c10/core/DeviceType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DeviceType.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7448,7 +7426,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= constexpr DeviceType kXLA = DeviceType::XLA; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp --- pytorch-v1.5.0/c10/core/DispatchKey.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7480,7 +7458,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= case DispatchKey::TESTING_ONLY_GenericModeTensorId: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h --- pytorch-v1.5.0/c10/core/DispatchKey.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DispatchKey.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7512,7 +7490,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h --- pytorch-v1.5.0/c10/core/Storage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Storage.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Storage.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7546,7 +7524,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h --- pytorch-v1.5.0/c10/core/StorageImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/StorageImpl.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/StorageImpl.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,12 +1,39 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7603,7 +7581,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h --- pytorch-v1.5.0/c10/core/TensorImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorImpl.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/core/TensorImpl.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7673,7 +7651,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h --- pytorch-v1.5.0/c10/core/TensorOptions.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorOptions.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/core/TensorOptions.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7714,7 +7692,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h --- pytorch-v1.5.0/c10/macros/Export.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/macros/Export.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/macros/Export.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7841,7 +7819,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt --- pytorch-v1.5.0/caffe2/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-05 14:59:26.544337280 +0800 ++++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-09 17:16:47.918794647 +0800 @@ -32,6 +32,7 @@ # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) @@ -7988,7 +7966,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format --- pytorch-v1.5.0/.clang-format 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.clang-format 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/.clang-format 2021-07-09 17:16:47.778789628 +0800 @@ -84,5 +84,4 @@ SpacesInSquareBrackets: false Standard: Cpp11 @@ -7999,7 +7977,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake --- pytorch-v1.5.0/cmake/BuildVariables.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-05 14:59:26.652338104 +0800 ++++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -11,6 +11,7 @@ # CMakeLists.txt files under each folder respectively. set(Caffe2_CPU_SRCS) @@ -8026,7 +8004,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # symbols. However, if the lib is whole linked in caffe2 lib, we don't want diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake --- pytorch-v1.5.0/cmake/Codegen.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Codegen.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Codegen.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -191,13 +191,14 @@ file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) @@ -8057,7 +8035,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake --- pytorch-v1.5.0/cmake/Dependencies.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Dependencies.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Dependencies.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -1509,6 +1509,13 @@ ENDIF(NOT C_HAS_THREAD) endif() @@ -8074,7 +8052,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake --- pytorch-v1.5.0/cmake/Summary.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Summary.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Summary.cmake 2021-07-09 17:16:48.034798807 +0800 @@ -134,6 +134,7 @@ if(NOT "${SELECTED_OP_LIST}" STREQUAL "") message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") @@ -8085,7 +8063,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endfunction() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur 
pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-09 17:16:48.034798807 +0800 @@ -112,6 +112,11 @@ list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() @@ -8100,7 +8078,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt --- pytorch-v1.5.0/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/CMakeLists.txt 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/CMakeLists.txt 2021-07-09 17:16:47.782789771 +0800 @@ -205,6 +205,10 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) @@ -8167,7 +8145,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore --- pytorch-v1.5.0/.dockerignore 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.dockerignore 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/.dockerignore 2021-07-09 17:16:47.778789628 +0800 @@ -1,257 +1 @@ -# READ THIS BEFORE YOU REFACTOR ME -# @@ -8430,7 +8408,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat --- pytorch-v1.5.0/docs/make.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/docs/make.bat 2021-07-05 14:59:26.660338165 +0800 ++++ pytorch-develop/docs/make.bat 2021-07-09 17:16:48.038798950 +0800 @@ -1,36 +1,36 @@ -@ECHO OFF - @@ -8519,7 +8497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt --- pytorch-v1.5.0/requirements.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/requirements.txt 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/requirements.txt 2021-07-09 17:16:48.054799524 +0800 @@ -4,4 +4,12 @@ requests setuptools @@ -8538,7 +8516,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat --- pytorch-v1.5.0/scripts/appveyor/install.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/appveyor/install.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,10 +1,10 @@ -:: Installation scripts for 
appveyor. - @@ -8562,7 +8540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +conda install -y numpy diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,22 +1,22 @@ -@echo on - @@ -8610,7 +8588,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +nvcc -V || exit /b diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat --- pytorch-v1.5.0/scripts/build_windows.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/build_windows.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/build_windows.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,84 +1,84 @@ -:: ############################################################################# -:: Example command to build on Windows. @@ -8782,7 +8760,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +exit /b 1 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 --- pytorch-v1.5.0/scripts/proto.ps1 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/proto.ps1 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/proto.ps1 2021-07-09 17:16:48.054799524 +0800 @@ -1,17 +1,17 @@ -param( - [string]$protoc, @@ -8820,7 +8798,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +Invoke-Expression $cmd diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py --- pytorch-v1.5.0/setup.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/setup.py 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/setup.py 2021-07-09 17:16:48.054799524 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -8919,7 +8897,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'python/serialized_test/data/operator_test/*.zip', diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml --- pytorch-v1.5.0/tools/autograd/derivatives.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-09 17:16:49.194840399 +0800 @@ -107,6 +107,10 @@ # # NB: The parameter names here MUST be consistent with the parameter names @@ -8976,12 +8954,12 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The above backward definitions are equivalent to the definitions below. 
Why do we bundle # everything up? It's because it's more convenient to define double backwards # when there is a single function that manages everything. -@@ -1630,3 +1643,48 @@ +@@ -1630,3 +1643,52 @@ - name: nonzero(Tensor self) -> Tensor output_differentiability: [False] + -+- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) ++- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + output_differentiability: [True, True, True, False, False, False, False, False] + input, weight, bias, h, c: npu_lstm_backward(grads[0], grads[1], grads[2], input, weight, bias, h, c, result0, result1, result2, result3, result4, result5, result6, result7) + @@ -9025,10 +9003,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + +- name: npu_mish(Tensor self) -> Tensor + self: npu_mish_backward(grad, self) ++ ++- name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor ++ input, weight: npu_linear_backward(grad, input, weight) ++ bias: maybe_multiply(grad, 1) \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py --- pytorch-v1.5.0/tools/autograd/dump_utils.py 1970-01-01 08:00:00.000000000 +0800 -+++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-09 17:16:49.194840399 +0800 @@ -0,0 +1,112 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# All rights reserved. @@ -9144,7 +9126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9330,7 +9312,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,20 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9372,7 +9354,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'value': argname, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9545,7 +9527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9625,7 +9607,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto sparse = sparse_.coalesce(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -22,7 +22,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/structseq.h" @@ -9709,7 +9691,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -15,7 +15,13 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Event.h" @@ -9796,7 +9778,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 
pytorch-develop/tools/autograd/templates/VariableType.cpp --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -1,7 +1,27 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9827,7 +9809,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,20 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9859,7 +9841,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= const at::Tensor & unpack(const Tensor & t, const char * name, int pos); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl --- pytorch-v1.5.0/tools/build_variables.bzl 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/build_variables.bzl 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/build_variables.bzl 2021-07-09 17:16:49.198840543 +0800 @@ -46,6 +46,7 @@ "torch/csrc/autograd/functions/utils.cpp", "torch/csrc/autograd/input_buffer.cpp", @@ -9945,7 +9927,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py --- pytorch-v1.5.0/torch/autograd/profiler.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/autograd/profiler.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/autograd/profiler.py 2021-07-09 17:16:49.202840686 +0800 @@ -1,8 +1,25 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -10418,7 +10400,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return ''.join(result) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt --- pytorch-v1.5.0/torch/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/CMakeLists.txt 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/torch/CMakeLists.txt 2021-07-09 17:16:49.198840543 +0800 @@ -97,6 +97,7 @@ ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp ${TORCH_SRC_DIR}/csrc/utils.cpp @@ -10450,7 +10432,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10573,7 +10555,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto event = c10::Event{c10::DeviceType::CUDA}; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10605,7 +10587,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /*non_blocking=*/false, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10648,7 +10630,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= m.def("_enable_profiler", enableProfiler); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10700,7 +10682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto& old_var = buffer[pos]; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10896,7 +10878,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CUDAStubs::~CUDAStubs() = default; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11021,7 +11003,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11075,7 +11057,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11116,7 +11098,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-09 17:16:49.214841116 +0800 @@ -168,6 +168,45 @@ return r.release(); } @@ -11165,7 +11147,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!r) throw python_error(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-09 17:16:49.210840973 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11199,7 +11181,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!t.defined()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11305,7 +11287,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= while (!in_flight.empty()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11362,7 +11344,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11487,7 +11469,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11536,7 +11518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return it->second; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp --- pytorch-v1.5.0/torch/csrc/Generator.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11604,7 +11586,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #endif diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-09 17:16:49.222841403 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11704,7 +11686,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,7 +1,25 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11783,7 +11765,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for (Py_ssize_t i = 0; i < length; i++) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-09 17:16:49.222841403 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11831,7 +11813,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp --- pytorch-v1.5.0/torch/csrc/Module.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Module.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/Module.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11975,7 +11957,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,18 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12352,7 +12334,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +} // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp --- pytorch-v1.5.0/torch/csrc/utils/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,6 +1,10 @@ #include #include @@ -12440,7 +12422,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h --- pytorch-v1.5.0/torch/csrc/utils/init.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.h 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/utils/init.h 2021-07-09 17:16:49.242842120 +0800 @@ -8,4 +8,7 @@ void initThroughputBenchmarkBindings(PyObject* module); @@ -12451,7 +12433,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12486,7 +12468,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return at::Device(device_str); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12517,7 +12499,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12653,7 +12635,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else if(expected_layout == c10::kSparse) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,58 +1,91 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12866,7 +12848,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def get_rng_state(): ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-09 17:16:49.246842264 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12947,7 +12929,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py --- pytorch-v1.5.0/torch/distributions/von_mises.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributions/von_mises.py 2021-07-05 14:59:27.868347381 +0800 ++++ pytorch-develop/torch/distributions/von_mises.py 2021-07-09 17:16:49.246842264 +0800 @@ -1,140 +1,140 @@ -from __future__ import absolute_import, division, print_function - @@ -13231,7 +13213,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + _log_modified_bessel_fn(self.concentration, order=0)).exp() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py --- pytorch-v1.5.0/torch/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/__init__.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/torch/__init__.py 2021-07-09 17:16:49.198840543 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13274,7 +13256,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-05 14:59:27.868347381 +0800 ++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 @@ -28,6 +28,10 @@ option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() @@ -13327,7 +13309,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= copy_header(ProcessGroupMPI.hpp) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 @@ -37,8 +37,11 @@ SET_TARGET_PROPERTIES(shm PROPERTIES PREFIX "lib" @@ -13384,7 +13366,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py --- pytorch-v1.5.0/torch/nn/functional.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/functional.py 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/nn/functional.py 2021-07-09 17:16:49.254842550 +0800 @@ -1611,7 +1611,7 @@ else: output = input.matmul(weight.t()) @@ -13407,7 +13389,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= -from . import parallel as parallel diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-09 17:16:49.254842550 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13439,7 +13421,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= self.register_parameter('running_var', None) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py --- pytorch-v1.5.0/torch/nn/modules/module.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/module.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/module.py 2021-07-09 17:16:49.254842550 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13582,7 +13564,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py --- pytorch-v1.5.0/torch/nn/modules/normalization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-09 17:16:49.254842550 +0800 @@ -128,13 +128,14 @@ """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] @@ -13615,7 +13597,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return '{normalized_shape}, eps={eps}, ' \ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-09 17:16:49.254842550 +0800 @@ -1,60 +1,60 @@ -from ..init import xavier_uniform_ -from .activation import MultiheadAttention @@ -13775,7 +13757,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - module_kwargs: Optional[Any] = ...) -> Tensor: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py --- pytorch-v1.5.0/torch/nn/parallel/distributed.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-09 17:16:49.258842694 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14126,7 +14108,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-09 17:16:49.258842694 +0800 @@ -1621,14 +1621,23 @@ slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals] return g.op('Concat', *slices, axis_i=0) @@ -14204,7 +14186,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py --- pytorch-v1.5.0/torch/optim/adamax.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/optim/adamax.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/optim/adamax.py 2021-07-09 17:16:49.262842837 +0800 @@ -80,8 +80,8 @@ exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0) @@ -14381,7 +14363,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py --- pytorch-v1.5.0/torch/serialization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/serialization.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/serialization.py 2021-07-09 17:16:49.262842837 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14465,7 +14447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def location_tag(storage): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py --- pytorch-v1.5.0/torch/storage.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/storage.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/storage.py 2021-07-09 17:16:49.262842837 +0800 @@ -7,6 +7,7 @@ class _StorageBase(object): @@ -14485,7 +14467,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py --- pytorch-v1.5.0/torch/tensor.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/tensor.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/tensor.py 2021-07-09 17:16:49.262842837 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14547,7 +14529,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def __reversed__(self): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py --- pytorch-v1.5.0/torch/_tensor_str.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_tensor_str.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/_tensor_str.py 2021-07-09 17:16:49.198840543 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14601,7 +14583,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py --- pytorch-v1.5.0/torch/utils/data/dataloader.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14810,7 +14792,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14871,7 +14853,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py --- pytorch-v1.5.0/torch/utils/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/__init__.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/__init__.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals @@ -14882,7 +14864,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def set_module(obj, mod): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py --- pytorch-v1.5.0/torch/_utils.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_utils.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/_utils.py 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml index afdda6988a665d514a0694374e86b4b5c061430f..4b3a1b7ded4f60281cf4d8dffa66d024ed3f5ef8 100644 --- a/src/aten/src/ATen/native/native_functions.yaml +++ b/src/aten/src/ATen/native/native_functions.yaml @@ -2302,8 +2302,6 @@ requires_tensor: True dispatch: QuantizedCPU: quantized_max_pool2d - npu_dispatch: - NPU: quantized_max_pool2d_npu - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor supports_named_tensor: True @@ -2511,13 +2509,9 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) supports_named_tensor: True variants: function, method - npu_dispatch: - NPU: mode_npu - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) supports_named_tensor: True - npu_dispatch: - NPU: mode_out_npu - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -6169,8 +6163,6 @@ dispatch: CPU: legacy::cpu::_th_histc_out CUDA: _histc_out_cuda - npu_dispatch: - NPU: histc_out_npu - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor use_c10_dispatcher: full @@ -6178,8 +6170,6 @@ dispatch: CPU: legacy::cpu::_th_histc CUDA: _histc_cuda - npu_dispatch: - NPU: histc_npu - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -6538,7 +6528,9 @@ dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - + npu_dispatch: + NPU: index_copy_npu_ + - func: _cumsum(Tensor self, int dim) -> Tensor use_c10_dispatcher: full dispatch: @@ -8338,7 +8330,7 @@ npu_dispatch_only: NPU: nms_v4_npu -- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) variants: function npu_dispatch_only: NPU: lstm_npu @@ -8546,4 +8538,12 @@ - func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor variants: function, method npu_dispatch_only: - NPU: masked_fill_range_npu \ No newline at end of file + NPU: masked_fill_range_npu + +- func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + npu_dispatch_only: + NPU: linear_npu + +- func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) + npu_dispatch_only: + NPU: linear_backward_npu \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/AddKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddKernelNpu.cpp index a2c4c8301ddc9087871c8f6a9538a9553ce14fe5..4d21e8652d355f6aa3c239ea1a7c15ac82d5ec8e 100644 --- a/src/aten/src/ATen/native/npu/AddKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddKernelNpu.cpp @@ -47,10 +47,20 @@ Tensor& adds_out_npu_nocheck( float alphaValue = CalcuOpUtil::get_scalar_float_value(alpha); float value = otherValue * alphaValue; OpCommand cmd; + std::string real_type = ""; + if (self.scalar_type() == c10::ScalarType::Bool) { + auto unified_result = OpPreparation::binary_op_check(result, self, other, true); + if (unified_result.common_type == c10::ScalarType::Bool) { + unified_result.common_type = c10::ScalarType::Byte; + unified_result.result_type_defined = true; + real_type = "uint8"; + } + cmd.Expect(unified_result); + } cmd.Name("Add") .Input(self) .Input(value, self.scalar_type()) - .Output(result) + .Output(result, real_type) .Run(); return result; diff --git a/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp index 97bd9f8b8a0b53cb81b64e1357f3b9385be041a6..15671b097f6143c81c711f56fd54a82d7d351541 100644 --- a/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp @@ -24,11 +24,11 @@ static inline void grid_assign_positive_check( const Tensor& argmax_overlaps, const Tensor& gt_argmax_overlaps){ TORCH_CHECK( - at::isIntegralType(argmax_overlaps.scalar_type()) && argmax_overlaps.scalar_type() != ScalarType::Long, + at::isIntegralType(argmax_overlaps.scalar_type(), true) && argmax_overlaps.scalar_type() != ScalarType::Long, "int32 argmax_overlaps tensor expected but got a tensor with dtype: ", argmax_overlaps.scalar_type()); TORCH_CHECK( - at::isIntegralType(gt_argmax_overlaps.scalar_type()) && gt_argmax_overlaps.scalar_type() != ScalarType::Long, + at::isIntegralType(gt_argmax_overlaps.scalar_type(), true) && gt_argmax_overlaps.scalar_type() != ScalarType::Long, "int32 gt_argmax_overlaps tensor expected but got 
a tensor with dtype: ", gt_argmax_overlaps.scalar_type()); } diff --git a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp index b9d0bfe9247169441b616fd38a1cb451cf6aa41b..63970a9d50ce6ce8a50b284dca5d047d11f910a0 100644 --- a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp @@ -23,11 +23,19 @@ using namespace at::native::npu; Tensor& gt_out_npu_nocheck(Tensor& result, const Tensor& self, const Tensor& other) { auto unified_result = OpPreparation::comparison_op_check(result, self, other, true); + + Tensor selfCast = self; + Tensor otherCast = other; + if(self.dtype() == ScalarType::Bool || other.dtype() == ScalarType::Bool){ + selfCast = self.to(ScalarType::Float); + otherCast = other.to(ScalarType::Float); + } + OpCommand cmd; cmd.Name("Greater") .Expect(unified_result) - .Input(self) - .Input(other) + .Input(selfCast) + .Input(otherCast) .Output(result) .Run(); @@ -51,10 +59,15 @@ Tensor& gt_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { } Tensor& gt_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar other) { + Tensor selfCast = self; + if(self.dtype() == ScalarType::Bool){ + selfCast = self.to(ScalarType::Float); + } + OpCommand cmd; cmd.Name("Greater") - .Input(self) - .Input(other, self.scalar_type()) + .Input(selfCast) + .Input(other, selfCast.scalar_type()) .Output(result) .Run(); diff --git a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp index d658eec1deb854ba4fced8595f3c8e9648b1bcef..346d41155fcb41e0b1396267dc8190b1e009ab16 100644 --- a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp @@ -72,15 +72,16 @@ Tensor& im2col_out_npu_nocheck(Tensor& result, const Tensor &self, IntArrayRef k TORCH_CHECK(padding.empty() || padding.size() == 1 || padding.size() == 2, "im2col: padding must either be omitted, a single int, or a tuple of two ints"); - padding = padding.empty() ? IntArrayRef({0}) : padding; - if (padding.size() == 1) { - SmallVector pads = {padding[0], padding[0], padding[0], padding[0]}; - padding = IntArrayRef(pads); - } else if (padding.size() == 2) { - SmallVector pads = {padding[0], padding[0], padding[1], padding[1]}; - padding = IntArrayRef(pads); + auto padding_ = padding.empty() ? 
IntArrayRef({0}) : padding; + SmallVector pads; + if (padding_.size() == 1) { + pads = {padding_[0], padding_[0], padding_[0], padding_[0]}; + } else if (padding_.size() == 2) { + pads = {padding_[0], padding_[0], padding_[1], padding_[1]}; } + auto padding_4d = IntArrayRef(pads); + int64_t strideH = 1; int64_t strideW = 1; if (stride.size() == 1) { @@ -100,10 +101,11 @@ Tensor& im2col_out_npu_nocheck(Tensor& result, const Tensor &self, IntArrayRef k dilationH = dilation[0]; dilationW = dilation[1]; } + SmallVector kernelSize = {kernel_size[0], kernel_size[1]}; SmallVector stridesSize = {strideH, strideW}; SmallVector dilationsSize = {dilationH, dilationW}; - SmallVector padsSize = {padding[0], padding[1], padding[2], padding[3]}; + SmallVector padsSize = {padding_4d[0], padding_4d[1], padding_4d[2], padding_4d[3]}; string padding_mode = "CALCULATED"; OpCommand cmd; @@ -135,12 +137,10 @@ Tensor& im2col_out_npu(Tensor& result, const Tensor &self, IntArrayRef kernel_si Tensor im2col_npu(const Tensor &self, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride) { - // calculate the output size auto outputSize = image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding); Tensor result = OpPreparation::ApplyTensor(self, outputSize); im2col_out_npu(result, self, kernel_size, dilation, padding, stride); - return result; } diff --git a/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c0fe1fa25dc4f793e59d084d7792c30641bd432 --- /dev/null +++ b/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "ATen/native/npu/utils/OpAdapter.h" +#include + +namespace at { +namespace native { +using namespace at::native::npu; + +void index_copy_npu_par_check( + const int64_t dim, + const Tensor& index, + const Tensor& source, + const Tensor& result) { + int64_t newDim = maybe_wrap_dim(dim, result.dim()); + TORCH_CHECK_INDEX(index.dim() < 2, "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + + int64_t numIndices = index.numel(); + TORCH_CHECK_INDEX(!(source.dim() == 0 && numIndices != 1), + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + TORCH_CHECK_INDEX(!((source.dim() != result.dim()) && (source.dim() != 0 && result.dim() != 0)), + "index_copy_(): When source and destination are not scalars, \ +their dimensionality must match. 
Source dimensionality (", + source.dim(), "), destination dimensionality (", result.dim(), ")"); + + TORCH_CHECK_INDEX(index.scalar_type() == ScalarType::Long, "index_copy_(): Expected LongTensor for index"); + + // Check that source and destination slices have the same size + auto selfSlicedSizes = result.sizes().vec(); + if (selfSlicedSizes.size() > 0) { + selfSlicedSizes.erase(selfSlicedSizes.begin() + newDim); + } + auto sourceSlicedSizes = source.sizes().vec(); + if (sourceSlicedSizes.size() > 0) { + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + newDim); + } + if (selfSlicedSizes.size() != sourceSlicedSizes.size() || + !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), + sourceSlicedSizes.begin())) { + std::stringstream ss; + ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; + ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << newDim; + ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; + TORCH_CHECK(false, ss.str()); + } + TORCH_CHECK_INDEX(source.dim() == 0 || numIndices == source.size(newDim), + "index_copy_(): Number of indices (", numIndices, + ") should be equal to source.size(newDim) (", source.size(newDim), ")"); +} + +Tensor& index_copy_npu_impl( + const int64_t dim, + const Tensor& index, + const Tensor& source, + Tensor& result) { + index_copy_npu_par_check(dim, index, source, result); + int64_t numIndices = index.numel(); + int64_t i; + if (result.dim() > 1) { + Tensor des; + Tensor src; + for (i = 0; i < numIndices; i++) { + des = at::native::select(result, dim, index[i].item()); + src = at::native::select(source, dim, i); + at::native::copy_npu_(des, src); + } + } else { + for (i = 0; i < numIndices; i++) { + result[i] = source[index[i].item()]; + } + } + return result; +} + +Tensor index_copy_npu( + const Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& source) { + Tensor result(self.clone()); + return index_copy_npu_impl(dim, index, source, result); + +} + +Tensor index_copy_npu( + const Tensor& self, + const Dimname dim, + const Tensor& index, + const Tensor& source) { + Tensor result(self.clone()); + return index_copy_npu_impl(dimname_to_position(self, dim), index, source, result); +} + +Tensor& index_copy_npu_( + Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& source) { + Tensor contiguousSelf(self); + if (!NpuUtils::check_match(&self)) { + contiguousSelf = NpuUtils::format_contiguous(self); + } + Tensor result = index_copy_npu_impl(dim, index, source, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + + return self; +} + +Tensor& index_copy_npu_( + Tensor& self, + const Dimname dim, + const Tensor& index, + const Tensor& source) { + Tensor contiguousSelf(self); + if (!NpuUtils::check_match(&self)) { + contiguousSelf = NpuUtils::format_contiguous(self); + } + Tensor result = index_copy_npu_impl(dimname_to_position(self, dim), index, source, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + + return self; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/HistcKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp similarity index 38% rename from src/aten/src/ATen/native/npu/HistcKernelNpu.cpp rename to src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp index 61624a31693ca083e6f3eb3f0e0ed19ddc405b90..492007773216401faeafdadc57ea23b482ea4e59 100644 --- a/src/aten/src/ATen/native/npu/HistcKernelNpu.cpp +++ 
b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp @@ -14,65 +14,48 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; -#define FLT_EPSILON 1.19209290E-07F - -bool is_zero(float x) -{ - if(x > -FLT_EPSILON && x < FLT_EPSILON){ - return true; - } - else{ - return false; - } -} - -Tensor& histc_out_npu( +Tensor linear_backward_out_npu( Tensor& result, - const Tensor& self, - int64_t bins, - Scalar min, - Scalar max) { + const Tensor& input, + const Tensor& weight, + bool transpose_x1, + bool transpose_x2) { + int64_t offset_x = 0; OpCommand cmd; - float max_value = CalcuOpUtil::get_scalar_float_value(max); - float min_value = CalcuOpUtil::get_scalar_float_value(min); - - if(max_value == min_value && is_zero(max_value)){ - // Execute reduce_max_d and reduce_min_d to get the min and max value - Tensor res_max = at::max(self); - Tensor res_min = at::min(self); - - max_value = CalcuOpUtil::get_scalar_float_value(res_max.item()); - min_value = CalcuOpUtil::get_scalar_float_value(res_min.item()); - } - cmd.Name("HistogramD") - .Input(self) - .Attr("bins", bins) - .Attr("min", min_value) - .Attr("max", max_value) + cmd.Name("MatMulV2") + .Input(input) + .Input(weight) .Output(result) + .Attr("transpose_x1", transpose_x1) + .Attr("transpose_x2", transpose_x2) + .Attr("offset_x", offset_x) .Run(); - return result; } -Tensor histc_npu(const Tensor& self, int64_t bins, Scalar min, Scalar max) { - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - {bins}, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU - histc_out_npu(result, self, bins, min, max); - - return result; +tuple linear_backward_npu( + const Tensor& grad, + const Tensor& input, + const Tensor& weight) { + SmallVector inputGradOutputSize = { + grad.size(0), + weight.size(1)}; + SmallVector weightGradOutputSize = { + grad.size(1), + input.size(1)}; + Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize); + Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize); + + linear_backward_out_npu(inputGrad, grad, weight, false, false); + linear_backward_out_npu(weightGrad, grad, input, true, false); + + return std::tie(inputGrad, weightGrad); } } // namespace native diff --git a/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f35c5de9fa2450586a10e89843f7d653b99207ba --- /dev/null +++ b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp @@ -0,0 +1,48 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor linear_npu( + const Tensor& input, + const Tensor& weight, + const Tensor& bias) { + SmallVector outputSize = {input.size(0), weight.size(0)}; + Tensor output = OpPreparation::ApplyTensor(input, outputSize); + + int64_t offset_x = 0; + OpCommand cmd; + cmd.Name("MatMulV2") + .Input(input) + .Input(weight); + if (bias.defined()) { + cmd.Input(bias); + } + cmd.Output(output) + .Attr("transpose_x1", false) + .Attr("transpose_x2", true) + .Attr("offset_x", offset_x) + .Run(); + + return output; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp index c158f0309e2672a911e3144e7e302fbce01bc82b..d526dd2b568ab939801da6cda83e4f0d81ee01ec 100644 --- a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp @@ -25,6 +25,7 @@ tuple lstm_npu( const Tensor& input, const Tensor& weight, const Tensor& bias, + const Tensor& seqMask, const Tensor& h, const Tensor& c, bool has_biases, @@ -32,7 +33,9 @@ tuple lstm_npu( double dropout, bool train, bool bidirectional, - bool batch_first) { + bool batch_first, + bool flagSeq, + bool flagDirection) { // calculate the output size int64_t numStep = input.size(0); int64_t batchSize = input.size(1); @@ -49,41 +52,83 @@ tuple lstm_npu( Tensor fOutput = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); Tensor oOutput = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); Tensor tanhc = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); - + + string direction = flagDirection? "REDIRECTIONAL" : "UNIDIRECTIONAL"; OpCommand cmd; cmd.Name("DynamicRNN") - .Input(input) - .Input(weight) - .Input(bias) - .Input() - .Input(h) - .Input(c) - .Output(yOutput) - .Output(hOutput) - .Output(cOutput) - .Output(iOutput) - .Output(jOutput) - .Output(fOutput) - .Output(oOutput) - .Output(tanhc) - .Attr("cell_type", (string)"LSTM") - .Attr("direction", (string)"UNIDIRECTIONAL") - .Attr("cell_depth", (int64_t)1) - .Attr("use_peephole", (bool)false) - .Attr("keep_prob", (float)1.0) - .Attr("cell_clip", (float)-1.0) - .Attr("num_proj", (int64_t)0) - .Attr("time_major", (bool)true) - .Attr("activation", (string)"tanh") - .Attr("forget_bias", (float)0.0) - .Attr("is_training", train) - .Run(); + .Input(input, "x") + .Input(weight, "w") + .Input(bias, "b"); + + //if input is PackSequence, seqMask is not None, Otherwise, it is None. 
+ if (!flagSeq){ + cmd.Input(); + } else{ + cmd.Input(seqMask, "seq_length"); + } + cmd.Input(h, "init_h") + .Input(c, "init_c") + .Output(yOutput) + .Output(hOutput) + .Output(cOutput) + .Output(iOutput) + .Output(jOutput) + .Output(fOutput) + .Output(oOutput) + .Output(tanhc) + .Attr("cell_type", (string)"LSTM") + .Attr("direction", direction) + .Attr("cell_depth", (int64_t)1) + .Attr("use_peephole", (bool)false) + .Attr("keep_prob", (float)1.0) + .Attr("cell_clip", (float)-1.0) + .Attr("num_proj", (int64_t)0) + .Attr("time_major", (bool)true) + .Attr("activation", (string)"tanh") + .Attr("forget_bias", (float)0.0) + .Attr("is_training", train) + .Run(); + //std::cout<<"yOutput: "<( - yOutput, hOutput, cOutput, iOutput, jOutput, fOutput, oOutput, tanhc); + yOutput, hOutput, cOutput, iOutput, jOutput, fOutput, oOutput, tanhc); } -tuple lstm_npu( +tuple get_wb_single_layer_direc( + const Tensor& input, + TensorList params, + bool hasBiases) { + // get weight + Tensor ihWeight = params[0]; + Tensor hhWeight = params[1]; + + Tensor weight = at::cat({ihWeight, hhWeight}, 1).t().to(input.dtype()); + + // get bias + Tensor bias = at::zeros(weight.size(1), weight.options()); + if (hasBiases) { + bias = at::add(params[2], params[3]).to(input.dtype()); + } + return std::tie(weight, bias); +} + +tuple get_wb_double_layer_or_bidirec( + const Tensor& input, + TensorList params, + bool hasBiases) { + Tensor weight; + Tensor bias; + if (hasBiases) { + weight = at::cat({params[4], params[5]}, 1).t().to(input.dtype()); + bias = at::add(params[6], params[7]).to(input.dtype()); + } else { + weight = at::cat({params[2], params[3]}, 1).t().to(input.dtype()); + bias = at::zeros(weight.size(1), weight.options()); + } + return std::tie(weight, bias); +} + +tuple lstm_single_layer_direc_npu( const Tensor& input, TensorList hx, TensorList params, @@ -92,7 +137,8 @@ tuple lstm_npu( double dropout, bool train, bool bidirectional, - bool batchFirst) { + bool batchFirst, + bool direction) { int64_t numStep = input.size(0); // get weight @@ -110,51 +156,169 @@ tuple lstm_npu( // get init_h, init_c Tensor h = hx[0]; Tensor c = hx[1]; - if(numLayers == 2) - { - h = hx[0].slice(0, 0, 1); - c = hx[1].slice(0, 0, 1); - } - - auto results = at::npu_lstm( - input, weight, bias, h, c, hasBiases, numLayers, dropout, train, bidirectional, batchFirst); + + Tensor seqMask = at::empty({0}, input.options()); + auto results = at::npu_lstm(input, weight, bias, seqMask, h, c, hasBiases, numLayers, dropout, + train, bidirectional, batchFirst, false, direction); // get the last dimension of the T-axis Tensor thOutput = at::unsqueeze(std::get<1>(results)[numStep-1], 0); Tensor tcOutput = at::unsqueeze(std::get<2>(results)[numStep-1], 0); - - //double layer LSTM - if (numLayers == 2) { + + return std::tie(std::get<0>(results), thOutput, tcOutput); +} + +tuple lstm_single_layer_bidirec_npu( + const Tensor& input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + int64_t numStep = input.size(0); + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + //caculate forward direction, direction of attr is UNIDIRECTIONAL(npu_lstm need add the attr of direction) + auto resultsForward = lstm_single_layer_direc_npu(input, {h, c}, params, hasBiases, + numLayers, dropout, train, bidirectional, batchFirst, false); + + //get w/ b/ h/ c of backward direction + Tensor weightBack; + Tensor biasBack; 
+ Tensor hBack = hx[0].slice(0, 1, 2); + Tensor cBack = hx[1].slice(0, 1, 2); + std::tie(weightBack, biasBack) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + Tensor seqMask = at::empty({0}, input.options()); + //caculate forward direction, direction of attr is REDIRECTIONAL + auto resultsBackward = at::npu_lstm(input, weightBack, biasBack, seqMask, hBack, cBack, + hasBiases, numLayers, dropout, train, bidirectional, batchFirst, false, true); + + // get the first dimension of the T-axis when caculate reverse direction + Tensor thOutput = at::unsqueeze(std::get<1>(resultsBackward)[0], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(resultsBackward)[0], 0); + + Tensor y = at::cat({std::get<0>(resultsForward), std::get<0>(resultsBackward)}, 2); + Tensor hOut = at::cat({std::get<1>(resultsForward), thOutput}, 0); + Tensor cOut = at::cat({std::get<2>(resultsForward), tcOutput}, 0); + + return std::tie(y, hOut, cOut); +} + +tuple lstm_double_layer_direc_npu( + const Tensor& input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + int64_t numStep = input.size(0); + //get h and c of first layer + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + //caculate first layer + auto results = lstm_single_layer_direc_npu(input, {h, c}, params, hasBiases, + numLayers, dropout, train, bidirectional, batchFirst, false); + + //get w/ b/ h/ c of twice layer Tensor weight2Layer; Tensor bias2Layer; Tensor h2layer = hx[0].slice(0, 1, 2); Tensor c2layer = hx[1].slice(0, 1, 2); - if (hasBiases) { - weight2Layer = at::cat({params[4], params[5]}, 1).t().to(input.dtype()); - bias2Layer = at::add(params[6], params[7]).to(input.dtype()); - } else { - weight2Layer = at::cat({params[2], params[3]}, 1).t().to(input.dtype()); - bias2Layer = at::zeros(weight2Layer.size(1), weight2Layer.options()); - } + std::tie(weight2Layer, bias2Layer) = get_wb_double_layer_or_bidirec(input, params, hasBiases); //output of first layer as input of second layer Tensor input2Layer = std::get<0>(results); + Tensor seqMask = at::empty({0}, input.options()); //caculate output of second layer - auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, h2layer, c2layer, - hasBiases, numLayers, dropout, train, bidirectional, batchFirst); + auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, seqMask, h2layer, c2layer, + hasBiases, numLayers, dropout, train, bidirectional, batchFirst, false, false); Tensor thOutput2Layer = at::unsqueeze(std::get<1>(results2Layer)[numStep-1], 0); Tensor tcOutput2Layer = at::unsqueeze(std::get<2>(results2Layer)[numStep-1], 0); - Tensor th = at::cat({thOutput, thOutput2Layer}, 0); - Tensor tc = at::cat({tcOutput, tcOutput2Layer}, 0); + Tensor th = at::cat({std::get<1>(results), thOutput2Layer}, 0); + Tensor tc = at::cat({std::get<2>(results), tcOutput2Layer}, 0); - return std::tie(std::get<0>(results2Layer), th, tc); + return std::tie(std::get<0>(results2Layer), th, tc); +} + +tuple lstm_npu( + const Tensor& _input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + //The operator of DynamicRnn only supports the T axis as the first axis. + auto input = batchFirst ? 
_input.transpose(0, 1) : _input; + + Tensor y; + Tensor h; + Tensor c; + //single layer + if(numLayers == 1){ + if(!bidirectional){ + std::tie(y, h, c) = lstm_single_layer_direc_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst, false); + } else { + std::tie(y, h, c) = lstm_single_layer_bidirec_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst); + } + } + + //double layer + if((numLayers == 2) && (!bidirectional)) { + std::tie(y, h, c) = lstm_double_layer_direc_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst); + } + return std::tie(y, h, c); +} + +Tensor get_mask(const Tensor& input, const Tensor& batchSizes, const Tensor& h, int64_t maxLen){ + //caculate lengths, but input expected to be sorted + std::vector lens; + for (int64_t i = 0; i < input.size(1); ++i){ + auto batchSizesTemp = at::sub(batchSizes , i); + auto batchSizesBool = at::gt(batchSizesTemp, 0); + auto batchSizesInt = batchSizesBool.to(ScalarType::Int); + auto coutLen = at::sum(batchSizesInt, ScalarType::Int); + int64_t len = coutLen.item().toInt(); + lens.emplace_back(len); } + Tensor length = CalcuOpUtil::copy_tensor_host_to_device( + from_blob(lens.data(), {lens.size()}, at::kLong)); - return std::tie(std::get<0>(results), thOutput, tcOutput); + SmallVector maskList; + //Slice by T axis + for (int64_t i = 0; i < maxLen; ++i) { + //cacl mask + Tensor maskTemp1 = at::gt(length, i); + Tensor maskTemp2 = maskTemp1.reshape({1, input.size(1), 1}); + + //mask need to be expanded to (1,batch_size,hidden_size) + Tensor maskExpand = maskTemp2.expand({1, input.size(1), h.size(2)}); + maskList.emplace_back(maskExpand); + } + + //mask mast be half + Tensor mask = at::cat(maskList, 0).to(ScalarType::Half); + + return mask; } -std::tuple lstm_npu( +std::tuple lstm_onelayer_direc_packseq( const Tensor& data, const Tensor& batchSizes, TensorList hx, TensorList params, bool hasBiases, int64_t numLayers, double dropoutP, bool train, bool bidirectional) { @@ -170,53 +334,153 @@ std::tuple lstm_npu( // get init_h, init_c Tensor h = hx[0]; Tensor c = hx[1]; + + int64_t numStep = input.size(0); + + // get weight + Tensor ihWeight = params[0]; + Tensor hhWeight = params[1]; + Tensor weight = at::cat({ihWeight, hhWeight}, 1).t().to(input.dtype()); + + // get bias + Tensor bias = at::zeros(weight.size(1), weight.options()); + if (hasBiases) { + bias = at::add(params[2], params[3]).to(input.dtype()); + } int64_t maxLen = input.size(0); - std::vector outputs; - std::vector hxPrev = {h, c}; - //caculate lengths, but input expected to be sorted - std::vector lens; - for (int64_t i = 0; i < input.size(1); ++i){ - auto batchSizesTemp = at::sub(batchSizes , i); - auto batchSizesBool = at::gt(batchSizesTemp, 0); - auto batchSizesInt = batchSizesBool.to(ScalarType::Int); - auto coutLen = at::sum(batchSizesInt, ScalarType::Int); - int64_t len = coutLen.item().toInt(); - lens.emplace_back(len); - } - Tensor length = CalcuOpUtil::copy_tensor_host_to_device( - from_blob(lens.data(), {lens.size()}, at::kLong)); - - //Slice by T axis - for (int64_t i = 0; i < maxLen; ++i) { - Tensor step = input.slice(0, i, i + 1).contiguous().reshape({1, input.size(1), input.size(2)}); + Tensor mask = get_mask(input, batchSizes, h, maxLen); + auto results = at::npu_lstm(input, weight, bias, mask, h, c, hasBiases, numLayers, + dropoutP, train, bidirectional, false, true, false); - //calculate output of each times - auto results = lstm_npu(step, hxPrev, params, 
hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst); - - //get previous result - Tensor outputTemp = std::get<0>(results); - std::vector hxCurr = {std::get<1>(results), std::get<2>(results)}; + Tensor thOutput = at::unsqueeze(std::get<1>(results)[numStep-1], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(results)[numStep-1], 0); + + return std::tuple(std::get<0>(results), thOutput, tcOutput); +} - //cacl mask - Tensor maskTemp = at::gt(length, i); - Tensor mask = maskTemp.reshape({1, input.size(1), 1}); +std::tuple lstm_onelayer_bidirec_packseq( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + //length of T axis + int64_t t_size = batchSizes.numel(); + + //T * B ** + Tensor input = data.reshape({t_size, data.size(0)/t_size, data.size(1)}); + + // batch_first is false + bool batchFirst = false; + + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); - //calculate real output of each times - Tensor maskNeg = at::logical_not(mask); - Tensor output = at::mul(outputTemp, mask); + auto resultsForward = lstm_onelayer_direc_packseq(data, batchSizes, {h, c}, params, hasBiases, + numLayers, dropoutP, train, bidirectional); - //updata hx - h = at::mul(mask, hxCurr[0]) + at::mul(maskNeg, hxPrev[0]); - c = at::mul(mask, hxCurr[1]) + at::mul(maskNeg, hxPrev[1]); - hxPrev = {h, c}; + //get w/ b/ h/ c of backward direction + Tensor hBack = hx[0].slice(0, 1, 2); + Tensor cBack = hx[1].slice(0, 1, 2); + + Tensor weightBack; + Tensor biasBack; + std::tie(weightBack, biasBack) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + int64_t maxLen = input.size(0); + + Tensor mask = get_mask(input, batchSizes, h, maxLen); + //caculate forward direction, direction of attr is REDIRECTIONAL + auto resultsBackward = at::npu_lstm(input, weightBack, biasBack, mask, hBack, cBack, + hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst, true, true); + + // get the first dimension of the T-axis when caculate reverse direction + Tensor thOutput = at::unsqueeze(std::get<1>(resultsBackward)[0], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(resultsBackward)[0], 0); + + Tensor y = at::cat({std::get<0>(resultsForward), std::get<0>(resultsBackward)}, 2); + Tensor hOut = at::cat({std::get<1>(resultsForward), thOutput}, 0); + Tensor cOut = at::cat({std::get<2>(resultsForward), tcOutput}, 0); - outputs.push_back(output); - } - Tensor result = at::cat(outputs, 0); + return std::tie(y, hOut, cOut); +} + +std::tuple lstm_double_layer_direc_packseq( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + //length of T axis + int64_t t_size = batchSizes.numel(); - return std::tie(result, h, c); + //T * B ** + Tensor input = data.reshape({t_size, data.size(0)/t_size, data.size(1)}); + + // batch_first is false + bool batchFirst = false; + + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + int64_t numStep = input.size(0); + + auto results = lstm_onelayer_direc_packseq(data, batchSizes, {h, c}, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + + //get w/ b/ h/ c of twice layer + Tensor weight2Layer; + Tensor bias2Layer; + Tensor h2layer = hx[0].slice(0, 1, 2); + Tensor c2layer = hx[1].slice(0, 1, 2); + std::tie(weight2Layer, 
bias2Layer) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + int64_t maxLen = input.size(0); + + Tensor mask = get_mask(input, batchSizes, h, maxLen); + + //output of first layer as input of second layer + Tensor input2Layer = std::get<0>(results); + + //caculate output of second layer + auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, mask, h2layer, c2layer, + hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst, true, false); + Tensor thOutput2Layer = at::unsqueeze(std::get<1>(results2Layer)[numStep-1], 0); + Tensor tcOutput2Layer = at::unsqueeze(std::get<2>(results2Layer)[numStep-1], 0); + Tensor th = at::cat({std::get<1>(results), thOutput2Layer}, 0); + Tensor tc = at::cat({std::get<2>(results), tcOutput2Layer}, 0); + + return std::tie(std::get<0>(results2Layer), th, tc); +} + +std::tuple lstm_npu( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + Tensor y; + Tensor h; + Tensor c; + + // batch_first is false + bool batchFirst = false; + + //single layer + if(numLayers == 1){ + if(!bidirectional){ + std::tie(y, h, c) = lstm_onelayer_direc_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } else { + std::tie(y, h, c) = lstm_onelayer_bidirec_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } + } + + //double layer + if((numLayers == 2) && (!bidirectional)) { + std::tie(y, h, c) = lstm_double_layer_direc_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } + return std::tie(y, h, c); } } // namespace native diff --git a/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp deleted file mode 100644 index a1a771322f09ac13f51b25d53e85eb821682a7e9..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2020 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
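For orientation, the LstmKernelNpu.cpp changes above replace the old two-branch lstm_npu with small helpers: lstm_single_layer_direc_npu, lstm_single_layer_bidirec_npu and lstm_double_layer_direc_npu for padded input, plus the *_packseq variants and get_mask for packed input, all of which end in a call to the widened at::npu_lstm that now also takes a sequence mask ("seq_length"), a flagSeq flag and a direction flag. The sketch below only restates the top-level dispatch so the collapsed hunk is easier to follow; it is a paraphrase rather than the patched source, and the std::tuple<Tensor, Tensor, Tensor> return type is an assumption, since the extracted diff text lost the template arguments.

// A minimal sketch, not the patched file: the helper names come from the
// hunk above; signatures are abbreviated and the tuple return type is assumed.
#include <ATen/ATen.h>
#include <tuple>

using at::Tensor;
using at::TensorList;
using LstmResult = std::tuple<Tensor, Tensor, Tensor>;  // assumed (y, h, c)

// Helpers introduced by the patch (declarations abbreviated here).
LstmResult lstm_single_layer_direc_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool, bool direction);
LstmResult lstm_single_layer_bidirec_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool);
LstmResult lstm_double_layer_direc_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool);

LstmResult lstm_npu_sketch(
    const Tensor& _input, TensorList hx, TensorList params,
    bool hasBiases, int64_t numLayers, double dropout,
    bool train, bool bidirectional, bool batchFirst) {
  // DynamicRNN only accepts the time axis first, so batch-first input is
  // transposed before any kernel call.
  Tensor input = batchFirst ? _input.transpose(0, 1) : _input;

  Tensor y, h, c;
  if (numLayers == 1 && !bidirectional) {
    // single npu_lstm call, forward direction
    std::tie(y, h, c) = lstm_single_layer_direc_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst, /*direction=*/false);
  } else if (numLayers == 1 && bidirectional) {
    // forward plus reverse npu_lstm calls; outputs are concatenated on the
    // hidden dimension, h/c on the direction dimension
    std::tie(y, h, c) = lstm_single_layer_bidirec_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst);
  } else if (numLayers == 2 && !bidirectional) {
    // the first layer's output feeds a second npu_lstm call for layer two
    std::tie(y, h, c) = lstm_double_layer_direc_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst);
  }
  // Other layer/direction combinations are not routed through this path
  // by the patch.
  return std::make_tuple(y, h, c);
}

The packed-sequence entry point follows the same split, with get_mask turning batch_sizes into the per-step mask that npu_lstm consumes as its "seq_length" input.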
- -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -SmallVector mode_npu_output_size( - const Tensor& self, - int64_t dim, - bool keepdim) { - SmallVector outputSize; - if(dim==0){ - outputSize={self.size(1)}; - }; - if(dim==-1 || dim==1){ - outputSize={self.size(0)}; - }; - return outputSize; -} - -SmallVector mode_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector mode_npu_output( - const SmallVector& outputTensor) { - auto outputs = CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); - - string indicesRealType = "int64"; - outputs[outputs.size() - 1].realDataType = indicesRealType; - return outputs; -} - -SmallVector mode_npu_attr( - int64_t dim, bool keepdim) { - NPUAttrDesc npuAttrDim = NPUAttrDesc("dim", dim); - NPUAttrDesc npuAttrKeepdim = NPUAttrDesc("keepdim", keepdim); - - SmallVector attrs = {npuAttrDim, - npuAttrKeepdim}; - return attrs; -} - - -tuple mode_out_npu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t dim, - bool keepdim) { - // constructs the input and output NPUTensorDesc - auto inputs = mode_npu_input({self}); - auto outputs = mode_npu_output({values, indices}); - - // constructs the attr of the NPUAttrDesc - auto attrs = mode_npu_attr(dim,keepdim); - - // executing the NPU operator - CalcuOpUtil::execute_npu_operate( - "Mode", inputs, outputs, attrs); - - return tuple(values, indices); -} - -tuple _mode_out_npu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t dim, - bool keepdim) { - - return mode_out_npu(values,indices,self, dim, keepdim); -} - -tuple mode_npu( - const Tensor& self, - int64_t dim, - bool keepdim -) { - // calculate the output size - auto outputSize = mode_npu_output_size(self, dim,keepdim); - - // construct the output tensor of the NPU - Tensor values= at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - Tensor indices = at::empty_with_format( - outputSize, self.options().dtype(at::kLong), CalcuOpUtil::get_tensor_npu_format(self)); - - - // calculate the output result of the NPU - mode_out_npu( - values, indices, self, dim,keepdim); - return tuple(values, indices); - -} - -tuple _mode_npu( - const Tensor& self, - int64_t dim, - bool keepdim -) { - return mode_npu(self,dim, keepdim); - } - -} // namespace native -} // namespace at \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp index ae5b445928d4be0289ec7b77c0387ec1a1ccc62e..5d0a319d9e6cecb1914424eb873c5919059b42e8 100644 --- a/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp @@ -37,7 +37,7 @@ static inline void normalize_batch_check( "self num ", self.size(0)); TORCH_CHECK( - 1 >= normalize_type >= 0, + normalize_type >= 0 && normalize_type <= 1, "normalize_type expected to be in range [0, 1], but got ", normalize_type); } diff --git a/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp deleted file mode 100644 index f85580541670bc9b139dc2f5cb7308af4e85dc90..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// 
Copyright (c) 2020, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -SmallVector quantized_max_pool2d_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector quantized_max_pool2d_npu_output( - const SmallVector& outputTensor) { - auto outputs = CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); - - return outputs; -} - -SmallVector quantized_max_pool2d_npu_attr( - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - int64_t strideH = 1; - int64_t strideW = 1; - if (stride.empty()) { - strideH = kernel_size[0]; - strideW = kernel_size[1]; - } else { - strideH = stride[0]; - strideW = stride[1]; - } - - SmallVector kernelSize_t = {kernel_size[0], kernel_size[1]}; - SmallVector strides_t = {strideH, strideW}; - SmallVector paddings_t = {padding[0], padding[0], padding[1], padding[1]}; - SmallVector dilations_t = {dilation[0], dilation[0], dilation[1], dilation[1]}; - - IntArrayRef kernelSize = IntArrayRef(kernelSize_t); - IntArrayRef strides = IntArrayRef(strides_t); - IntArrayRef paddings = IntArrayRef(paddings_t); - IntArrayRef dilations = IntArrayRef(dilations_t); - NPUAttrDesc npuAttrKsize = NPUAttrDesc("window", kernelSize); - NPUAttrDesc npuAttrStrides = NPUAttrDesc("stride", strides); - NPUAttrDesc npuAttrMode = NPUAttrDesc("mode", (int64_t) 0); - NPUAttrDesc npuAttrPadding = NPUAttrDesc("pad", paddings); - NPUAttrDesc npuAttrDilation = NPUAttrDesc("dilation", dilations); - NPUAttrDesc npuAttrGlobalPooling = NPUAttrDesc("global_pooling", false); - NPUAttrDesc npuAttrCeilmode = NPUAttrDesc("ceil_mode", (int64_t) !ceil_mode); - - SmallVector attrs = {npuAttrKsize, - npuAttrStrides, - npuAttrMode, - npuAttrPadding, - npuAttrDilation, - npuAttrGlobalPooling, - npuAttrCeilmode}; - - return attrs; -} - -Tensor& quantized_max_pool2d_out_npu( - Tensor& output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - // constructs the input and output NPUTensorDesc - auto inputs = quantized_max_pool2d_npu_input({self}); - auto outputs = quantized_max_pool2d_npu_output({output}); - - // constructs the attr of the NPUAttrDesc - auto attrs = quantized_max_pool2d_npu_attr( - kernel_size, stride, padding, dilation, ceil_mode); - - // executing the NPU operator - CalcuOpUtil::execute_npu_operate( - "Pooling", inputs, outputs, attrs); - - return output; -} - -Tensor quantized_max_pool2d_npu( - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - // calculate the output size - auto outputSizes = quantized_max_pool2d_npu_output_size( 
- self, kernel_size, stride, padding, dilation, ceil_mode); - - // construct the output tensor of the NPU - Tensor output = at::empty_with_format( - outputSizes, self.options(), ACL_FORMAT_NC1HWC0); - - // calculate the output result of the NPU - quantized_max_pool2d_out_npu( - output, self, kernel_size, stride, padding, dilation, ceil_mode); - return output; -} - -} // namespace native -} // namespace at diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp index c919f9cd027da080114d99707dc58187f159e3a9..d299fde881a3b0778ed46ca6777ea039a7de41fd 100644 --- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp @@ -29,43 +29,31 @@ tuple std_mean_out_npu_nocheck( bool unbiased, bool keepdim) { // executing the NPU operator - if (!c10::npu::OptionsManager::CheckDynamicEnable()) { - OpCommand cmd; - cmd.Name("ReduceStd") - .Input(self) - .Output(resultStd) - .Output(resultMean) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); - } else { - OpCommand cmd1; - cmd1.Name("ReduceMeanD") - .Input(self) - .Output(resultMean) - .Attr("axes", dim) - .Attr("keep_dims", keepdim) - .Run(); - Tensor resultMeanCopy = resultMean; - if (resultMean.dim() != 0 && keepdim == false) { - auto dimVector = array_to_small_vector(dim); - std::sort(dimVector.begin(), dimVector.end()); - for (int64_t i = 0; i < dimVector.size(); i++) { - resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]); - } + OpCommand cmd1; + cmd1.Name("ReduceMeanD") + .Input(self) + .Output(resultMean) + .Attr("axes", dim) + .Attr("keep_dims", keepdim) + .Run(); + Tensor resultMeanCopy = resultMean; + if (resultMean.dim() != 0 && keepdim == false) { + auto dimVector = array_to_small_vector(dim); + std::sort(dimVector.begin(), dimVector.end()); + for (int64_t i = 0; i < dimVector.size(); i++) { + resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]); } - resultMeanCopy = resultMeanCopy.expand(self.sizes()); - OpCommand cmd2; - cmd2.Name("ReduceStdWithMean") - .Input(self) - .Input(resultMeanCopy) - .Output(resultStd) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); } + resultMeanCopy = resultMeanCopy.expand(self.sizes()); + OpCommand cmd2; + cmd2.Name("ReduceStdWithMean") + .Input(self) + .Input(resultMeanCopy) + .Output(resultStd) + .Attr("dim", dim) + .Attr("unbiased", unbiased) + .Attr("keepdim", keepdim) + .Run(); return std::tie(resultStd, resultMean); } diff --git a/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp index ff95b80072a315ec489e165e9f09c83a878437be..4f1944c839e8ee0bc90d8a9a221a190600d2938c 100644 --- a/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp @@ -47,7 +47,7 @@ static inline void yolo_boxes_encode_check( "gt_bboxes num ", gt_bboxes.size(0)); TORCH_CHECK( - at::isIntegralType(stride.scalar_type()) && stride.scalar_type() != ScalarType::Long, + at::isIntegralType(stride.scalar_type(), true) && stride.scalar_type() != ScalarType::Long, "int32 strdie tensor expected but got a tensor with dtype: ", stride.scalar_type()); } diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp index 73dcdd8ec5cccb9633c455dee0a320dd05ea70cc..856903449f49cc5039d27cb643998ad2182845a4 100644 --- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp +++ 
b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp @@ -16,11 +16,13 @@ #include "OpParamMaker.h" #include #include "c10/npu/NPUQueue.h" +#include "c10/npu/NPUCachingAllocator.h" #include #include "ATen/native/npu/aoe/AutoTune.h" #include "ATen/native/npu/utils/DynamicShapeUtil.h" #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h" #include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/interface/EnvVariables.h" namespace at { @@ -161,19 +163,24 @@ aclError OpCommandImpl::InnerRun(string name, AclExecParam& params) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; } - auto ret = aclopCompileAndExecute( - name.c_str(), - inputSize, - params.inDesc.data(), - params.inBuffer.data(), - outputSize, - params.outDesc.data(), - params.outBuffer.data(), - params.attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream); + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( + name.c_str(), + inputSize, + params.inDesc.data(), + params.inBuffer.data(), + outputSize, + params.outDesc.data(), + params.outBuffer.data(), + params.attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); } @@ -194,7 +201,9 @@ int ExecFunc(void* in, aclrtStream stream) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; } - ret = aclopCompileAndExecute( + int index = 0; + do { + ret = aclopCompileAndExecute( (cur_paras->opType).c_str(), cur_paras->paras.input_num, cur_paras->paras.input_desc, @@ -207,6 +216,8 @@ int ExecFunc(void* in, aclrtStream stream) { ACL_COMPILE_SYS, nullptr, stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); } diff --git a/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp index 46084f0740dceb4b9058a045cc90bd67e076c264..8cb984be1193c1d52c55bf9737b38f5eeda55812 100644 --- a/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp @@ -211,8 +211,8 @@ tuple batch_norm_backward_npu( // construct the output tensor of the NPU Tensor grad_input = OpPreparation::ApplyTensor(self_4d.sizes(), self_4d.options(), self_4d); - Tensor grad_weight = OpPreparation::ApplyTensor(weight_tensor.sizes(), weight_tensor.options(), weight_tensor); - Tensor grad_bias = OpPreparation::ApplyTensor(weight_tensor.sizes(), weight_tensor.options(), weight_tensor); + Tensor grad_weight = OpPreparation::ApplyTensor(weight_tensor, weight_tensor.options().dtype(ScalarType::Float)); + Tensor grad_bias = OpPreparation::ApplyTensor(weight_tensor, weight_tensor.options().dtype(ScalarType::Float)); // calculate the output result of the NPU batch_norm_backward_impl( diff --git a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp index aae01114039912bbbeb7f35270fc8362492c9264..0862fa5d7bda43a7dce5e3ac9bb115c58a29dc4b 100644 --- a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp @@ -20,8 +20,8 @@ namespace at { namespace native { using namespace 
at::native::npu; -Tensor& avg_pool2d_out_npu( - Tensor& out, +Tensor& avg_pool2d_out_npu_nocheck( + Tensor& result, const Tensor& self, IntArrayRef kernel_size, IntArrayRef stride, @@ -29,29 +29,74 @@ Tensor& avg_pool2d_out_npu( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - string padding_str = ceil_mode ? "SAME" : "VALID"; + if (padding.size() == 1) { + SmallVector paddings = {padding[0], padding[0]}; + padding = IntArrayRef(paddings); + } + // required attr int64_t strideH = 1; int64_t strideW = 1; - if (!stride.empty()) { strideH = stride[0]; strideW = stride[1]; } - SmallVector kernelSize = {1, 1, kernel_size[0], kernel_size[1]}; SmallVector stridesSize = {1, 1, strideH, strideW}; + SmallVector pads = {padding[0], padding[0], padding[1], padding[1]}; + OpCommand cmd; - cmd.Name("AvgPool") - .Input(self) - .Output(out) - .Attr("ksize", kernelSize) - .Attr("strides", stridesSize) - .Attr("padding", padding_str) - .Attr("data_format", (string)"NCHW") - .Run(); - - return out; + cmd.Name("AvgPoolV2") + .Input(self) + .Output(result) + .Attr("ksize", kernelSize) + .Attr("strides", stridesSize) + .Attr("padding_mode", (string)"CALCULATED") + .Attr("pads", pads) + .Attr("data_format", (string)"NCHW") + .Attr("global_pooling", false) + .Attr("ceil_mode", ceil_mode) + .Attr("exclusive", true) + .Run(); + + return result; +} + +Tensor& avg_pool2d_out_npu( + Tensor& result, + const Tensor& self, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override) { + auto outputSize = avg_pool2d_npu_output_size( + self, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override); + + OpPreparation::CheckOut( + {self}, + result, + self, + outputSize); + + avg_pool2d_out_npu_nocheck( + result, + self, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override); + + return result; } Tensor avg_pool2d_npu( diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp index c5e2eba6ecc3bfdcad270781a578eb2eb01461e2..412d1fc32b7bca4bb8f5d7bcac31eee8458a5bc8 100644 --- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp @@ -638,38 +638,46 @@ void CalcuOpUtil::execute_npu_operate( NPU_LOGD("Op %s aclopCompileAndExecute Run.", opName.c_str()); if (PyGILState_Check()) { Py_BEGIN_ALLOW_THREADS - ACL_REQUIRE_OK_OP( - aclopCompileAndExecute( - opName.c_str(), - params.input_num, - params.input_desc, - params.input_data_buf, - params.output_num, - params.output_desc, - params.output_data_buf, - attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream), - opName.c_str()); + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( + opName.c_str(), + params.input_num, + params.input_desc, + params.input_data_buf, + params.output_num, + params.output_desc, + params.output_data_buf, + attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + ACL_REQUIRE_OK_OP(ret, opName.c_str()); Py_END_ALLOW_THREADS } else { - ACL_REQUIRE_OK_OP( - aclopCompileAndExecute( - opName.c_str(), - params.input_num, - params.input_desc, - params.input_data_buf, - params.output_num, - params.output_desc, - params.output_data_buf, - attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream), - opName.c_str()); + aclError ret; + int index = 0; + 
do { + ret = aclopCompileAndExecute( + opName.c_str(), + params.input_num, + params.input_desc, + params.input_data_buf, + params.output_num, + params.output_desc, + params.output_data_buf, + attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + ACL_REQUIRE_OK_OP(ret, opName.c_str()); } if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); diff --git a/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp b/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp index f4b42b2c2eb01a00efc247db0c1955f9e9e7609b..c972055dce6aa289c6a79813cd8823465fe56cfb 100644 --- a/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp @@ -17,6 +17,7 @@ #include "DynamicShapeUtil.h" #include #include +#include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/dynamicstrategy/Strategy.h" #include "ATen/native/npu/frame/OpDynamicCmdHelper.h" #include "ATen/native/npu/frame/OpDynamicParamMaker.h" @@ -331,7 +332,10 @@ aclError DynamicShapeUtil::ExecuteDynamic( ExecuteParas& cur_paras, aclrtStream stream) { auto params = OpDynamicCmdHelper::CreateDynamicRunParams(cur_paras); - return aclopExecuteV2( + aclError ret; + int index = 0; + do { + ret = aclopExecuteV2( std::get<0>(params).c_str(), std::get<1>(params), const_cast(std::get<2>(params)), @@ -341,6 +345,10 @@ aclError DynamicShapeUtil::ExecuteDynamic( std::get<6>(params), const_cast(std::get<7>(params)), stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + + return ret; } void DynamicShapeUtil::staticCompileAndExecute( @@ -349,9 +357,11 @@ void DynamicShapeUtil::staticCompileAndExecute( aclrtStream stream) { std::string opName = cur_paras.opType; NPU_LOGD(" Op %s aclopCompileAndExecute Run.", opName.c_str()); - aclError ret; logUtil.SetStartTime(); - ret = aclopCompileAndExecute( + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( opName.c_str(), cur_paras.paras.input_num, cur_paras.paras.input_desc, @@ -364,7 +374,8 @@ void DynamicShapeUtil::staticCompileAndExecute( ACL_COMPILE_SYS, NULL, stream); - + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); logUtil.PrintLog(steps_, key, "Static Compile And Execute Failed"); diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp index 150711feecf70d648b02167e36b80eb5f29d18a1..773f25ab306403e43169c5a489625a18d9aea2a5 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp @@ -603,7 +603,15 @@ SmallVector nonzero_npu_output_size(const Tensor& self){ int64_t dim = self.dim(); Tensor boolSelf = self.npu_dtype_cast(ScalarType::Bool); Tensor intSelf = boolSelf.npu_dtype_cast(ScalarType::Int); - Tensor coutNonzeroSelf = at::sum(intSelf, ScalarType::Int); + + Tensor coutNonzeroSelf = intSelf; + if (self.numel() > 10000000) { + //Ensure outputsize correctly in large shape case + coutNonzeroSelf = at::sum(intSelf, ScalarType::Long); + } else { + coutNonzeroSelf = at::sum(intSelf, ScalarType::Int); + } + int64_t nonzeroNum = coutNonzeroSelf.item().toInt(); SmallVector outputSize = {nonzeroNum, dim}; return outputSize; diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp 
b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp index daa10e3ab1c6e331dc08102e8d7894a3a2ba8f16..368b1478c02da837f59b247dd7cdc7373effc36a 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp @@ -16,7 +16,6 @@ #include #include "NpuUtils.h" -#include "c10/npu/NPUCachingAllocator.h" #include "c10/npu/register/OptionRegister.h" #include "CalcuOpUtil.h" @@ -312,7 +311,20 @@ Tensor NpuUtils::format_contiguous_add_copy_optimize(const Tensor& src) { return src; } - +bool NpuUtils::IsOomError(aclError ret, int index) +{ + if (ret == ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) { + int deviceId = 0; + // free devcie cached memory when return value of the first op execution is oom + if (index == 1) { + C10_NPU_CHECK(aclrtGetDevice(&deviceId)); + c10::npu::NPUCachingAllocator::FreeDeviceCachedMemory(deviceId); + return true; + } + AT_ERROR("NPU out of memory. device id: ", deviceId); + } + return false; +} } // namespace npu } // namespace native } // namespace at diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.h b/src/aten/src/ATen/native/npu/utils/NpuUtils.h index d9797e289977defac21ded7f2ed0793debf6ec5c..34849d55cad36c5333a20af8bc900313b8c8c2a4 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.h +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.h @@ -18,10 +18,12 @@ #define __NATIVE_NPU_UTILS_NUP_UTILS__ #include +#include "c10/npu/NPUCachingAllocator.h" #include #include #include #include +#include #include #include #include "ATen/ATen.h" @@ -41,6 +43,7 @@ const int SHAPE_SIZE = 8; // HALF_MAX and HALF_MIN of NPU support const int NPU_HALF_MAX = 65504; const int NPU_HALF_MIN = -65504; +const int NPU_MAX_OP_EXEC_TRY_NUM = 2; typedef enum MemoryType{ MEMORY_DEVICE, @@ -59,6 +62,7 @@ class NpuUtils { const Tensor& y); static bool check_5d_5d_match(const Tensor& tensor); + static bool IsOomError(aclError ret, int index); }; } // namespace npu } // namespace native diff --git a/src/c10/npu/NPUCachingAllocator.cpp b/src/c10/npu/NPUCachingAllocator.cpp index f179b6b23d50fbe7d6e31cc2c8109259305d78e9..cbda658019122640baf163d0297ae41e2f6819a6 100644 --- a/src/c10/npu/NPUCachingAllocator.cpp +++ b/src/c10/npu/NPUCachingAllocator.cpp @@ -441,6 +441,7 @@ struct THNCachingAllocator { void emptyCache() { std::lock_guard lock(mutex); synchronize_and_free_events(nullopt); + c10::npu::npuSynchronizeDevice(); free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); } @@ -774,6 +775,7 @@ struct THNCachingAllocator { Block lower_bound(device, nullptr, 0); Block upper_bound(device + 1, nullptr, 0); + c10::npu::npuSynchronizeDevice(); free_blocks( large_blocks, large_blocks.lower_bound(&lower_bound), @@ -1195,6 +1197,11 @@ void raw_delete(void* ptr) { caching_allocator.free(ptr); } +void FreeDeviceCachedMemory(int device) +{ + caching_allocator.free_cached_blocks(device); + +} } // namespace NPUCachingAllocator } // namespace npu diff --git a/src/c10/npu/NPUCachingAllocator.h b/src/c10/npu/NPUCachingAllocator.h index 4c38309b4be18e723fda8353112af5d47510d2d8..5388f7bb5ecf20f7c81c54f87440f8b18eb107f8 100644 --- a/src/c10/npu/NPUCachingAllocator.h +++ b/src/c10/npu/NPUCachingAllocator.h @@ -143,6 +143,7 @@ C10_NPU_API std::mutex* getFreeMutex(); C10_NPU_API std::shared_ptr getIpcDevPtr(std::string handle); +C10_NPU_API void FreeDeviceCachedMemory(int device); } // namespace NPUCachingAllocator } // namespace npu diff --git 
a/src/third_party/acl/inc/ge/ge_error_codes.h b/src/third_party/acl/inc/ge/ge_error_codes.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0f8644463c9a7cc7b760ef184b5d37869e9d605
--- /dev/null
+++ b/src/third_party/acl/inc/ge/ge_error_codes.h
@@ -0,0 +1,76 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_EXTERNAL_GE_GE_ERROR_CODES_H_
+#define INC_EXTERNAL_GE_GE_ERROR_CODES_H_
+
+#if defined(_MSC_VER)
+#ifdef FUNC_VISIBILITY
+#define GE_FUNC_VISIBILITY _declspec(dllexport)
+#else
+#define GE_FUNC_VISIBILITY
+#endif
+#else
+#ifdef FUNC_VISIBILITY
+#define GE_FUNC_VISIBILITY __attribute__((visibility("default")))
+#else
+#define GE_FUNC_VISIBILITY
+#endif
+#endif
+
+#include <stdint.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000;
+static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013;
+static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014;
+static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015;
+static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016;
+static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017;
+static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018;
+static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019;
+static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020;
+static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021;
+static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022;
+static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000;
+static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001;
+static const uint32_t ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED = 245002;
+static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000;
+static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005;
+static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006;
+static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007;
+static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008;
+static const uint32_t
ACL_ERROR_GE_UNLOAD_MODEL = 545009; + +#ifdef __cplusplus +} // namespace ge +#endif +#endif // INC_EXTERNAL_GE_GE_ERROR_CODES_H_ diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml index 046aad5032c2ef0e38c53ab8b859e2703fd5cf9d..1db83b1c5a6a2870f5721b3d2483ec24b45e2ab3 100644 --- a/src/tools/autograd/derivatives.yaml +++ b/src/tools/autograd/derivatives.yaml @@ -1644,7 +1644,7 @@ - name: nonzero(Tensor self) -> Tensor output_differentiability: [False] -- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) output_differentiability: [True, True, True, False, False, False, False, False] input, weight, bias, h, c: npu_lstm_backward(grads[0], grads[1], grads[2], input, weight, bias, h, c, result0, result1, result2, result3, result4, result5, result6, result7) @@ -1687,4 +1687,8 @@ input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated) - name: npu_mish(Tensor self) -> Tensor - self: npu_mish_backward(grad, self) \ No newline at end of file + self: npu_mish_backward(grad, self) + +- name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + input, weight: npu_linear_backward(grad, input, weight) + bias: maybe_multiply(grad, 1) \ No newline at end of file diff --git a/test/test_npu/test_avg_pool2d_backward.py b/test/test_npu/test_avg_pool2d_backward.py index 28beefd1f72eba3b0535bf268c5dd90074d724c8..62cf003d92d98adfb3c0a4e32e357b59ff34138e 100644 --- a/test/test_npu/test_avg_pool2d_backward.py +++ b/test/test_npu/test_avg_pool2d_backward.py @@ -56,7 +56,7 @@ class TestAvgPool2dBackward(TestCase): [np.float32, 0, (64, 10, 16, 14)], [np.float32, 3, (256, 2048, 8, 8)], [np.float32, 4, (32, 1, 2, 2)], - [np.float32, 29, (10, 128, 16, 16)] + [np.float32, 0, (10, 128, 16, 16)] ] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 1, 100) diff --git a/test/test_npu/test_histc.py b/test/test_npu/test_histc.py deleted file mode 100644 index 7a80dc5b7af52e3d1af051ffcac01f0e824602a2..0000000000000000000000000000000000000000 --- a/test/test_npu/test_histc.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
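The OpParamMaker.cpp, CalcuOpUtil.cpp and DynamicShapeUtil.cpp hunks further up all wrap their ACL execute call in the same loop: run once, and if the call returns ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED (245002 in the new ge_error_codes.h header above), NpuUtils::IsOomError releases the device's cached blocks through NPUCachingAllocator::FreeDeviceCachedMemory and the call is retried once, NPU_MAX_OP_EXEC_TRY_NUM being 2; an OOM on the retry is reported as an error. The standalone sketch below restates that pattern with stand-in types; run_with_oom_retry and is_oom_error are illustrative names, not functions from the patch.

// Hedged, self-contained sketch of the OOM-retry loop the patch inlines at
// each aclopCompileAndExecute / aclopExecuteV2 call site.
#include <functional>
#include <stdexcept>

using aclError = int;  // stand-in for the real ACL status type
constexpr aclError ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED = 245002;
constexpr int NPU_MAX_OP_EXEC_TRY_NUM = 2;  // value added to NpuUtils.h

// Simplified stand-in for NpuUtils::IsOomError: on the first OOM, release
// the caching allocator's cached blocks and ask the caller to retry; an OOM
// on the retry is fatal.
bool is_oom_error(aclError ret, int index,
                  const std::function<void()>& free_cached) {
  if (ret != ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) {
    return false;
  }
  if (index == 1) {
    free_cached();  // patch calls NPUCachingAllocator::FreeDeviceCachedMemory
    return true;
  }
  throw std::runtime_error("NPU out of memory");
}

// Shape of the loop added around each execute call.
aclError run_with_oom_retry(const std::function<aclError()>& exec,
                            const std::function<void()>& free_cached) {
  aclError ret;
  int index = 0;
  do {
    ret = exec();  // aclopCompileAndExecute / aclopExecuteV2 in the patch
    ++index;
  } while (is_oom_error(ret, index, free_cached) &&
           (index < NPU_MAX_OP_EXEC_TRY_NUM));
  return ret;
}

With NPU_MAX_OP_EXEC_TRY_NUM set to 2, the only recovery attempted is that single cache release; if the allocation still fails on the retry, the failure is surfaced as a real out-of-memory condition.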
- -import torch -import numpy as np -import sys -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestHistc(TestCase): - def generate_single_data(self, min_d, max_d, shape, dtype): - input1 = np.random.uniform(min_d, max_d, shape).astype(dtype) - npu_input = torch.from_numpy(input1) - - return npu_input - - def cpu_op_exec(self, input1, bins=100, min=0, max=0): - output = torch.histc(input1, bins=bins, min=min, max=max) - output = output.numpy() - return output - - def npu_op_exec(self, input1, bins=100, min=0, max=0): - input1 = input1.to("npu") - output = torch.histc(input1, bins=bins, min=min, max=max) - output = output.to("cpu") - output = output.numpy() - return output - - def test_histc_int32_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), 
np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - -instantiate_device_type_tests(TestHistc, globals(), except_for='cpu') -if __name__ == '__main__': - torch.npu.set_device("npu:1") - run_tests() diff --git a/test/test_npu/test_mode.py b/test/test_npu/test_mode.py deleted file mode 100644 index be599e22abe599d2ab1a9338cfb7f8cebe0b81ea..0000000000000000000000000000000000000000 --- a/test/test_npu/test_mode.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import numpy as np -import torch -import torch.nn as nn -from common_device_type import dtypes, instantiate_device_type_tests -from common_utils import TestCase, run_tests -from util_test import create_common_tensor - -#pylint: disable=import-error -#pylint: disable=too-many-lines -#pylint: disable=too-many-arguments -#pylint: disable=unused-variable -#pylint: disable=unused-argument - -class TestMode(TestCase): - - def generate_data_1(self, dtype): - input = np.array([[10, 11, 12, 11, 10, 10, 10, 11], - [11, 11, 11, 10, 11, 10, 10, 11], - [12, 10, 10, 12, 10, 11, 10, 13], - [12, 10, 11, 12, 11, 11, 10, 13], - [14, 11, 11, 12, 10, 11, 10, 13]]).astype(np.float32) - - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - def generate_data_2(self, dtype): - input = np.array([[10, 11, 12, 11, 10], - [11, 10, 11, 10, 11], - [12, 10, 10, 12, 10], - [12, 10, 11, 13, 11], - [14, 11, 11, 12, 10]]).astype(np.float32) - - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - - def generate_data_3(self, dtype): - input = np.zeros((36,25)).astype(np.float32) - input[:,2]=1 - input[:,12]=2 - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - def generate_data_4(self, dtype): - input = np.zeros((12,12)).astype(np.float32) - # modify from numpy.ndarray to torch.tensor - input[:,2]=1 - input[:,10]=2 - npu_input = torch.from_numpy(input) - return npu_input - - def cpu_op_exec_0(self, input): - output1, output2 = torch.mode(input, 0, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def npu_op_exec_0(self, input): - input = input.to("npu") - output1, output2 = torch.mode(input, 0, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def cpu_op_exec_1(self, input): - output1, output2 = torch.mode(input, 1, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def npu_op_exec_1(self, input): - input = input.to("npu") - output1, output2 = torch.mode(input, 1, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _cpu_op_exec_0(self, input): - output1, output2 = torch._mode(input, 0, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _npu_op_exec_0(self, input): - input = input.to("npu") - output1, output2 = torch._mode(input, 0, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _cpu_op_exec_1(self, input): - output1, output2 = torch._mode(input, 1, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _npu_op_exec_1(self, input): - input = input.to("npu") - output1, output2 = torch._mode(input, 1, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def test_add_float32_0(self, device): - npu_input = self.generate_data_1(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, 
npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_1(self, device): - npu_input = self.generate_data_2(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_2(self, device): - npu_input = self.generate_data_3(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_3(self, device): - npu_input = self.generate_data_4(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_4(self, device): - npu_input = self.generate_data_2(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_5(self, device): - npu_input = self.generate_data_1(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_6(self, device): - npu_input = self.generate_data_3(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_7(self, device): - npu_input = self.generate_data_4(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - -instantiate_device_type_tests(TestMode, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:2") - run_tests() \ No newline at end of file diff --git a/test/test_npu/test_adaptive_avg_pool1d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py similarity index 88% rename from test/test_npu/test_adaptive_avg_pool1d.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py index 44f92176967bb2cf65207fe21085a76d7b12b592..662cae2af3231941d335f5aa27f24af512b06a66 100644 --- a/test/test_npu/test_adaptive_avg_pool1d.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py @@ -33,21 +33,21 @@ class TestAdaptiveAvgPool1d(TestCase): def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): shape_format = [ [np.float16, 0, (64, 10, 16)], - [np.float16, 1, (256, 2048, 8)], + [np.float16, -1, (256, 2048, 8)], [np.float16, 3, (32, 16, 16)] ] - output_list = [(4), (3), (1)] + output_list = [(4), (3)] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 1, 10) for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) def 
test_AdaptiveAvgPool1d_shape_format_fp32(self, device): shape_format = [ [np.float32, 0, (64, 10, 16)], - [np.float32, 1, (256, 2048, 8)], + [np.float32, -1, (256, 2048, 8)], [np.float32, 3, (32, 16, 16)] ] output_list = [(4), (3), (1)] @@ -56,11 +56,8 @@ class TestAdaptiveAvgPool1d(TestCase): for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, 0.001) instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() - - diff --git a/test/test_npu/test_adaptive_avg_pool3d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py similarity index 85% rename from test/test_npu/test_adaptive_avg_pool3d.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py index 937e3eac27ed984374ee53543791c616d3a86564..859cccf3cbb1470177b87472a518fe4d0c06f870 100644 --- a/test/test_npu/test_adaptive_avg_pool3d.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py @@ -32,10 +32,10 @@ class TestAdaptiveAvgPool3d(TestCase): def test_AdaptiveAvgPool3d_shape_format_fp16(self, device): shape_format = [ - [np.float16, 0, (64, 10, 16, 32)], - [np.float16, 0, (4, 16, 8, 4, 2)], - [np.float16, 29, (2, 16, 4, 32)], - [np.float16, 29, (4, 16, 8, 4, 16)] + [np.float16, -1, (64, 10, 16, 32)], + [np.float16, -1, (4, 16, 8, 4, 2)], + [np.float16, -1, (2, 16, 4, 32)], + [np.float16, -1, (4, 16, 8, 4, 16)] ] # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)] output_list = [(1, 1, 1)] @@ -50,10 +50,10 @@ class TestAdaptiveAvgPool3d(TestCase): def test_AdaptiveAvgPool3d_shape_format_fp32(self, device): shape_format = [ - [np.float32, 0, (64, 10, 16, 32)], - [np.float32, 0, (4, 2, 2, 4, 316)], - [np.float32, 29, (2, 16, 4, 32)], - [np.float32, 29, (4, 16, 8, 4, 16)] + [np.float32, -1, (64, 10, 16, 32)], + [np.float32, -1, (4, 2, 2, 4, 316)], + [np.float32, -1, (2, 16, 4, 32)], + [np.float32, -1, (4, 16, 8, 4, 16)] ] # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)] output_list = [(1, 1, 1)] @@ -67,5 +67,3 @@ class TestAdaptiveAvgPool3d(TestCase): instantiate_device_type_tests(TestAdaptiveAvgPool3d, globals(), except_for="cpu") if __name__ == "__main__": run_tests() - - diff --git a/test/test_npu/test_adaptive_avg_pool3d_backward.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py similarity index 98% rename from test/test_npu/test_adaptive_avg_pool3d_backward.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py index 2948fa9549e0788a29530198e0e945079d91f51d..c3dc9a48430dbc337faa1ac4895b7563883584e2 100644 --- a/test/test_npu/test_adaptive_avg_pool3d_backward.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py @@ -42,7 +42,7 @@ class TestAdaptiveAvgPool3dBackward(TestCase): def test_adaptiveAvgPool3d_backward(self, device): dtype_list = [np.float16, np.float32] - format_list = [0, 29] + format_list = [-1] shape_list = [ [2, 3, 7, 7], [1, 2, 3, 6, 6], diff --git a/test/test_npu/test_adaptive_max_pool2d.py b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py similarity index 91% rename from test/test_npu/test_adaptive_max_pool2d.py rename to test/test_npu/test_network_ops/test_adaptive_max_pool2d.py index b807a569a7987408728258d3912dc96d478a896a..877f50c11c26fb787a491cae9fcfc7b2957db0a9 100644 --- 
a/test/test_npu/test_adaptive_max_pool2d.py +++ b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py @@ -32,9 +32,9 @@ class TestAdaptiveMaxPool2d(TestCase): return output.cpu().numpy() def test_adaptiveMaxPool2d_shape_format_fp32_6(self, device): - format_list = [0, 3] - shape_list = [(1, 5, 9, 9), - (1, 8, 9)] + format_list = [-1] + # (1, 8, 9) IndexError + shape_list = [(1, 5, 9, 9)] shape_format = [ [np.float32, i, j] for i in format_list for j in shape_list ] @@ -44,7 +44,8 @@ class TestAdaptiveMaxPool2d(TestCase): for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + + self.assertRtolEqual(cpu_output, npu_output, 0.0004) instantiate_device_type_tests(TestAdaptiveMaxPool2d, globals(), except_for="cpu") diff --git a/test/test_npu/test_quantized_max_pool2d.py b/test/test_npu/test_network_ops/test_avg_pool2d.py similarity index 32% rename from test/test_npu/test_quantized_max_pool2d.py rename to test/test_npu/test_network_ops/test_avg_pool2d.py index 4f4be2ee747ec92abc063b6475118a0e205a1175..6042069f8de3840e11a894f355da879a2319098a 100644 --- a/test/test_npu/test_quantized_max_pool2d.py +++ b/test/test_npu/test_network_ops/test_avg_pool2d.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import torch import torch.nn as nn import numpy as np @@ -20,47 +19,45 @@ from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor -class TestQuantizedMaxPool2d(TestCase): - def cpu_op_exec(self, input, kernel_size, stride, padding, dilation, ceil_mode): - output = nn.quantized.functional.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode) - return output.numpy() +class TestAvgPool2d(TestCase): + def cpu_op_exec(self, input, ceil_mode): + m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode) + output = m(input) + output = output.detach().numpy() + return output - def npu_op_exec(self, input, ksize, stride, padding, dilation, ceil_mode): - output = nn.quantized.functional.max_pool2d(input, ksize, stride, padding, dilation, ceil_mode) - return output.cpu().numpy() + def npu_op_exec(self, input, ceil_mode): + m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode).npu() + output = m(input) + output = output.to("cpu") + output = output.detach().numpy() + return output - def test_quantized_max_pool2d_shape_format_fp16(self, device): - format_list = [0] - shape_list = [(32, 16, 16, 16), - (16, 1024, 256, 20), - (1024, 464, 11, 9), - (1, 2048, 15, 15)] - ksize_list = [(2, 2), (3, 3)] - stride_list = [(1, 1), (2, 2)] - padding_list = [(0, 0), (1, 1)] - dilation_list = [1] - ceil_mode_list = [False, True] + def test_avg_pool2d_backward_shape_format_fp16(self, device): shape_format = [ - [np.float16, i, j, k, m, n, o, p] for i in format_list for j in shape_list for k in ksize_list for m in stride_list for n in padding_list for o in dilation_list for p in ceil_mode_list + [[np.float16, 0, (1, 3, 147, 147)], True], + [[np.float16, 0, (1, 3, 147, 147)], True] ] - # TODO(Ascend): tbe operator has problem in precision and (x, 1) case and so on. 
+ for item in shape_format: - cpu_input, npu_input = create_common_tensor(item, 0, 100) - #npu_input = cpu_input + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) cpu_input = cpu_input.to(torch.float32) - qu = torch.nn.quantized.Quantize(1.0, 50, torch.qint8) - cpu_input = qu(cpu_input) - npu_input = qu(npu_input) - #npu_input.to("npu") - cpu_output = self.cpu_op_exec(cpu_input, item[3], item[4], item[5], item[6], ceil_mode=item[7]) - print(item) - print(cpu_output.shape) - npu_output = self.npu_op_exec(npu_input, item[3], item[4], item[5], item[6], ceil_mode=item[7]) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) + cpu_output = self.cpu_op_exec(cpu_input.float(), item[1]).astype(np.float16) + npu_output = self.npu_op_exec(npu_input, item[1]) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) + def test_avg_pool2d_backward_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, 0, (1, 3, 147, 147)], True], + [[np.float32, 0, (1, 3, 147, 147)], True] + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input, item[1]) + npu_output = self.npu_op_exec(npu_input, item[1]) + self.assertRtolEqual(cpu_output, npu_output, 0.0009) -instantiate_device_type_tests(TestQuantizedMaxPool2d, globals(), except_for="cpu") +instantiate_device_type_tests(TestAvgPool2d, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_cummin.py b/test/test_npu/test_network_ops/test_cummin.py similarity index 99% rename from test/test_npu/test_cummin.py rename to test/test_npu/test_network_ops/test_cummin.py index 52938df20aa7cc3b3a225b7b5b41e603a089a811..7118dd940b84fb6a23cb06a7553aa457c3a95a74 100644 --- a/test/test_npu/test_cummin.py +++ b/test/test_npu/test_network_ops/test_cummin.py @@ -51,7 +51,7 @@ class TestCummin(TestCase): def npu_op_exec_out(self, input_x, dim, output_value, output_argmin): input_x = input_x.to("npu") output_value = output_value.to("npu") - output_argmin = output_argmin.to("npu") + output_argmin = output_argmin.to("npu").to(torch.long) torch.cummin(input_x, dim, out=(output_value, output_argmin)) output_value = output_value.to("cpu") output_value = output_value.numpy() diff --git a/test/test_npu/test_network_ops/test_gt.py b/test/test_npu/test_network_ops/test_gt.py index 1f55b3581022898e760e47024fac1daf41fcaecf..d3ec28991001811d22a6eda7da3cb86b7ee4aa02 100644 --- a/test/test_npu/test_network_ops/test_gt.py +++ b/test/test_npu/test_network_ops/test_gt.py @@ -142,6 +142,24 @@ class TestGt(TestCase): self.assertRtolEqual(cpu_output_out, npu_output_out) + def test_gt_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + self.assertEqual(cpu_output1, npu_output1) + self.assertEqual(cpu_output2, npu_output2) + def test_gt_tensor_out(self, device): shape_format = [ [[np.float16, 0, [128, 
116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], diff --git a/test/test_npu/test_network_ops/test_index_copy.py b/test/test_npu/test_network_ops/test_index_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..9b29510402dee189f77f237b9965dc546a24f791 --- /dev/null +++ b/test/test_npu/test_network_ops/test_index_copy.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from common_utils import TestCase, run_tests +from common_device_type import instantiate_device_type_tests + +class TestIndexCopy(TestCase): + def op_exec(self, npuflag, input1, dim, indices, updates): + output = torch.index_copy(input1, dim, indices, updates) + if npuflag: + output = output.to("cpu") + output = output.numpy() + return output + + def op_inp_exec(self, npuflag, input1, dim, indices, updates): + input1.index_copy_(dim, indices, updates) + if npuflag: + input1 = input1.to("cpu") + output = input1.numpy() + return output + + def op_inp_exec_(self, npuflag, input1, dim, indices, updates): + input1 = torch._index_copy_(input1, dim, indices, updates) + if npuflag: + input1 = input1.to("cpu") + output = input1.numpy() + return output + + def case_exec(self, input1, dim, indices, updates): + npu_input = input1.npu() + npu_indices = indices.npu() + npu_updates = updates.npu() + cpu_output = self.op_exec(0, input1, dim, indices, updates) + npu_output = self.op_exec(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + cpu_output = self.op_inp_exec(0, input1, dim, indices, updates) + npu_output = self.op_inp_exec(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + cpu_output = self.op_inp_exec_(0, input1, dim, indices, updates) + npu_output = self.op_inp_exec_(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + + def test_index_copy_dim0_0(self, device): + a = torch.ones(5, dtype = torch.float32) + indices = torch.LongTensor([3, 2, 1, 0]) + updates = torch.tensor([1, 2, 3, 4], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim0_1(self, device): + a = torch.ones(5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1, 2]) + updates = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim0_2(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim1_0(self, device): + a = torch.ones(5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1]) + updates = torch.tensor([[1, 2], [5, 6], [8, 9], [3, 4], [0, 1]], dtype = torch.float32) + self.case_exec(a, 1, indices, updates) + + def test_index_copy_dim1_1(self, device): + a = torch.ones(2, 5, 3, dtype = 
torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype = torch.float32) + self.case_exec(a, 1, indices, updates) + + def test_index_copy_dim2_0(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1], [2], [3], [4], [5]], + [[6], [7], [8], [9], [0]]], dtype = torch.float32) + self.case_exec(a, 2, indices, updates) + + def test_index_copy_dim2_1(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1]) + updates = torch.tensor([[[3, 2], [1, 2], [1, 3], [1, 4], [1, 5]], + [[1, 6], [1, 7], [1, 8], [1, 9], [1, 0]]], dtype = torch.float32) + self.case_exec(a, 2, indices, updates) + +instantiate_device_type_tests(TestIndexCopy, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_lerp.py b/test/test_npu/test_network_ops/test_lerp.py similarity index 97% rename from test/test_npu/test_lerp.py rename to test/test_npu/test_network_ops/test_lerp.py index fc577185b0493d0972db304db4d22f6007c9de42..eeb0b7250506026e724a90a5578ecc805cab80b1 100644 --- a/test/test_npu/test_lerp.py +++ b/test/test_npu/test_network_ops/test_lerp.py @@ -167,7 +167,7 @@ class TestLerp(TestCase): return output shape_format = [ - [[np.float16, -1, (100, 4, 5, 5)], 1,2], + [[np.float16, -1, (100, 4, 5, 5)], 1.2], [[np.float16, -1, (100, 5, 5, 4)], 1.2], ] @@ -178,7 +178,7 @@ class TestLerp(TestCase): npu_input3 = item[1] cpu_output = cpu_op_scalar_exec_fp16(cpu_input1, cpu_input2, cpu_input3) npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3) - self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.02) def test_lerp_scalar_out_common_shape_format(self, device): @@ -218,7 +218,7 @@ class TestLerp(TestCase): npu_input3 = item[1] cpu_output = cpu_op_scalar_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3) npu_output = self.npu_op_scalar_out_exec(npu_input1, npu_input2, npu_input3) - self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.02) instantiate_device_type_tests(TestLerp, globals(), except_for='cpu') if __name__ == '__main__': diff --git a/test/test_npu/test_network_ops/test_lstm.py b/test/test_npu/test_network_ops/test_lstm.py index ae03728ae62fabe9f71cb49307092bc6c73b789d..bf65ee9415b7584a518e3b35af5df8d39ed13e62 100644 --- a/test/test_npu/test_network_ops/test_lstm.py +++ b/test/test_npu/test_network_ops/test_lstm.py @@ -56,9 +56,12 @@ class TestLstm(TestCase): npu_input1 = torch.from_numpy(input1.astype(item[0][0])).npu() npu_output_y, (npu_output_h, npu_output_c) = npu_lstm(npu_input1) - self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-1) - self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-1) - self.assertRtolEqual(cpu_output_c.detach().numpy(), npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-1) + self.assertRtolEqual(cpu_output_y.detach().numpy(), + npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_h.detach().numpy(), + npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_c.detach().numpy(), + npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-3) def test_lstm_double_layer(self, device): # 
shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size, is_training] @@ -112,8 +115,52 @@ class TestLstm(TestCase): self.assertRtolEqual(hn.detach().cpu().numpy(), hnf.cpu().detach().numpy()) self.assertRtolEqual(cn.detach().cpu().numpy(), cnf.cpu().detach().numpy()) + def test_lstm_bidirection(self, device): + # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size, is_training] + shape_format = [ + [[np.float16, (16, 32, 64)], [np.float16, (1, 32, 32)], 64, 32, True], + [[np.float16, (5, 32, 64)], [np.float16, (1, 32, 32)], 64, 32, False], + [[np.float32, (5, 32, 64)], [np.float16, (1, 32, 64)],64, 64, True], + [[np.float32, (5, 32, 64)], [np.float16, (1, 32, 64)], 64, 64, False], + [[np.float32, (26, 2560, 512)], [np.float16, (1, 2560, 256)], 512, 256, False], + [[np.float32, (10, 33, 128)], [np.float32, (1, 33, 64)], 128, 64, False], + ] + + for item in shape_format: + cpu_lstm = torch.nn.LSTM(input_size=item[2], hidden_size=item[3], + num_layers=1, bidirectional=True, bias=False) + cpu_lstm.training = item[4] + npu_lstm = copy.deepcopy(cpu_lstm).npu() + + cut_value = item[3] + iw = cpu_lstm.weight_ih_l0.split(cut_value) + hw = cpu_lstm.weight_hh_l0.split(cut_value) + iwr = cpu_lstm.weight_ih_l0_reverse.split(cut_value) + hwr = cpu_lstm.weight_hh_l0_reverse.split(cut_value) + iwt = torch.cat([iw[0], iw[2], iw[1], iw[3]], 0) + hwt = torch.cat([hw[0], hw[2], hw[1], hw[3]], 0) + iwrt = torch.cat([iwr[0], iwr[2], iwr[1], iwr[3]], 0) + hwrt = torch.cat([hwr[0], hwr[2], hwr[1], hwr[3]], 0) + cpu_lstm.weight_ih_l0.data = iwt + cpu_lstm.weight_hh_l0.data = hwt + cpu_lstm.weight_ih_l0_reverse.data = iwrt + cpu_lstm.weight_hh_l0_reverse.data = hwrt + + input1 = np.random.uniform(0, 1, item[0][1]).astype(np.float32) + + cpu_input1 = torch.from_numpy(input1) + cpu_output_y, (cpu_output_h, cpu_output_c) = cpu_lstm(cpu_input1) + + npu_input1 = torch.from_numpy(input1.astype(item[0][0])).npu() + npu_output_y, (npu_output_h, npu_output_c) = npu_lstm(npu_input1) + + self.assertRtolEqual(cpu_output_y.detach().numpy(), + npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_h.detach().numpy(), + npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_c.detach().numpy(), + npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-3) - #如下 测试接口 lstm.data def test_lstm_sequence(self, device): max_len = 6 embedding_size = 2 @@ -178,8 +225,141 @@ class TestLstm(TestCase): pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) self.assertRtolEqual(pade_outputs.detach().numpy(), - pade_outputs_npu.cpu().to(torch.float).detach().numpy(), prec=1.e-1) + pade_outputs_npu.cpu().to(torch.float).detach().numpy(), prec=1.e-4) + + def test_lstm_sequence_bidirection(self, device): + max_len = 6 + embedding_size = 2 + hidden_size = 16 + vocab_size = 20 + input_seq = [[3, 5, 12, 7, 2, ], [4, 11, 14, ], [18, 7, 3, 8, 5, 4]] + lengths = [5, 3, 6] + + # embedding + embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=0) + + rnn = torch.nn.LSTM(embedding_size, hidden_size, num_layers=1, bidirectional=True, bias=False) + rnn_npu = copy.deepcopy(rnn).npu() + + iw = rnn.weight_ih_l0.split(hidden_size) + hw = rnn.weight_hh_l0.split(hidden_size) + iwr = rnn.weight_ih_l0_reverse.split(hidden_size) + hwr = rnn.weight_hh_l0_reverse.split(hidden_size) + iwt = torch.cat([iw[0], iw[2], iw[1], iw[3]], 0) + hwt = 
torch.cat([hw[0], hw[2], hw[1], hw[3]], 0) + iwrt = torch.cat([iwr[0], iwr[2], iwr[1], iwr[3]], 0) + hwrt = torch.cat([hwr[0], hwr[2], hwr[1], hwr[3]], 0) + rnn.weight_ih_l0.data = iwt + rnn.weight_hh_l0.data = hwt + rnn.weight_ih_l0_reverse.data = iwrt + rnn.weight_hh_l0_reverse.data = hwrt + + #Sorting from Large to Small + input_seq = sorted(input_seq, key = lambda tp: len(tp), reverse=True) + lengths = sorted(lengths, key = lambda tp: tp, reverse=True) + ''' + outputs: + input_seq: [[18, 7, 3, 8, 5, 4], [3, 5, 12, 7, 2], [4, 11, 14]] + lengths : [6, 5, 3] + ''' + + #The padding subscript is 0 + pad_token = 0 + def pad_seq(seq, seq_len, max_length): + seq += [pad_token for _ in range(max_length - seq_len)] + return seq + + #Data after padding + pad_seqs = [] + for i,j in zip(input_seq, lengths): + pad_seqs.append(pad_seq(i, j, max_len)) + + lengths = [6,5,3] + pad_seqs = torch.tensor(pad_seqs) + embeded = embedding(pad_seqs) + embeded = embeded.reshape(6,3,2) + + #cacl cpu + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded, lengths, batch_first=False) + pade_outputs, (hn, cn) = rnn(pack) + pade_outputs, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs, batch_first=False) + + #cacl npu + embeded_npu = embeded.npu() + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded_npu, lengths, batch_first=False) + pade_outputs_npu, (hn_n, cn_n) = rnn_npu(pack) + pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) + + self.assertRtolEqual(pade_outputs.detach().numpy(), + pade_outputs_npu.cpu().detach().numpy(), prec=1.e-4) + + def test_lstm_sequence_double_layer(self, device): + max_len = 6 + embedding_size = 2 + hidden_size = 16 + vocab_size = 20 + input_seq = [[3, 5, 12, 7, 2, ], [4, 11, 14, ], [18, 7, 3, 8, 5, 4]] + lengths = [5, 3, 6] + + # embedding + embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=0) + + rnn = torch.nn.LSTM(embedding_size, hidden_size, num_layers=2, bidirectional=False, bias=False) + rnn_npu = copy.deepcopy(rnn).npu() + + iw0 = rnn.weight_ih_l0.split(hidden_size) + hw0 = rnn.weight_hh_l0.split(hidden_size) + iw1 = rnn.weight_ih_l1.split(hidden_size) + hw1 = rnn.weight_hh_l1.split(hidden_size) + iwt0 = torch.cat([iw0[0], iw0[2], iw0[1], iw0[3]], 0) + hwt0 = torch.cat([hw0[0], hw0[2], hw0[1], hw0[3]], 0) + iwt1 = torch.cat([iw1[0], iw1[2], iw1[1], iw1[3]], 0) + hwt1 = torch.cat([hw1[0], hw1[2], hw1[1], hw1[3]], 0) + + rnn.weight_ih_l0.data = iwt0 + rnn.weight_hh_l0.data = hwt0 + rnn.weight_ih_l1.data = iwt1 + rnn.weight_hh_l1.data = hwt1 + + #Sorting from Large to Small + input_seq = sorted(input_seq, key = lambda tp: len(tp), reverse=True) + lengths = sorted(lengths, key = lambda tp: tp, reverse=True) + ''' + outputs: + input_seq: [[18, 7, 3, 8, 5, 4], [3, 5, 12, 7, 2], [4, 11, 14]] + lengths : [6, 5, 3] + ''' + + #The padding subscript is 0 + pad_token = 0 + def pad_seq(seq, seq_len, max_length): + seq += [pad_token for _ in range(max_length - seq_len)] + return seq + + #Data after padding + pad_seqs = [] + for i,j in zip(input_seq, lengths): + pad_seqs.append(pad_seq(i, j, max_len)) + + lengths = [6,5,3] + pad_seqs = torch.tensor(pad_seqs) + embeded = embedding(pad_seqs) + embeded = embeded.reshape(6,3,2) + + #cacl cpu + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded, lengths, batch_first=False) + pade_outputs, (hn, cn) = rnn(pack) + pade_outputs, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs, batch_first=False) + #cacl npu + embeded_npu = embeded.npu() + pack 
= torch.nn.utils.rnn.pack_padded_sequence(embeded_npu, lengths, batch_first=False) + pade_outputs_npu, (hn_n, cn_n) = rnn_npu(pack) + pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) + + self.assertRtolEqual(pade_outputs.detach().numpy(), + pade_outputs_npu.cpu().detach().numpy(), prec=1.e-4) + instantiate_device_type_tests(TestLstm, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_network_ops/test_mishbackward.py b/test/test_npu/test_network_ops/test_mish_backward.py similarity index 71% rename from test/test_npu/test_network_ops/test_mishbackward.py rename to test/test_npu/test_network_ops/test_mish_backward.py index 231188abb355c6d809c5bec7c3ed0f486b203f76..1240cb55b3ac30f375b59ba3cca882b1a2a0fd6d 100644 --- a/test/test_npu/test_network_ops/test_mishbackward.py +++ b/test/test_npu/test_network_ops/test_mish_backward.py @@ -31,14 +31,24 @@ class TestMishBackward(TestCase): output_grad = output_grad.detach().numpy() output = output.cpu().detach().numpy() return output_grad, output + + def cpu_op_exec(self, input1): + input1.requires_grad = True + output = input1 * (torch.tanh(F.softplus(input1))) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.to("cpu") + output_grad = output_grad.detach().numpy() + output = output.detach().numpy() + return output_grad, output def test_mish_fp32(self, device): npu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]).npu() - ep_output_grad = torch.tensor([1.0490363, 1.0693179, 1.021107, 1.0044329, 1.0008003, 1.0001341, 1.0000216, 1.0000033, 1.0000005, 1.0000001]) - ep_npu_output = torch.tensor([0.8652344, 1.9439697, 2.9865417, 3.9974136, 4.999552, 5.9999266, 6.9999886, 7.999998, 8.999999, 10.]) + cpu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]) output_grad, npu_output = self.npu_op_exec(npu_input) - self.assertRtolEqual(ep_output_grad.numpy(), output_grad) - self.assertRtolEqual(ep_npu_output.numpy(), npu_output) + ep_output_grad, ep_npu_output = self.cpu_op_exec(cpu_input) + self.assertRtolEqual(ep_output_grad, output_grad) + self.assertRtolEqual(ep_npu_output, npu_output) instantiate_device_type_tests(TestMishBackward, globals(), except_for='cpu') if __name__ == "__main__": diff --git a/test/test_npu/test_network_ops/test_nms_with_mask.py b/test/test_npu/test_network_ops/test_nms_with_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..39ee878cd77f69b10b2c882544839ecc3a4ef533 --- /dev/null +++ b/test/test_npu/test_network_ops/test_nms_with_mask.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import sys +import copy +import torch.nn as nn +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +class TestNmsWithMask(TestCase): + def npu_op_exec(self, input1, iou_threshold): + npu_output1, npu_output2, npu_output3, = torch.npu_nms_with_mask(input1, iou_threshold) + npu_output1 = npu_output1.to("cpu") + npu_output2 = npu_output2.to("cpu") + npu_output3 = npu_output3.to("cpu") + + return npu_output1, npu_output2, npu_output3 + + def test_nms_with_mask_float32(self, device): + input1 = torch.tensor([[0.0, 1.0, 2.0, 3.0, 0.6], [6.0, 7.0, 8.0, 9.0, 0.4]]).npu() + iou_threshold = 0.5 + + eq_output1 = torch.tensor([[0.0000, 1.0000, 2.0000, 3.0000, 0.6001], + [6.0000, 7.0000, 8.0000, 9.0000, 0.3999]]) + eq_output2 = torch.tensor([0, 1], dtype=torch.int32) + eq_output3 = torch.tensor([1, 1], dtype=torch.uint8) + + npu_output1, npu_output2, npu_output3 = self.npu_op_exec(input1, iou_threshold) + + self.assertRtolEqual(eq_output1, npu_output1) + self.assertRtolEqual(eq_output2, npu_output2) + self.assertRtolEqual(eq_output3, npu_output3) + + +instantiate_device_type_tests(TestNmsWithMask, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_npu/test_network_ops/test_npu_linear.py b/test/test_npu/test_network_ops/test_npu_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..ea9e7c2e2f507d69f4bcf3446babe2c4141cf6c0 --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_linear.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuLinear(TestCase): + def cpu_op_exec(self, x, weight, bias): + output = torch.nn.functional.linear(x, weight, bias) + output = output.numpy() + return output + + def npu_op_exec(self, x, weight, bias): + output = torch.npu_linear(x, weight, bias) + output = output.cpu().numpy() + return output + + def test_npu_linear_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]], + [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + npu_output = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output, 0.0002) + + def test_npu_linear_shape_format_fp16(self, device): + shape_format = [ + [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]], + [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16) + npu_output = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() + diff --git a/test/test_npu/test_network_ops/test_npu_linear_backward.py b/test/test_npu/test_network_ops/test_npu_linear_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..66f8a47f4143ac56fa0afe457ecbe0f9ebdc9268 --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_linear_backward.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuLinearBackward(TestCase): + def cpu_op_exec(self, x, weight, bias): + x.requires_grad = True + weight.requires_grad = True + bias.requires_grad = True + output = torch.nn.functional.linear(x, weight, bias) + loss = output.sum() + loss.backward() + return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() + + def npu_op_exec(self, x, weight, bias): + x.requires_grad = True + weight.requires_grad = True + bias.requires_grad = True + output = torch.npu_linear(x, weight, bias) + loss = output.sum() + loss.backward() + return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy() + + def test_npu_linear_backward_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]], + [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output, 0.0002) + self.assertRtolEqual(cpu_x_grad, npu_x_grad) + self.assertRtolEqual(cpu_w_grad, npu_w_grad) + self.assertRtolEqual(cpu_b_grad, npu_b_grad) + + def test_npu_linear_shape_format_fp16(self, device): + shape_format = [ + [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]], + [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec( + cpu_x.float(), cpu_w.float(), cpu_b.float()) + npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output.astype(np.float16), npu_output) + self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad) + self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad) + self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad) + +instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() + diff --git a/test/test_npu/test_network_ops/test_soft_margin_loss.py b/test/test_npu/test_network_ops/test_soft_margin_loss.py index a83172e56db3bd4c6b6247a2b622b1f03bd277c0..9c8308738fee4d74c75dd8bcb07f848c68cc025a 100644 --- a/test/test_npu/test_network_ops/test_soft_margin_loss.py +++ b/test/test_npu/test_network_ops/test_soft_margin_loss.py @@ -92,7 +92,8 @@ class TestSoftMarginLoss(TestCase): self.assertRtolEqual(cpu_output, npu_output) def test_soft_margin_loss_float16_sum(self, device): - npu_input1, npu_input2 = self.generate_data(-2, 2, (37, 8, 20, 20, 5, 8, 10, 8), (37, 8, 20, 20, 1, 1, 1, 1), np.float16) + npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 8, 2, 2, 5, 8, 2, 8), + (1, 8, 2, 2, 1, 1, 1, 1), np.float16) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 
"sum") npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum") self.assertRtolEqual(cpu_output, npu_output) diff --git a/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..f75f627040a738f9c4ee208c98458fb6f2966ba2 --- /dev/null +++ b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py @@ -0,0 +1,91 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import numpy as np +import sys +import copy +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestUpsampleBicubic2dBackward(TestCase): + + def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.detach().numpy() + return output_grad + + def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.to("cpu").detach().numpy() + return output_grad + + + def test_upsample_bicubic2d_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255], + [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255], + [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255], + [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255], + [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0] + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6]) + cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4]) + npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4]) + self.assertRtolEqual(cpu_output, npu_output) + + + def test_upsample_bicubic2d_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w): + input1 = input1.to(torch.float32) + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.detach().numpy() + output_grad = output_grad.astype(np.float16) + 
return output_grad + + shape_format = [ + [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255], + [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255], + [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0], + [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255], + [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255], + [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255], + [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0] + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6]) + cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4]) + npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4]) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestUpsampleBicubic2dBackward, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests()
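Note: the renamed and newly added tests above all follow one CPU-vs-NPU comparison pattern: build a matching CPU/NPU tensor pair with create_common_tensor, run the operator on both devices, and compare with assertRtolEqual under an explicit tolerance. Below is a minimal sketch of that pattern, assuming the same test harness modules used by these files (common_utils, common_device_type, util_test) and an available NPU device; the class name, test name, and the use of torch.add as the operator are illustrative placeholders only.

import torch
import numpy as np
from common_utils import TestCase, run_tests
from common_device_type import instantiate_device_type_tests
from util_test import create_common_tensor

class TestPatternExample(TestCase):
    def cpu_op_exec(self, input1, input2):
        # Reference result computed on the CPU.
        output = torch.add(input1, input2)
        return output.numpy()

    def npu_op_exec(self, input1, input2):
        # Same operator on the NPU; move the result back to the CPU for comparison.
        output = torch.add(input1, input2)
        return output.to("cpu").numpy()

    def test_add_shape_format_fp32(self, device):
        # Each entry is [dtype, npu format (-1 = default), shape], as in the tests above.
        shape_format = [
            [np.float32, -1, (64, 10, 16)],
            [np.float32, -1, (2, 3, 4, 5)],
        ]
        for item in shape_format:
            cpu_input1, npu_input1 = create_common_tensor(item, 1, 10)
            cpu_input2, npu_input2 = create_common_tensor(item, 1, 10)
            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
            npu_output = self.npu_op_exec(npu_input1, npu_input2)
            self.assertRtolEqual(cpu_output, npu_output, 0.001)

instantiate_device_type_tests(TestPatternExample, globals(), except_for="cpu")
if __name__ == "__main__":
    run_tests()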