diff --git a/patch/npu.patch b/patch/npu.patch index a46a27034ab9d7433a4e60c9d68d86898a340dab..4e01faf27cccb57db1074605f20a5c9883360123 100644 --- a/patch/npu.patch +++ b/patch/npu.patch @@ -1,6 +1,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt --- pytorch-v1.5.0/aten/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 ++++ pytorch-develop/aten/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 @@ -22,8 +22,10 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -51,7 +51,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-05 14:59:26.416336304 +0800 ++++ pytorch-develop/aten/src/ATen/CMakeLists.txt 2021-07-09 17:16:47.786789915 +0800 @@ -67,6 +67,9 @@ FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h") FILE(GLOB native_cpu_h "native/cpu/*.h") @@ -129,7 +129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-05 14:59:26.424336365 +0800 ++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h 2021-07-09 17:16:47.794790202 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -170,7 +170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-05 14:59:26.432336426 +0800 ++++ pytorch-develop/aten/src/ATen/function_wrapper.py 2021-07-09 17:16:47.802790488 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -355,7 +355,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for option in declaration['options']: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py --- pytorch-v1.5.0/aten/src/ATen/gen.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/gen.py 2021-07-05 14:59:26.432336426 +0800 ++++ pytorch-develop/aten/src/ATen/gen.py 2021-07-09 17:16:47.802790488 +0800 @@ -1,3 +1,18 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -513,7 +513,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= generate_outputs() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp 2021-07-09 17:16:47.814790918 +0800 @@ -339,20 +339,20 @@ void hardsigmoid_backward_kernel(TensorIterator& iter) { @@ -541,7 +541,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-05 14:59:26.440336488 +0800 ++++ pytorch-develop/aten/src/ATen/native/Memory.cpp 2021-07-09 17:16:47.806790632 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -596,7 +596,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= detail::computeStorageSize(self.sizes(), self.strides()), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-05 14:59:26.460336640 +0800 ++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml 2021-07-09 17:16:47.830791493 +0800 @@ -1,6 +1,5 @@ # See README.md in this directory for more guidance @@ -2324,16 +2324,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor requires_tensor: True -@@ -1801,6 +2302,8 @@ - requires_tensor: True - dispatch: - QuantizedCPU: quantized_max_pool2d -+ npu_dispatch: -+ NPU: quantized_max_pool2d_npu - - - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - supports_named_tensor: True -@@ -1814,6 +2317,8 @@ +@@ -1814,6 +2315,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2342,7 +2333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method -@@ -1822,6 +2327,8 @@ +@@ -1822,6 +2325,8 @@ CPU: mean_cpu_gpu CUDA: mean_cpu_gpu QuantizedCPU: quantized_mean_cpu @@ -2351,7 +2342,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -1829,47 +2336,73 @@ +@@ -1829,47 +2334,73 @@ CPU: mean_out_cpu_gpu CUDA: mean_out_cpu_gpu QuantizedCPU: quantized_mean_out_cpu @@ -2425,7 +2416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor -@@ -1958,6 +2491,8 @@ +@@ -1958,6 +2489,8 @@ CUDA: legacy::cuda::_th_mm SparseCPU: _sparse_mm SparseCUDA: _sparse_mm @@ -2434,7 +2425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) -@@ -1966,6 +2501,8 @@ +@@ -1966,6 +2499,8 @@ CUDA: legacy::cuda::_th_mm_out SparseCPU: _sparse_mm_out SparseCUDA: _sparse_mm_out @@ -2443,21 +2434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor -@@ -1974,9 +2511,13 @@ - - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - supports_named_tensor: True - variants: function, method -+ npu_dispatch: -+ NPU: mode_npu - - - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) - supports_named_tensor: True -+ npu_dispatch: -+ NPU: mode_out_npu - - - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - variants: function, method -@@ -1994,6 +2535,8 @@ +@@ -1994,6 +2529,8 @@ SparseCPU: mul_sparse SparseCUDA: mul_sparse MkldnnCPU: mkldnn_mul @@ -2466,7 +2443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2004,6 +2547,8 @@ +@@ -2004,6 +2541,8 @@ SparseCPU: mul_sparse_ SparseCUDA: mul_sparse_ MkldnnCPU: mkldnn_mul_ @@ -2475,7 +2452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) -@@ -2013,15 +2558,21 @@ +@@ -2013,15 +2552,21 @@ SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda MkldnnCPU: mkldnn_mul_out @@ -2497,7 +2474,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mv(Tensor self, Tensor vec) -> Tensor use_c10_dispatcher: full -@@ -2030,12 +2581,16 @@ +@@ -2030,12 +2575,16 @@ CPU: mv_cpu CUDA: legacy::cuda::_th_mv supports_named_tensor: True @@ -2514,7 +2491,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mvlgamma(Tensor self, int p) -> Tensor use_c10_dispatcher: full -@@ -2052,6 +2607,8 @@ +@@ -2052,6 +2601,8 @@ CUDA: narrow_copy_dense SparseCPU: narrow_copy_sparse SparseCUDA: narrow_copy_sparse @@ -2523,7 +2500,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) variants: function, method -@@ -2068,6 +2625,8 @@ +@@ -2068,6 +2619,8 @@ CPU: batch_norm_cpu CUDA: batch_norm_cuda MkldnnCPU: mkldnn_batch_norm @@ -2532,7 +2509,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: -@@ -2098,6 +2657,8 @@ +@@ -2098,6 +2651,8 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda @@ -2541,7 +2518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) dispatch: -@@ -2117,6 +2678,8 @@ +@@ -2117,6 +2672,8 @@ - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor variants: function @@ -2550,7 +2527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) variants: function -@@ -2129,42 +2692,60 @@ +@@ -2129,42 +2686,60 @@ - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor device_guard: False @@ -2613,7 +2590,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Only exposed from C++ -- in Python, # we expose it as an attribute `T`, not a function. -@@ -2253,54 +2834,82 @@ +@@ -2253,54 +2828,82 @@ supports_named_tensor: True - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -2697,7 +2674,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor use_c10_dispatcher: full -@@ -2316,6 +2925,8 @@ +@@ -2316,6 +2919,8 @@ - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor use_c10_dispatcher: full variants: function, method @@ -2706,7 +2683,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reshape(Tensor self, int[] shape) -> Tensor variants: function, method -@@ -2337,16 +2948,22 @@ +@@ -2337,16 +2942,22 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -2729,7 +2706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor -@@ -2360,6 +2977,8 @@ +@@ -2360,6 +2971,8 @@ CUDA: relu MkldnnCPU: mkldnn_relu QuantizedCPU: quantized_relu @@ -2738,7 +2715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: relu_(Tensor(a!) self) -> Tensor(a!) -@@ -2370,6 +2989,8 @@ +@@ -2370,6 +2983,8 @@ CUDA: relu_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: quantized_relu_ @@ -2747,7 +2724,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: prelu(Tensor self, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -2377,12 +2998,16 @@ +@@ -2377,12 +2992,16 @@ dispatch: CPU: prelu_cpu CUDA: prelu_cuda @@ -2764,7 +2741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2390,6 +3015,8 @@ +@@ -2390,6 +3009,8 @@ dispatch: CPU: gelu_cpu CUDA: gelu_cuda @@ -2773,7 +2750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gelu_backward(Tensor grad, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2397,29 +3024,41 @@ +@@ -2397,29 +3018,41 @@ dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda @@ -2815,7 +2792,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method -@@ -2433,14 +3072,21 @@ +@@ -2433,14 +3066,21 @@ - func: selu(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2838,7 +2815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2451,6 +3097,8 @@ +@@ -2451,6 +3091,8 @@ CUDA: sigmoid QuantizedCPU: quantized_sigmoid MkldnnCPU: mkldnn_sigmoid @@ -2847,7 +2824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) 
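-> Tensor(a!)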
supports_named_tensor: True -@@ -2459,36 +3107,52 @@ +@@ -2459,36 +3101,52 @@ CPU: sigmoid_ CUDA: sigmoid_ MkldnnCPU: mkldnn_sigmoid_ @@ -2900,7 +2877,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. -@@ -2533,6 +3197,8 @@ +@@ -2533,6 +3191,8 @@ - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) variants: function, method @@ -2909,7 +2886,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: smm(Tensor self, Tensor mat2) -> Tensor use_c10_dispatcher: full -@@ -2542,10 +3208,14 @@ +@@ -2542,10 +3202,14 @@ - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2924,7 +2901,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor use_c10_dispatcher: full -@@ -2553,12 +3223,16 @@ +@@ -2553,12 +3217,16 @@ CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax @@ -2941,7 +2918,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] variants: function, method -@@ -2609,8 +3283,12 @@ +@@ -2609,8 +3277,12 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor @@ -2954,7 +2931,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at -@@ -2633,20 +3311,30 @@ +@@ -2633,20 +3305,30 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor variants: function, method supports_named_tensor: True @@ -2985,7 +2962,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sum_to_size(Tensor self, int[] size) -> Tensor variants: method -@@ -2656,13 +3344,19 @@ +@@ -2656,13 +3338,19 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3005,7 +2982,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: square(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2677,51 +3371,81 @@ +@@ -2677,51 +3365,81 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3088,7 +3065,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: t(Tensor(a) self) -> Tensor(a) device_guard: False -@@ -2736,6 +3460,8 @@ +@@ -2736,6 +3454,8 @@ use_c10_dispatcher: full supports_named_tensor: True variants: function, method @@ -3097,7 +3074,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tan_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -2743,12 +3469,16 @@ +@@ -2743,12 +3463,16 @@ dispatch: CPU: _tan__cpu CUDA: _tan__cuda @@ -3114,7 +3091,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -2758,6 +3488,8 @@ +@@ -2758,6 +3482,8 @@ CPU: tanh CUDA: tanh QuantizedCPU: quantized_tanh @@ -3123,7 +3100,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tanh_(Tensor(a!) self) -> Tensor(a!) 
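-> Tensor(a!)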
supports_named_tensor: True -@@ -2765,12 +3497,16 @@ +@@ -2765,12 +3491,16 @@ dispatch: CPU: _tanh__cpu CUDA: _tanh__cuda @@ -3140,7 +3117,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor variants: function -@@ -2783,6 +3519,8 @@ +@@ -2783,6 +3513,8 @@ dispatch: CPU: threshold CUDA: threshold_cuda @@ -3149,7 +3126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) variants: function -@@ -2790,12 +3528,16 @@ +@@ -2790,12 +3522,16 @@ dispatch: CPU: threshold_ CUDA: threshold__cuda @@ -3166,7 +3143,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor use_c10_dispatcher: full -@@ -2803,6 +3545,8 @@ +@@ -2803,6 +3539,8 @@ dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda @@ -3175,7 +3152,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) variants: function, method -@@ -2835,18 +3579,24 @@ +@@ -2835,18 +3573,24 @@ use_c10_dispatcher: full python_module: nn variants: function @@ -3200,7 +3177,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args -@@ -2872,6 +3622,8 @@ +@@ -2872,6 +3616,8 @@ CUDA: true_divide SparseCPU: true_divide_sparse SparseCUDA: true_divide_sparse @@ -3209,7 +3186,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) -@@ -2881,6 +3633,8 @@ +@@ -2881,6 +3627,8 @@ CUDA: true_divide_ SparseCPU: true_divide_sparse_ SparseCUDA: true_divide_sparse_ @@ -3218,7 +3195,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
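-> Tensor(a!)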
-@@ -2889,31 +3643,43 @@ +@@ -2889,31 +3637,43 @@ CUDA: true_divide_out SparseCPU: true_divide_out_sparse_zerodim SparseCUDA: true_divide_out_sparse_zerodim @@ -3262,7 +3239,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: type_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -2956,6 +3722,8 @@ +@@ -2956,6 +3716,8 @@ dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda @@ -3271,7 +3248,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _unsafe_view(Tensor self, int[] size) -> Tensor -@@ -2971,32 +3739,48 @@ +@@ -2971,32 +3733,48 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3320,7 +3297,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: view_as(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -3009,13 +3793,19 @@ +@@ -3009,13 +3787,19 @@ - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function, method @@ -3340,7 +3317,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function -@@ -3041,13 +3831,21 @@ +@@ -3041,13 +3825,21 @@ - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -3362,7 +3339,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor use_c10_dispatcher: full -@@ -3100,25 +3898,37 @@ +@@ -3100,25 +3892,37 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor dispatch: @@ -3402,7 +3379,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method -@@ -3162,12 +3972,16 @@ +@@ -3162,12 +3966,16 @@ SparseCUDA: clone_sparse MkldnnCPU: mkldnn_clone QuantizedCPU: quantized_clone @@ -3419,7 +3396,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -3176,6 +3990,8 @@ +@@ -3176,6 +3984,8 @@ CUDA: pow_out SparseCPU: pow_out_sparse_scalar SparseCUDA: pow_out_sparse_scalar @@ -3428,7 +3405,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor use_c10_dispatcher: full -@@ -3186,6 +4002,8 @@ +@@ -3186,6 +3996,8 @@ CUDA: pow SparseCPU: pow_sparse_scalar SparseCUDA: pow_sparse_scalar @@ -3437,7 +3414,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: zero_(Tensor(a!) self) -> Tensor(a!) supports_named_tensor: True -@@ -3196,6 +4014,14 @@ +@@ -3196,6 +4008,14 @@ SparseCPU: zero_sparse_ SparseCUDA: zero_sparse_ MkldnnCPU: mkldnn_zero_ @@ -3452,7 +3429,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
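-> Tensor(a!)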
dispatch: -@@ -3204,6 +4030,8 @@ +@@ -3204,6 +4024,8 @@ SparseCPU: sub_out_sparse SparseCUDA: sub_out_sparse supports_named_tensor: True @@ -3461,7 +3438,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -3213,6 +4041,8 @@ +@@ -3213,6 +4035,8 @@ CUDA: sub SparseCPU: sub_sparse SparseCUDA: sub_sparse @@ -3470,7 +3447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) -@@ -3222,6 +4052,8 @@ +@@ -3222,6 +4046,8 @@ CUDA: sub_ SparseCPU: sub_sparse_ SparseCUDA: sub_sparse_ @@ -3479,7 +3456,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True # For C++ only, until we have conversion from C++ numbers to Tensor -@@ -3229,21 +4061,29 @@ +@@ -3229,21 +4055,29 @@ use_c10_dispatcher: full variants: function, method supports_named_tensor: True @@ -3509,7 +3486,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. -@@ -3257,6 +4097,8 @@ +@@ -3257,6 +4091,8 @@ CUDA: legacy::cuda::_th_addmm_out SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda @@ -3518,7 +3495,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor -@@ -3267,6 +4109,8 @@ +@@ -3267,6 +4103,8 @@ CUDA: legacy::cuda::_th_addmm SparseCPU: addmm_sparse_dense_cpu SparseCUDA: addmm_sparse_dense_cuda @@ -3527,7 +3504,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) -@@ -3278,9 +4122,10 @@ +@@ -3278,9 +4116,10 @@ # broadcasting SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ @@ -3539,7 +3516,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # NOTE [ Sparse: autograd and API ] # # -@@ -3396,7 +4241,6 @@ +@@ -3396,7 +4235,6 @@ # shared. In other words, their outputs are non-differentiable views of the # sparse tensor. @@ -3547,7 +3524,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. 
- func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor -@@ -3433,7 +4277,6 @@ +@@ -3433,7 +4271,6 @@ SparseCUDA: sparse_resize_and_clear_ requires_tensor: True @@ -3555,7 +3532,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sparse_mask(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full variants: method -@@ -3442,7 +4285,6 @@ +@@ -3442,7 +4279,6 @@ SparseCUDA: sparse_mask_cuda requires_tensor: True @@ -3563,7 +3540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: to_dense(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3474,7 +4316,6 @@ +@@ -3474,7 +4310,6 @@ requires_tensor: True device_guard: False @@ -3571,7 +3548,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dense_dim(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3494,7 +4335,6 @@ +@@ -3494,7 +4329,6 @@ requires_tensor: True device_guard: False @@ -3579,7 +3556,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _nnz(Tensor self) -> int use_c10_dispatcher: full variants: method -@@ -3504,7 +4344,6 @@ +@@ -3504,7 +4338,6 @@ requires_tensor: True device_guard: False @@ -3587,7 +3564,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: coalesce(Tensor self) -> Tensor use_c10_dispatcher: full variants: method -@@ -3513,7 +4352,6 @@ +@@ -3513,7 +4346,6 @@ SparseCUDA: coalesce_sparse_cuda requires_tensor: True @@ -3595,7 +3572,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: is_coalesced(Tensor self) -> bool use_c10_dispatcher: full variants: method -@@ -3524,7 +4362,6 @@ +@@ -3524,7 +4356,6 @@ device_guard: False supports_named_tensor: True @@ -3603,7 +3580,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: -@@ -3568,7 +4405,6 @@ +@@ -3568,7 +4399,6 @@ requires_tensor: True device_guard: False @@ -3611,7 +3588,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
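-> Tensor(a!)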
dispatch: SparseCPU: hspmm_out_sparse_cpu -@@ -3630,11 +4466,15 @@ +@@ -3630,11 +4460,15 @@ variants: function dispatch: CPU: quantize_per_tensor_cpu @@ -3627,7 +3604,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dequantize(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -3713,20 +4553,28 @@ +@@ -3713,20 +4547,28 @@ variants: method device_guard: False supports_named_tensor: True @@ -3656,7 +3633,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: meshgrid(Tensor[] tensors) -> Tensor[] -@@ -3765,6 +4613,8 @@ +@@ -3765,6 +4607,8 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -3665,7 +3642,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= variants: function supports_named_tensor: True -@@ -3791,10 +4641,16 @@ +@@ -3791,10 +4635,16 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) @@ -3682,7 +3659,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -@@ -3839,10 +4695,14 @@ +@@ -3839,10 +4689,14 @@ # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) @@ -3697,7 +3674,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # wrappers for legacy TH methods -@@ -3852,6 +4712,8 @@ +@@ -3852,6 +4706,8 @@ dispatch: CPU: set_ CUDA: set_ @@ -3706,7 +3683,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method -@@ -3860,6 +4722,8 @@ +@@ -3860,6 +4716,8 @@ CPU: legacy::cpu::_th_set_ CUDA: legacy::cuda::_th_set_ QuantizedCPU: set_storage @@ -3715,7 +3692,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method -@@ -3867,12 +4731,16 @@ +@@ -3867,12 +4725,16 @@ dispatch: CPU: set_tensor_ CUDA: set_tensor_ @@ -3732,7 +3709,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!) 
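-> Tensor(a!)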
variants: method -@@ -3892,6 +4760,8 @@ +@@ -3892,6 +4754,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3741,7 +3718,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor -@@ -3904,6 +4774,8 @@ +@@ -3904,6 +4768,8 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda @@ -3750,7 +3727,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor -@@ -3916,6 +4788,8 @@ +@@ -3916,6 +4782,8 @@ dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda @@ -3759,7 +3736,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor use_c10_dispatcher: full -@@ -3929,25 +4803,35 @@ +@@ -3929,25 +4797,35 @@ CUDA: view MkldnnCPU: mkldnn_view QuantizedCPU: view @@ -3795,7 +3772,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3955,11 +4839,15 @@ +@@ -3955,11 +4833,15 @@ dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ @@ -3811,7 +3788,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) variants: method -@@ -3967,11 +4855,15 @@ +@@ -3967,11 +4849,15 @@ CPU: index_fill_ CUDA: index_fill_ supports_named_tensor: True @@ -3827,7 +3804,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) variants: method -@@ -3994,6 +4886,8 @@ +@@ -3994,6 +4880,8 @@ dispatch: CPU: scatter_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3836,7 +3813,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor use_c10_dispatcher: full -@@ -4004,6 +4898,8 @@ +@@ -4004,6 +4892,8 @@ dispatch: CPU: scatter_fill_cpu_ CUDA: legacy::cuda::_th_scatter_ @@ -3845,7 +3822,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor use_c10_dispatcher: full -@@ -4020,81 +4916,127 @@ +@@ -4020,81 +4910,127 @@ dispatch: CPU: scatter_add_cpu_ CUDA: legacy::cuda::_th_scatter_add_ @@ -3973,7 +3950,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4107,70 +5049,106 @@ +@@ -4107,70 +5043,106 @@ dispatch: CPU: bitwise_or_out CUDA: bitwise_or_out @@ -4080,7 +4057,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) variants: method -@@ -4240,18 +5218,24 @@ +@@ -4240,18 +5212,24 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) supports_named_tensor: True variants: method @@ -4105,7 +4082,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: digamma_(Tensor(a!) self) -> Tensor(a!) 
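-> Tensor(a!)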
supports_named_tensor: True -@@ -4266,6 +5250,8 @@ +@@ -4266,6 +5244,8 @@ dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ @@ -4114,7 +4091,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4273,6 +5259,8 @@ +@@ -4273,6 +5253,8 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4123,7 +4100,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) supports_named_tensor: True -@@ -4280,53 +5268,71 @@ +@@ -4280,53 +5262,71 @@ dispatch: CPU: pow_ CUDA: pow_ @@ -4195,7 +4172,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full -@@ -4334,28 +5340,40 @@ +@@ -4334,28 +5334,40 @@ dispatch: CPU: legacy::cpu::_th_addbmm CUDA: legacy::cuda::_th_addbmm @@ -4236,7 +4213,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) -@@ -4380,6 +5398,8 @@ +@@ -4380,6 +5392,8 @@ dispatch: CPU: legacy::cpu::_th_diag_out CUDA: legacy::cuda::_th_diag_out @@ -4245,7 +4222,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: diag(Tensor self, int diagonal=0) -> Tensor use_c10_dispatcher: full -@@ -4387,30 +5407,44 @@ +@@ -4387,30 +5401,44 @@ dispatch: CPU: legacy::cpu::_th_diag CUDA: legacy::cuda::_th_diag @@ -4290,7 +4267,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor dispatch: -@@ -4435,6 +5469,8 @@ +@@ -4435,6 +5463,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4299,7 +4276,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4444,6 +5480,8 @@ +@@ -4444,6 +5474,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4308,7 +4285,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4451,6 +5489,8 @@ +@@ -4451,6 +5483,8 @@ CPU: ne_out CUDA: ne_out QuantizedCPU: ne_out_quantized_cpu @@ -4317,7 +4294,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ne.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4460,6 +5500,8 @@ +@@ -4460,6 +5494,8 @@ CPU: ne CUDA: ne QuantizedCPU: ne_quantized_cpu @@ -4326,7 +4303,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
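-> Tensor(a!)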
supports_named_tensor: True -@@ -4467,6 +5509,8 @@ +@@ -4467,6 +5503,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4335,7 +4312,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4476,6 +5520,8 @@ +@@ -4476,6 +5514,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4344,7 +4321,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4483,6 +5529,8 @@ +@@ -4483,6 +5523,8 @@ CPU: eq_out CUDA: eq_out QuantizedCPU: eq_out_quantized_cpu @@ -4353,7 +4330,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: eq.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4492,6 +5540,8 @@ +@@ -4492,6 +5534,8 @@ CPU: eq CUDA: eq QuantizedCPU: eq_quantized_cpu @@ -4362,7 +4339,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4499,6 +5549,8 @@ +@@ -4499,6 +5543,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4371,7 +4348,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4508,6 +5560,8 @@ +@@ -4508,6 +5554,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4380,7 +4357,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4515,6 +5569,8 @@ +@@ -4515,6 +5563,8 @@ CPU: ge_out CUDA: ge_out QuantizedCPU: ge_out_quantized_cpu @@ -4389,7 +4366,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: ge.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4524,6 +5580,8 @@ +@@ -4524,6 +5574,8 @@ CPU: ge CUDA: ge QuantizedCPU: ge_quantized_cpu @@ -4398,7 +4375,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4531,6 +5589,8 @@ +@@ -4531,6 +5583,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4407,7 +4384,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4540,6 +5600,8 @@ +@@ -4540,6 +5594,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4416,7 +4393,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
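-> Tensor(a!)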
supports_named_tensor: True -@@ -4547,6 +5609,8 @@ +@@ -4547,6 +5603,8 @@ CPU: le_out CUDA: le_out QuantizedCPU: le_out_quantized_cpu @@ -4425,7 +4402,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: le.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4556,6 +5620,8 @@ +@@ -4556,6 +5614,8 @@ CPU: le CUDA: le QuantizedCPU: le_quantized_cpu @@ -4434,7 +4411,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4563,6 +5629,8 @@ +@@ -4563,6 +5623,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4443,7 +4420,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4572,6 +5640,8 @@ +@@ -4572,6 +5634,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4452,7 +4429,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4579,6 +5649,8 @@ +@@ -4579,6 +5643,8 @@ CPU: gt_out CUDA: gt_out QuantizedCPU: gt_out_quantized_cpu @@ -4461,7 +4438,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4588,6 +5660,8 @@ +@@ -4588,6 +5654,8 @@ CPU: gt CUDA: gt QuantizedCPU: gt_quantized_cpu @@ -4470,7 +4447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True -@@ -4595,6 +5669,8 @@ +@@ -4595,6 +5663,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4479,7 +4456,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Scalar(Tensor self, Scalar other) -> Tensor supports_named_tensor: True -@@ -4604,6 +5680,8 @@ +@@ -4604,6 +5674,8 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4488,7 +4465,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
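-> Tensor(a!)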
supports_named_tensor: True -@@ -4611,6 +5689,8 @@ +@@ -4611,6 +5683,8 @@ CPU: lt_out CUDA: lt_out QuantizedCPU: lt_out_quantized_cpu @@ -4497,7 +4474,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lt.Tensor(Tensor self, Tensor other) -> Tensor supports_named_tensor: True -@@ -4620,11 +5700,16 @@ +@@ -4620,11 +5694,16 @@ CPU: lt CUDA: lt QuantizedCPU: lt_quantized_cpu @@ -4514,7 +4491,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4632,11 +5717,16 @@ +@@ -4632,11 +5711,16 @@ dispatch: CPU: legacy::cpu::_th_take CUDA: legacy::cuda::_th_take @@ -4531,7 +4508,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: index_select(Tensor self, int dim, Tensor index) -> Tensor use_c10_dispatcher: full -@@ -4646,17 +5736,25 @@ +@@ -4646,17 +5730,25 @@ CUDA: legacy::cuda::_th_index_select SparseCPU: index_select_sparse SparseCUDA: index_select_sparse @@ -4557,7 +4534,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: masked_select(Tensor self, Tensor mask) -> Tensor use_c10_dispatcher: full -@@ -4665,11 +5763,15 @@ +@@ -4665,11 +5757,15 @@ CPU: masked_select_cpu CUDA: masked_select_cuda supports_named_tensor: True @@ -4573,7 +4550,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -4677,6 +5779,8 @@ +@@ -4677,6 +5773,8 @@ dispatch: CPU: legacy::cpu::_th_nonzero CUDA: legacy::cuda::_th_nonzero @@ -4582,7 +4559,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: nonzero_numpy(Tensor self) -> Tensor[] variants: method, function -@@ -4685,6 +5789,8 @@ +@@ -4685,6 +5783,8 @@ dispatch: CPU: gather_out_cpu CUDA: gather_out_cuda @@ -4591,7 +4568,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor use_c10_dispatcher: full -@@ -4692,34 +5798,50 @@ +@@ -4692,34 +5792,50 @@ dispatch: CPU: gather_cpu CUDA: gather_cuda @@ -4642,7 +4619,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) dispatch: -@@ -4826,9 +5948,13 @@ +@@ -4826,9 +5942,13 @@ CUDA: legacy::cuda::_th_potri - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) @@ -4656,7 +4633,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor) variants: function -@@ -4891,12 +6017,16 @@ +@@ -4891,12 +6011,16 @@ dispatch: CPU: multinomial_out CUDA: multinomial_out @@ -4673,7 +4650,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) variants: function -@@ -4947,6 +6077,8 @@ +@@ -4947,6 +6071,8 @@ dispatch: CPU: erfinv CUDA: erfinv @@ -4682,7 +4659,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: erfinv_(Tensor(a!) self) -> Tensor(a!) 
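-> Tensor(a!)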
supports_named_tensor: True -@@ -4954,26 +6086,36 @@ +@@ -4954,26 +6080,36 @@ dispatch: CPU: _erfinv__cpu CUDA: _erfinv__cuda @@ -4719,7 +4696,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor use_c10_dispatcher: full -@@ -4981,21 +6123,29 @@ +@@ -4981,21 +6117,29 @@ - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) supports_named_tensor: True @@ -4749,7 +4726,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor use_c10_dispatcher: full -@@ -5003,6 +6153,8 @@ +@@ -5003,6 +6147,8 @@ dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar @@ -4758,7 +4735,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor use_c10_dispatcher: full -@@ -5010,11 +6162,15 @@ +@@ -5010,6 +6156,8 @@ dispatch: CPU: lerp_cpu_tensor CUDA: lerp_cuda_tensor @@ -4767,21 +4744,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_histc_out - CUDA: _histc_out_cuda -+ npu_dispatch: -+ NPU: histc_out_npu - - - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full -@@ -5022,11 +6178,15 @@ - dispatch: - CPU: legacy::cpu::_th_histc - CUDA: _histc_cuda -+ npu_dispatch: -+ NPU: histc_npu - - - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +@@ -5027,6 +6175,8 @@ dispatch: CPU: fmod_out CUDA: legacy::cuda::_th_fmod_out @@ -4790,7 +4753,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5034,11 +6194,15 @@ +@@ -5034,11 +6184,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4806,7 +4769,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5046,11 +6210,15 @@ +@@ -5046,11 +6200,15 @@ dispatch: CPU: fmod CUDA: legacy::cuda::_th_fmod @@ -4822,7 +4785,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full -@@ -5058,11 +6226,15 @@ +@@ -5058,11 +6216,15 @@ dispatch: CPU: remainder CUDA: remainder @@ -4838,7 +4801,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full -@@ -5070,12 +6242,18 @@ +@@ -5070,12 +6232,18 @@ dispatch: CPU: remainder CUDA: remainder @@ -4857,7 +4820,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: min(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5084,13 +6262,19 @@ +@@ -5084,13 +6252,19 @@ CPU: min CUDA: legacy::cuda::_th_min QuantizedCPU: min_quant @@ -4877,7 +4840,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5099,6 +6283,8 @@ +@@ -5099,6 +6273,8 @@ CPU: max CUDA: legacy::cuda::_th_max QuantizedCPU: max_quant @@ -4886,7 +4849,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= supports_named_tensor: True - func: median(Tensor self) -> Tensor -@@ -5107,12 +6293,16 @@ +@@ -5107,12 +6283,16 @@ dispatch: CPU: median_cpu CUDA: median_cuda @@ -4903,7 +4866,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) variants: method, function -@@ -5120,23 +6310,45 @@ +@@ -5120,23 +6300,45 @@ CPU: legacy::cpu::_th_sort CUDA: legacy::cuda::_th_sort QuantizedCPU: sort_quant @@ -4949,7 +4912,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function -@@ -5144,11 +6356,15 @@ +@@ -5144,11 +6346,15 @@ CPU: topk CUDA: topk QuantizedCPU: quantized_topk_cpu @@ -4965,7 +4928,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: any(Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5159,11 +6375,15 @@ +@@ -5159,11 +6365,15 @@ CUDA: any SparseCPU: any_sparse SparseCUDA: any_sparse @@ -4981,7 +4944,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor use_c10_dispatcher: full -@@ -5171,6 +6391,8 @@ +@@ -5171,6 +6381,8 @@ dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm @@ -4990,7 +4953,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) variants: method -@@ -5178,6 +6400,8 @@ +@@ -5178,6 +6390,8 @@ dispatch: CPU: unfold CUDA: unfold @@ -4999,7 +4962,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: equal(Tensor self, Tensor other) -> bool use_c10_dispatcher: full -@@ -5186,6 +6410,8 @@ +@@ -5186,6 +6400,8 @@ CPU: legacy::cpu::_th_equal CUDA: legacy::cuda::_th_equal QuantizedCPU: quantized_equal @@ -5008,7 +4971,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!) -@@ -5193,6 +6419,8 @@ +@@ -5193,6 +6409,8 @@ dispatch: CPU: pow_out CUDA: pow_out @@ -5017,7 +4980,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5201,12 +6429,16 @@ +@@ -5201,12 +6419,16 @@ dispatch: CPU: pow CUDA: pow @@ -5034,7 +4997,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor use_c10_dispatcher: full -@@ -5214,6 +6446,8 @@ +@@ -5214,6 +6436,8 @@ dispatch: CPU: pow CUDA: pow @@ -5043,7 +5006,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) 
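-> Tensor(a!)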
variants: method -@@ -5221,40 +6455,58 @@ +@@ -5221,40 +6445,58 @@ CPU: normal_cpu_ CUDA: normal_cuda_ supports_named_tensor: True @@ -5102,7 +5065,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: alias(Tensor(a) self) -> Tensor(a) variants: method, function -@@ -5265,16 +6517,22 @@ +@@ -5265,43 +6507,59 @@ dispatch: CPU: legacy::cpu::_th_addr CUDA: legacy::cuda::_th_addr @@ -5125,7 +5088,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) dispatch: -@@ -5286,22 +6544,30 @@ + CPU: legacy::cpu::_th_index_copy_ + CUDA: legacy::cuda::_th_index_copy_ +- ++ npu_dispatch: ++ NPU: index_copy_npu_ ++ + - func: _cumsum(Tensor self, int dim) -> Tensor + use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: legacy::cuda::_th_cumsum @@ -5156,7 +5126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5309,6 +6575,8 @@ +@@ -5309,6 +6567,8 @@ CPU: legacy::cpu::_th_var CUDA: legacy::cuda::_th_var supports_named_tensor: True @@ -5165,7 +5135,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full -@@ -5321,6 +6589,8 @@ +@@ -5321,6 +6581,8 @@ variants: function dispatch: CUDA: _amp_non_finite_check_and_unscale_cuda_ @@ -5174,7 +5144,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor variants: function -@@ -5332,12 +6602,16 @@ +@@ -5332,12 +6594,16 @@ CPU: _cat_cpu CUDA: cat_cuda QuantizedCPU: quantized_cat @@ -5191,7 +5161,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) dispatch: -@@ -5353,36 +6627,50 @@ +@@ -5353,36 +6619,50 @@ dispatch: CPU: legacy::cpu::_th_max CUDA: legacy::cuda::_th_max @@ -5242,7 +5212,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor use_c10_dispatcher: full -@@ -5390,23 +6678,33 @@ +@@ -5390,23 +6670,33 @@ dispatch: CPU: mse_loss_backward CUDA: mse_loss_backward @@ -5276,7 +5246,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5434,22 +6732,30 @@ +@@ -5434,22 +6724,30 @@ - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5307,7 +5277,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -5466,97 +6772,137 @@ +@@ -5466,97 +6764,137 @@ - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!) 
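-> Tensor(a!)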
python_module: nn @@ -5445,7 +5415,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5564,6 +6910,8 @@ +@@ -5564,6 +6902,8 @@ CPU: elu_out CUDA: elu_out QuantizedCPU: quantized_elu_out @@ -5454,7 +5424,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full -@@ -5572,16 +6920,22 @@ +@@ -5572,16 +6912,22 @@ CPU: elu CUDA: elu QuantizedCPU: quantized_elu @@ -5477,7 +5447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) python_module: nn -@@ -5589,12 +6943,16 @@ +@@ -5589,12 +6935,16 @@ CPU: elu_ CUDA: elu_ QuantizedCPU: quantized_elu_ @@ -5494,7 +5464,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu(Tensor self, int dim=-1) -> Tensor use_c10_dispatcher: full -@@ -5602,12 +6960,16 @@ +@@ -5602,12 +6952,16 @@ dispatch: CPU: glu CUDA: legacy::cuda::_thnn_glu_forward @@ -5511,7 +5481,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor use_c10_dispatcher: full -@@ -5615,20 +6977,30 @@ +@@ -5615,20 +6969,30 @@ dispatch: CPU: glu_backward CUDA: legacy::cuda::_thnn_glu_backward @@ -5542,7 +5512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5636,6 +7008,8 @@ +@@ -5636,6 +7000,8 @@ CPU: hardtanh_out CUDA: hardtanh_out QuantizedCPU: quantized_hardtanh_out @@ -5551,7 +5521,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor use_c10_dispatcher: full -@@ -5644,16 +7018,22 @@ +@@ -5644,16 +7010,22 @@ CPU: hardtanh CUDA: hardtanh QuantizedCPU: quantized_hardtanh @@ -5574,7 +5544,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) python_module: nn -@@ -5661,6 +7041,8 @@ +@@ -5661,6 +7033,8 @@ CPU: hardtanh_ CUDA: hardtanh_ QuantizedCPU: quantized_hardtanh_ @@ -5583,7 +5553,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5668,6 +7050,8 @@ +@@ -5668,6 +7042,8 @@ CPU: leaky_relu_out CUDA: leaky_relu_out QuantizedCPU: quantized_leaky_relu_out @@ -5592,7 +5562,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor use_c10_dispatcher: full -@@ -5676,10 +7060,14 @@ +@@ -5676,10 +7052,14 @@ CPU: leaky_relu CUDA: leaky_relu QuantizedCPU: quantized_leaky_relu @@ -5607,7 +5577,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) 
python_module: nn -@@ -5687,31 +7075,44 @@ +@@ -5687,31 +7067,44 @@ CPU: leaky_relu_ CUDA: leaky_relu_ QuantizedCPU: quantized_leaky_relu_ @@ -5652,7 +5622,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor use_c10_dispatcher: full -@@ -5719,6 +7120,8 @@ +@@ -5719,6 +7112,8 @@ dispatch: CPU: log_sigmoid_backward_cpu CUDA: legacy::cuda::_thnn_log_sigmoid_backward @@ -5661,7 +5631,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5744,37 +7147,53 @@ +@@ -5744,37 +7139,53 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -5715,7 +5685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -5782,9 +7201,13 @@ +@@ -5782,9 +7193,13 @@ CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda MkldnnCPU: mkldnn_adaptive_avg_pool2d_out @@ -5729,7 +5699,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor dispatch: -@@ -5796,6 +7219,8 @@ +@@ -5796,6 +7211,8 @@ CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: quantized_adaptive_avg_pool2d @@ -5738,7 +5708,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5803,24 +7228,32 @@ +@@ -5803,24 +7220,32 @@ dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda @@ -5771,7 +5741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full -@@ -5828,6 +7261,8 @@ +@@ -5828,6 +7253,8 @@ dispatch: CPU: adaptive_avg_pool3d_backward_cpu CUDA: adaptive_avg_pool3d_backward_cuda @@ -5780,7 +5750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) -@@ -5835,6 +7270,8 @@ +@@ -5835,6 +7262,8 @@ dispatch: CPU: adaptive_max_pool2d_out_cpu CUDA: adaptive_max_pool2d_out_cuda @@ -5789,7 +5759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) -@@ -5842,12 +7279,16 @@ +@@ -5842,12 +7271,16 @@ dispatch: CPU: adaptive_max_pool2d_cpu CUDA: adaptive_max_pool2d_cuda @@ -5806,7 +5776,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor use_c10_dispatcher: full -@@ -5855,6 +7296,8 @@ +@@ -5855,6 +7288,8 @@ dispatch: CPU: adaptive_max_pool2d_backward_cpu CUDA: adaptive_max_pool2d_backward_cuda @@ -5815,7 +5785,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5889,6 +7332,8 @@ +@@ -5889,6 +7324,8 @@ CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda MkldnnCPU: mkldnn_avg_pool2d_out @@ -5824,7 +5794,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5897,24 +7342,32 @@ +@@ -5897,24 +7334,32 @@ CUDA: avg_pool2d_cuda MkldnnCPU: mkldnn_avg_pool2d QuantizedCPU: quantized_avg_pool2d @@ -5857,7 +5827,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor python_module: nn -@@ -5922,18 +7375,24 @@ +@@ -5922,18 +7367,24 @@ CPU: avg_pool3d_cpu CUDA: avg_pool3d_cuda QuantizedCPU: quantized_avg_pool3d @@ -5882,7 +5852,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -5993,6 +7452,8 @@ +@@ -5993,6 +7444,8 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda @@ -5891,7 +5861,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6000,6 +7461,8 @@ +@@ -6000,6 +7453,8 @@ dispatch: CPU: max_pool2d_with_indices_cpu CUDA: max_pool2d_with_indices_cuda @@ -5900,7 +5870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
-@@ -6007,12 +7470,16 @@ +@@ -6007,12 +7462,16 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda @@ -5917,7 +5887,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) -@@ -6020,6 +7487,8 @@ +@@ -6020,6 +7479,8 @@ dispatch: CPU: max_pool3d_with_indices_out_cpu CUDA: max_pool3d_with_indices_out_cuda @@ -5926,7 +5896,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) -@@ -6027,6 +7496,8 @@ +@@ -6027,6 +7488,8 @@ dispatch: CPU: max_pool3d_with_indices_cpu CUDA: max_pool3d_with_indices_cuda @@ -5935,7 +5905,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= supports_named_tensor: True - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) -@@ -6034,12 +7505,17 @@ +@@ -6034,12 +7497,17 @@ dispatch: CPU: max_pool3d_with_indices_backward_out_cpu CUDA: max_pool3d_with_indices_backward_out_cuda @@ -5953,7 +5923,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6118,12 +7594,16 @@ +@@ -6118,12 +7586,16 @@ dispatch: CPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda @@ -5970,7 +5940,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6166,12 +7646,16 @@ +@@ -6166,12 +7638,16 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda @@ -5987,7 +5957,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6214,12 +7698,16 @@ +@@ -6214,12 +7690,16 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda @@ -6004,7 +5974,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn -@@ -6232,12 +7720,16 @@ +@@ -6232,12 +7712,16 @@ dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda @@ -6021,7 +5991,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor python_module: nn -@@ -6245,96 +7737,128 @@ +@@ -6245,96 +7729,128 @@ CPU: upsample_bilinear2d_cpu CUDA: upsample_bilinear2d_cuda QuantizedCPU: quantized_upsample_bilinear2d_cpu @@ -6150,7 +6120,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6342,24 +7866,32 @@ +@@ -6342,24 +7858,32 @@ CPU: upsample_nearest2d_cpu CUDA: upsample_nearest2d_cuda QuantizedCPU: quantized_upsample_nearest2d_cpu @@ -6183,7 +6153,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn -@@ -6367,38 +7899,52 @@ +@@ -6367,38 +7891,52 @@ CPU: upsample_nearest3d_cpu CUDA: upsample_nearest3d_cuda QuantizedCPU: quantized_upsample_nearest3d_cpu @@ -6236,7 +6206,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # What's a thnn_conv_ versus a slow_conv_? # -@@ -6423,24 +7969,32 @@ +@@ -6423,24 +7961,32 @@ dispatch: CPU: slow_conv_transpose2d_out_cpu CUDA: slow_conv_transpose2d_out_cuda @@ -6269,7 +6239,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6468,21 +8022,29 @@ +@@ -6468,21 +8014,29 @@ - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -6299,7 +6269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn -@@ -6495,32 +8057,46 @@ +@@ -6495,32 +8049,46 @@ dispatch: CPU: slow_conv2d_backward_cpu CUDA: legacy::cuda::_thnn_conv2d_backward @@ -6346,7 +6316,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -@@ -6553,12 +8129,16 @@ +@@ -6553,12 +8121,16 @@ dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda @@ -6363,7 +6333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? 
bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor python_module: nn -@@ -6577,57 +8157,393 @@ +@@ -6577,57 +8149,401 @@ dispatch: CPU: col2im_out_cpu CUDA: col2im_out_cuda @@ -6548,7 +6518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + npu_dispatch_only: + NPU: nms_v4_npu + -+- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) ++- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + variants: function + npu_dispatch_only: + NPU: lstm_npu @@ -6757,10 +6727,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + variants: function, method + npu_dispatch_only: + NPU: masked_fill_range_npu ++ ++- func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor ++ npu_dispatch_only: ++ NPU: linear_npu ++ ++- func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) ++ npu_dispatch_only: ++ NPU: linear_backward_npu \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-05 14:59:26.496336915 +0800 ++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S 2021-07-09 17:16:47.866792783 +0800 @@ -659,14 +659,14 @@ SUB x1, x1, 4 @@ -6786,7 +6764,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CMP x1, 2 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-05 14:59:26.440336488 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -64,7 +64,7 @@ Tensor isinf(const Tensor &self) { @@ -6798,7 +6776,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ 
pytorch-develop/aten/src/ATen/native/TensorFactories.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6843,7 +6821,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp 2021-07-09 17:16:47.810790775 +0800 @@ -87,6 +87,7 @@ if (self.is_contiguous(memory_format)) { return self; @@ -6854,7 +6832,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= "preserve memory format is unsupported by the contiguous operator"); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-05 14:59:26.444336518 +0800 ++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp 2021-07-09 17:16:47.814790918 +0800 @@ -26,7 +26,7 @@ const scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; @@ -6866,7 +6844,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= out += output_width * output_height; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py --- pytorch-v1.5.0/aten/src/ATen/native_parse.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/native_parse.py 2021-07-09 17:16:47.878793213 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -6904,7 +6882,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= msg = '''Exception raised in processing function: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6936,7 +6914,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -6969,7 +6947,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-05 14:59:26.512337037 +0800 ++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h 2021-07-09 17:16:47.882793357 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7003,7 +6981,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-05 14:59:26.516337067 +0800 ++++ pytorch-develop/aten/src/TH/CMakeLists.txt 2021-07-09 17:16:47.886793500 +0800 @@ -48,6 +48,11 @@ ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) @@ -7018,7 +6996,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-05 14:59:26.520337098 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp 2021-07-09 17:16:47.886793500 +0800 @@ -1,9 +1,32 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7127,7 +7105,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-05 14:59:26.520337098 +0800 ++++ pytorch-develop/aten/src/TH/generic/THStorage.h 2021-07-09 17:16:47.886793500 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7166,7 +7144,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt --- pytorch-v1.5.0/c10/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/CMakeLists.txt 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/CMakeLists.txt 2021-07-09 17:16:47.902794074 +0800 @@ -63,6 +63,14 @@ message(STATUS "don't use NUMA") endif() @@ -7195,7 +7173,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # not checked in diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h --- pytorch-v1.5.0/c10/core/Backend.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Backend.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Backend.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7290,7 +7268,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp --- pytorch-v1.5.0/c10/core/Device.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Device.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7330,7 +7308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= types.begin(), diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h --- pytorch-v1.5.0/c10/core/Device.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Device.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Device.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7365,7 +7343,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return type_ == DeviceType::CPU; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp --- pytorch-v1.5.0/c10/core/DeviceType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DeviceType.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7405,7 +7383,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return false; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h --- pytorch-v1.5.0/c10/core/DeviceType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DeviceType.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DeviceType.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7448,7 +7426,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= constexpr DeviceType kXLA = DeviceType::XLA; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp --- pytorch-v1.5.0/c10/core/DispatchKey.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DispatchKey.cpp 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7480,7 +7458,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= case DispatchKey::TESTING_ONLY_GenericModeTensorId: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h --- pytorch-v1.5.0/c10/core/DispatchKey.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/DispatchKey.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/DispatchKey.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7512,7 +7490,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h --- pytorch-v1.5.0/c10/core/Storage.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/Storage.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/Storage.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7546,7 +7524,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= }; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h --- pytorch-v1.5.0/c10/core/StorageImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/StorageImpl.h 2021-07-05 14:59:26.532337189 +0800 ++++ pytorch-develop/c10/core/StorageImpl.h 2021-07-09 17:16:47.902794074 +0800 @@ -1,12 +1,39 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7603,7 +7581,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h --- pytorch-v1.5.0/c10/core/TensorImpl.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorImpl.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/core/TensorImpl.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7673,7 +7651,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h --- pytorch-v1.5.0/c10/core/TensorOptions.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/core/TensorOptions.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/core/TensorOptions.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7714,7 +7692,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h --- pytorch-v1.5.0/c10/macros/Export.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/c10/macros/Export.h 2021-07-05 14:59:26.536337219 +0800 ++++ pytorch-develop/c10/macros/Export.h 2021-07-09 17:16:47.906794218 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -7841,7 +7819,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt --- pytorch-v1.5.0/caffe2/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-05 14:59:26.544337280 +0800 ++++ pytorch-develop/caffe2/CMakeLists.txt 2021-07-09 17:16:47.918794647 +0800 @@ -32,6 +32,7 @@ # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) @@ -7988,7 +7966,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs. 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format --- pytorch-v1.5.0/.clang-format 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.clang-format 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/.clang-format 2021-07-09 17:16:47.778789628 +0800 @@ -84,5 +84,4 @@ SpacesInSquareBrackets: false Standard: Cpp11 @@ -7999,7 +7977,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake --- pytorch-v1.5.0/cmake/BuildVariables.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-05 14:59:26.652338104 +0800 ++++ pytorch-develop/cmake/BuildVariables.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -11,6 +11,7 @@ # CMakeLists.txt files under each folder respectively. set(Caffe2_CPU_SRCS) @@ -8026,7 +8004,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # symbols. However, if the lib is whole linked in caffe2 lib, we don't want diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake --- pytorch-v1.5.0/cmake/Codegen.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Codegen.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Codegen.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -191,13 +191,14 @@ file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) @@ -8057,7 +8035,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake --- pytorch-v1.5.0/cmake/Dependencies.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Dependencies.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Dependencies.cmake 2021-07-09 17:16:48.030798663 +0800 @@ -1509,6 +1509,13 @@ ENDIF(NOT C_HAS_THREAD) endif() @@ -8074,7 +8052,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake --- pytorch-v1.5.0/cmake/Summary.cmake 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/Summary.cmake 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/Summary.cmake 2021-07-09 17:16:48.034798807 +0800 @@ -134,6 +134,7 @@ if(NOT "${SELECTED_OP_LIST}" STREQUAL "") message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") @@ -8085,7 +8063,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endfunction() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur 
pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-05 14:59:26.656338135 +0800 ++++ pytorch-develop/cmake/TorchConfig.cmake.in 2021-07-09 17:16:48.034798807 +0800 @@ -112,6 +112,11 @@ list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() @@ -8100,7 +8078,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt --- pytorch-v1.5.0/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/CMakeLists.txt 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/CMakeLists.txt 2021-07-09 17:16:47.782789771 +0800 @@ -205,6 +205,10 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) @@ -8167,7 +8145,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore --- pytorch-v1.5.0/.dockerignore 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/.dockerignore 2021-07-05 14:59:26.412336274 +0800 ++++ pytorch-develop/.dockerignore 2021-07-09 17:16:47.778789628 +0800 @@ -1,257 +1 @@ -# READ THIS BEFORE YOU REFACTOR ME -# @@ -8430,7 +8408,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat --- pytorch-v1.5.0/docs/make.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/docs/make.bat 2021-07-05 14:59:26.660338165 +0800 ++++ pytorch-develop/docs/make.bat 2021-07-09 17:16:48.038798950 +0800 @@ -1,36 +1,36 @@ -@ECHO OFF - @@ -8519,7 +8497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt --- pytorch-v1.5.0/requirements.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/requirements.txt 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/requirements.txt 2021-07-09 17:16:48.054799524 +0800 @@ -4,4 +4,12 @@ requests setuptools @@ -8538,7 +8516,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat --- pytorch-v1.5.0/scripts/appveyor/install.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/appveyor/install.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,10 +1,10 @@ -:: Installation scripts for 
appveyor. - @@ -8562,7 +8540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +conda install -y numpy diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/appveyor/install_cuda.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,22 +1,22 @@ -@echo on - @@ -8610,7 +8588,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +nvcc -V || exit /b diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat --- pytorch-v1.5.0/scripts/build_windows.bat 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/build_windows.bat 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/build_windows.bat 2021-07-09 17:16:48.054799524 +0800 @@ -1,84 +1,84 @@ -:: ############################################################################# -:: Example command to build on Windows. @@ -8782,7 +8760,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +exit /b 1 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1 --- pytorch-v1.5.0/scripts/proto.ps1 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/scripts/proto.ps1 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/scripts/proto.ps1 2021-07-09 17:16:48.054799524 +0800 @@ -1,17 +1,17 @@ -param( - [string]$protoc, @@ -8820,7 +8798,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +Invoke-Expression $cmd diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py --- pytorch-v1.5.0/setup.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/setup.py 2021-07-05 14:59:26.676338287 +0800 ++++ pytorch-develop/setup.py 2021-07-09 17:16:48.054799524 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -8919,7 +8897,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'python/serialized_test/data/operator_test/*.zip', diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml --- pytorch-v1.5.0/tools/autograd/derivatives.yaml 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/derivatives.yaml 2021-07-09 17:16:49.194840399 +0800 @@ -107,6 +107,10 @@ # # NB: The parameter names here MUST be consistent with the parameter names @@ -8976,12 +8954,12 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= # The above backward definitions are equivalent to the definitions below. 
Why do we bundle # everything up? It's because it's more convenient to define double backwards # when there is a single function that manages everything. -@@ -1630,3 +1643,48 @@ +@@ -1630,3 +1643,52 @@ - name: nonzero(Tensor self) -> Tensor output_differentiability: [False] + -+- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) ++- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) + output_differentiability: [True, True, True, False, False, False, False, False] + input, weight, bias, h, c: npu_lstm_backward(grads[0], grads[1], grads[2], input, weight, bias, h, c, result0, result1, result2, result3, result4, result5, result6, result7) + @@ -9025,10 +9003,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + +- name: npu_mish(Tensor self) -> Tensor + self: npu_mish_backward(grad, self) ++ ++- name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor ++ input, weight: npu_linear_backward(grad, input, weight) ++ bias: maybe_multiply(grad, 1) \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py --- pytorch-v1.5.0/tools/autograd/dump_utils.py 1970-01-01 08:00:00.000000000 +0800 -+++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/dump_utils.py 2021-07-09 17:16:49.194840399 +0800 @@ -0,0 +1,112 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# All rights reserved. @@ -9144,7 +9126,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-05 14:59:27.812346954 +0800 ++++ pytorch-develop/tools/autograd/gen_autograd_functions.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9330,7 +9312,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/gen_python_functions.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,20 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9372,7 +9354,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= 'value': argname, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/gen_variable_type.py 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2021 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -9545,7 +9527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/Functions.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9625,7 +9607,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto sparse = sparse_.coalesce(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -22,7 +22,7 @@ #include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/structseq.h" @@ -9709,7 +9691,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -15,7 +15,13 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/Event.h" @@ -9796,7 +9778,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 
pytorch-develop/tools/autograd/templates/VariableType.cpp --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.cpp 2021-07-09 17:16:49.194840399 +0800 @@ -1,7 +1,27 @@ +// Copyright (c) 2021 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9827,7 +9809,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/autograd/templates/VariableType.h 2021-07-09 17:16:49.194840399 +0800 @@ -1,3 +1,20 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -9859,7 +9841,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= const at::Tensor & unpack(const Tensor & t, const char * name, int pos); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl --- pytorch-v1.5.0/tools/build_variables.bzl 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/tools/build_variables.bzl 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/tools/build_variables.bzl 2021-07-09 17:16:49.198840543 +0800 @@ -46,6 +46,7 @@ "torch/csrc/autograd/functions/utils.cpp", "torch/csrc/autograd/input_buffer.cpp", @@ -9945,7 +9927,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py --- pytorch-v1.5.0/torch/autograd/profiler.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/autograd/profiler.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/autograd/profiler.py 2021-07-09 17:16:49.202840686 +0800 @@ -1,8 +1,25 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -10418,7 +10400,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return ''.join(result) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt --- pytorch-v1.5.0/torch/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/CMakeLists.txt 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/torch/CMakeLists.txt 2021-07-09 17:16:49.198840543 +0800 @@ -97,6 +97,7 @@ ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp ${TORCH_SRC_DIR}/csrc/utils.cpp @@ -10450,7 +10432,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= endif() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/engine.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10573,7 +10555,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto event = c10::Event{c10::DeviceType::CUDA}; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10605,7 +10587,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= /*non_blocking=*/false, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/init.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10648,7 +10630,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= m.def("_enable_profiler", enableProfiler); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10700,7 +10682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto& old_var = buffer[pos]; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -10896,7 +10878,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= CUDAStubs::~CUDAStubs() = default; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/profiler.h 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11021,7 +11003,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11075,7 +11057,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp 2021-07-09 17:16:49.214841116 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11116,7 +11098,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h 2021-07-09 17:16:49.214841116 +0800 @@ -168,6 +168,45 @@ return r.release(); } @@ -11165,7 +11147,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!r) throw python_error(); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-05 14:59:27.832347106 +0800 ++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp 2021-07-09 17:16:49.210840973 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11199,7 +11181,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= if (!t.defined()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11305,7 +11287,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= while (!in_flight.empty()) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11362,7 +11344,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= .def("is_success", &::c10d::ProcessGroup::Work::isSuccess) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-05 14:59:27.836347137 +0800 ++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11487,7 +11469,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/DynamicTypes.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11536,7 +11518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return it->second; diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp --- pytorch-v1.5.0/torch/csrc/Generator.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/Generator.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11604,7 +11586,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= #endif diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/serialization.cpp 2021-07-09 17:16:49.222841403 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11704,7 +11686,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/Storage.cpp 2021-07-09 17:16:49.218841259 +0800 @@ -1,7 +1,25 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11783,7 +11765,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= for (Py_ssize_t i = 0; i < length; i++) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-05 14:59:27.840347168 +0800 ++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp 2021-07-09 17:16:49.222841403 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -11831,7 +11813,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr}, diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp --- pytorch-v1.5.0/torch/csrc/Module.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/Module.cpp 2021-07-05 14:59:27.824347045 +0800 ++++ pytorch-develop/torch/csrc/Module.cpp 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11975,7 +11957,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,18 +1,35 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12352,7 +12334,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= +} // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp --- pytorch-v1.5.0/torch/csrc/utils/init.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/utils/init.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,6 +1,10 @@ #include #include @@ -12440,7 +12422,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h --- pytorch-v1.5.0/torch/csrc/utils/init.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/init.h 2021-07-05 14:59:27.860347320 +0800 ++++ pytorch-develop/torch/csrc/utils/init.h 2021-07-09 17:16:49.242842120 +0800 @@ -8,4 +8,7 @@ void initThroughputBenchmarkBindings(PyObject* module); @@ -12451,7 +12433,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } // namespace torch diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12486,7 +12468,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return at::Device(device_str); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12517,7 +12499,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU); diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,3 +1,19 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12653,7 +12635,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= } else if(expected_layout == c10::kSparse) { diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp 2021-07-09 17:16:49.242842120 +0800 @@ -1,58 +1,91 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. @@ -12866,7 +12848,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def get_rng_state(): ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-05 14:59:27.864347350 +0800 ++++ pytorch-develop/torch/distributed/distributed_c10d.py 2021-07-09 17:16:49.246842264 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12947,7 +12929,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py --- pytorch-v1.5.0/torch/distributions/von_mises.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/distributions/von_mises.py 2021-07-05 14:59:27.868347381 +0800 ++++ pytorch-develop/torch/distributions/von_mises.py 2021-07-09 17:16:49.246842264 +0800 @@ -1,140 +1,140 @@ -from __future__ import absolute_import, division, print_function - @@ -13231,7 +13213,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= + _log_modified_bessel_fn(self.concentration, order=0)).exp() diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py --- pytorch-v1.5.0/torch/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/__init__.py 2021-07-05 14:59:27.816346984 +0800 ++++ pytorch-develop/torch/__init__.py 2021-07-09 17:16:49.198840543 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13274,7 +13256,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= \ No newline at end of file diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-05 14:59:27.868347381 +0800 ++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 @@ -28,6 +28,10 @@ option(USE_C10D_NCCL "USE C10D NCCL" ON) endif() @@ -13327,7 +13309,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= copy_header(ProcessGroupMPI.hpp) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt 2021-07-09 17:16:49.250842407 +0800 @@ -37,8 +37,11 @@ SET_TARGET_PROPERTIES(shm PROPERTIES PREFIX "lib" @@ -13384,7 +13366,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor] diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py --- pytorch-v1.5.0/torch/nn/functional.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/functional.py 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/nn/functional.py 2021-07-09 17:16:49.254842550 +0800 @@ -1611,7 +1611,7 @@ else: output = input.matmul(weight.t()) @@ -13407,7 +13389,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' 
'--exclude= -from . import parallel as parallel diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-05 14:59:27.872347411 +0800 ++++ pytorch-develop/torch/nn/modules/batchnorm.py 2021-07-09 17:16:49.254842550 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13439,7 +13421,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= self.register_parameter('running_var', None) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py --- pytorch-v1.5.0/torch/nn/modules/module.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/module.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/module.py 2021-07-09 17:16:49.254842550 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -13582,7 +13564,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py --- pytorch-v1.5.0/torch/nn/modules/normalization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/normalization.py 2021-07-09 17:16:49.254842550 +0800 @@ -128,13 +128,14 @@ """ __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] @@ -13615,7 +13597,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= return '{normalized_shape}, eps={eps}, ' \ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/modules/transformer.pyi.in 2021-07-09 17:16:49.254842550 +0800 @@ -1,60 +1,60 @@ -from ..init import xavier_uniform_ -from .activation import MultiheadAttention @@ -13775,7 +13757,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - module_kwargs: Optional[Any] = ...) -> Tensor: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py --- pytorch-v1.5.0/torch/nn/parallel/distributed.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-05 14:59:27.876347442 +0800 ++++ pytorch-develop/torch/nn/parallel/distributed.py 2021-07-09 17:16:49.258842694 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14126,7 +14108,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/onnx/symbolic_opset9.py 2021-07-09 17:16:49.258842694 +0800 @@ -1621,14 +1621,23 @@ slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals] return g.op('Concat', *slices, axis_i=0) @@ -14204,7 +14186,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py --- pytorch-v1.5.0/torch/optim/adamax.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/optim/adamax.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/optim/adamax.py 2021-07-09 17:16:49.262842837 +0800 @@ -80,8 +80,8 @@ exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0) @@ -14381,7 +14363,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ... diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py --- pytorch-v1.5.0/torch/serialization.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/serialization.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/serialization.py 2021-07-09 17:16:49.262842837 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14465,7 +14447,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def location_tag(storage): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py --- pytorch-v1.5.0/torch/storage.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/storage.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/storage.py 2021-07-09 17:16:49.262842837 +0800 @@ -7,6 +7,7 @@ class _StorageBase(object): @@ -14485,7 +14467,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= else: diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py --- pytorch-v1.5.0/torch/tensor.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/tensor.py 2021-07-05 14:59:27.880347472 +0800 ++++ pytorch-develop/torch/tensor.py 2021-07-09 17:16:49.262842837 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14547,7 +14529,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def __reversed__(self): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py --- pytorch-v1.5.0/torch/_tensor_str.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_tensor_str.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/_tensor_str.py 2021-07-09 17:16:49.198840543 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14601,7 +14583,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool) diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py --- pytorch-v1.5.0/torch/utils/data/dataloader.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/data/dataloader.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14810,7 +14792,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ... 
diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. @@ -14871,7 +14853,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= - diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py --- pytorch-v1.5.0/torch/utils/__init__.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/utils/__init__.py 2021-07-05 14:59:27.884347503 +0800 ++++ pytorch-develop/torch/utils/__init__.py 2021-07-09 17:16:49.266842980 +0800 @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals @@ -14882,7 +14864,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude= def set_module(obj, mod): diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py --- pytorch-v1.5.0/torch/_utils.py 2021-04-10 18:39:32.000000000 +0800 -+++ pytorch-develop/torch/_utils.py 2021-07-05 14:59:27.820347015 +0800 ++++ pytorch-develop/torch/_utils.py 2021-07-09 17:16:49.202840686 +0800 @@ -1,3 +1,19 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml index afdda6988a665d514a0694374e86b4b5c061430f..4b3a1b7ded4f60281cf4d8dffa66d024ed3f5ef8 100644 --- a/src/aten/src/ATen/native/native_functions.yaml +++ b/src/aten/src/ATen/native/native_functions.yaml @@ -2302,8 +2302,6 @@ requires_tensor: True dispatch: QuantizedCPU: quantized_max_pool2d - npu_dispatch: - NPU: quantized_max_pool2d_npu - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor supports_named_tensor: True @@ -2511,13 +2509,9 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) supports_named_tensor: True variants: function, method - npu_dispatch: - NPU: mode_npu - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) supports_named_tensor: True - npu_dispatch: - NPU: mode_out_npu - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -6169,8 +6163,6 @@ dispatch: CPU: legacy::cpu::_th_histc_out CUDA: _histc_out_cuda - npu_dispatch: - NPU: histc_out_npu - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor use_c10_dispatcher: full @@ -6178,8 +6170,6 @@ dispatch: CPU: legacy::cpu::_th_histc CUDA: _histc_cuda - npu_dispatch: - NPU: histc_npu - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -6538,7 +6528,9 @@ dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - + npu_dispatch: + NPU: index_copy_npu_ + - func: _cumsum(Tensor self, int dim) -> Tensor use_c10_dispatcher: full dispatch: @@ -8338,7 +8330,7 @@ npu_dispatch_only: NPU: nms_v4_npu -- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +- func: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) variants: function npu_dispatch_only: NPU: lstm_npu @@ -8546,4 +8538,12 @@ - func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor variants: function, method npu_dispatch_only: - NPU: masked_fill_range_npu \ No newline at end of file + NPU: masked_fill_range_npu + +- func: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + npu_dispatch_only: + NPU: linear_npu + +- func: npu_linear_backward(Tensor grad, Tensor input, Tensor weight) -> (Tensor, Tensor) + npu_dispatch_only: + NPU: linear_backward_npu \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/AddKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddKernelNpu.cpp index a2c4c8301ddc9087871c8f6a9538a9553ce14fe5..4d21e8652d355f6aa3c239ea1a7c15ac82d5ec8e 100644 --- a/src/aten/src/ATen/native/npu/AddKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/AddKernelNpu.cpp @@ -47,10 +47,20 @@ Tensor& adds_out_npu_nocheck( float alphaValue = CalcuOpUtil::get_scalar_float_value(alpha); float value = otherValue * alphaValue; OpCommand cmd; + std::string real_type = ""; + if (self.scalar_type() == c10::ScalarType::Bool) { + auto unified_result = OpPreparation::binary_op_check(result, self, other, true); + if (unified_result.common_type == c10::ScalarType::Bool) { + unified_result.common_type = c10::ScalarType::Byte; + unified_result.result_type_defined = true; + real_type = "uint8"; + } + cmd.Expect(unified_result); + } cmd.Name("Add") .Input(self) .Input(value, self.scalar_type()) - .Output(result) + .Output(result, real_type) .Run(); return result; diff --git a/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp index 97bd9f8b8a0b53cb81b64e1357f3b9385be041a6..15671b097f6143c81c711f56fd54a82d7d351541 100644 --- a/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GridAssignPositiveKernelNpu.cpp @@ -24,11 +24,11 @@ static inline void grid_assign_positive_check( const Tensor& argmax_overlaps, const Tensor& gt_argmax_overlaps){ TORCH_CHECK( - at::isIntegralType(argmax_overlaps.scalar_type()) && argmax_overlaps.scalar_type() != ScalarType::Long, + at::isIntegralType(argmax_overlaps.scalar_type(), true) && argmax_overlaps.scalar_type() != ScalarType::Long, "int32 argmax_overlaps tensor expected but got a tensor with dtype: ", argmax_overlaps.scalar_type()); TORCH_CHECK( - at::isIntegralType(gt_argmax_overlaps.scalar_type()) && gt_argmax_overlaps.scalar_type() != ScalarType::Long, + at::isIntegralType(gt_argmax_overlaps.scalar_type(), true) && gt_argmax_overlaps.scalar_type() != ScalarType::Long, "int32 gt_argmax_overlaps tensor expected but got 
a tensor with dtype: ", gt_argmax_overlaps.scalar_type()); } diff --git a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp index b9d0bfe9247169441b616fd38a1cb451cf6aa41b..63970a9d50ce6ce8a50b284dca5d047d11f910a0 100644 --- a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp @@ -23,11 +23,19 @@ using namespace at::native::npu; Tensor& gt_out_npu_nocheck(Tensor& result, const Tensor& self, const Tensor& other) { auto unified_result = OpPreparation::comparison_op_check(result, self, other, true); + + Tensor selfCast = self; + Tensor otherCast = other; + if(self.dtype() == ScalarType::Bool || other.dtype() == ScalarType::Bool){ + selfCast = self.to(ScalarType::Float); + otherCast = other.to(ScalarType::Float); + } + OpCommand cmd; cmd.Name("Greater") .Expect(unified_result) - .Input(self) - .Input(other) + .Input(selfCast) + .Input(otherCast) .Output(result) .Run(); @@ -51,10 +59,15 @@ Tensor& gt_out_npu(Tensor& result, const Tensor& self, const Tensor& other) { } Tensor& gt_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar other) { + Tensor selfCast = self; + if(self.dtype() == ScalarType::Bool){ + selfCast = self.to(ScalarType::Float); + } + OpCommand cmd; cmd.Name("Greater") - .Input(self) - .Input(other, self.scalar_type()) + .Input(selfCast) + .Input(other, selfCast.scalar_type()) .Output(result) .Run(); diff --git a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp index d658eec1deb854ba4fced8595f3c8e9648b1bcef..346d41155fcb41e0b1396267dc8190b1e009ab16 100644 --- a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp @@ -72,15 +72,16 @@ Tensor& im2col_out_npu_nocheck(Tensor& result, const Tensor &self, IntArrayRef k TORCH_CHECK(padding.empty() || padding.size() == 1 || padding.size() == 2, "im2col: padding must either be omitted, a single int, or a tuple of two ints"); - padding = padding.empty() ? IntArrayRef({0}) : padding; - if (padding.size() == 1) { - SmallVector pads = {padding[0], padding[0], padding[0], padding[0]}; - padding = IntArrayRef(pads); - } else if (padding.size() == 2) { - SmallVector pads = {padding[0], padding[0], padding[1], padding[1]}; - padding = IntArrayRef(pads); + auto padding_ = padding.empty() ? 
IntArrayRef({0}) : padding; + SmallVector pads; + if (padding_.size() == 1) { + pads = {padding_[0], padding_[0], padding_[0], padding_[0]}; + } else if (padding_.size() == 2) { + pads = {padding_[0], padding_[0], padding_[1], padding_[1]}; } + auto padding_4d = IntArrayRef(pads); + int64_t strideH = 1; int64_t strideW = 1; if (stride.size() == 1) { @@ -100,10 +101,11 @@ Tensor& im2col_out_npu_nocheck(Tensor& result, const Tensor &self, IntArrayRef k dilationH = dilation[0]; dilationW = dilation[1]; } + SmallVector kernelSize = {kernel_size[0], kernel_size[1]}; SmallVector stridesSize = {strideH, strideW}; SmallVector dilationsSize = {dilationH, dilationW}; - SmallVector padsSize = {padding[0], padding[1], padding[2], padding[3]}; + SmallVector padsSize = {padding_4d[0], padding_4d[1], padding_4d[2], padding_4d[3]}; string padding_mode = "CALCULATED"; OpCommand cmd; @@ -135,12 +137,10 @@ Tensor& im2col_out_npu(Tensor& result, const Tensor &self, IntArrayRef kernel_si Tensor im2col_npu(const Tensor &self, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride) { - // calculate the output size auto outputSize = image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding); Tensor result = OpPreparation::ApplyTensor(self, outputSize); im2col_out_npu(result, self, kernel_size, dilation, padding, stride); - return result; } diff --git a/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c0fe1fa25dc4f793e59d084d7792c30641bd432 --- /dev/null +++ b/src/aten/src/ATen/native/npu/IndexCopyKernelNpu.cpp @@ -0,0 +1,138 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "ATen/native/npu/utils/OpAdapter.h" +#include + +namespace at { +namespace native { +using namespace at::native::npu; + +void index_copy_npu_par_check( + const int64_t dim, + const Tensor& index, + const Tensor& source, + const Tensor& result) { + int64_t newDim = maybe_wrap_dim(dim, result.dim()); + TORCH_CHECK_INDEX(index.dim() < 2, "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + + int64_t numIndices = index.numel(); + TORCH_CHECK_INDEX(!(source.dim() == 0 && numIndices != 1), + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + TORCH_CHECK_INDEX(!((source.dim() != result.dim()) && (source.dim() != 0 && result.dim() != 0)), + "index_copy_(): When source and destination are not scalars, \ +their dimensionality must match. 
Source dimensionality (", + source.dim(), "), destination dimensionality (", result.dim(), ")"); + + TORCH_CHECK_INDEX(index.scalar_type() == ScalarType::Long, "index_copy_(): Expected LongTensor for index"); + + // Check that source and destination slices have the same size + auto selfSlicedSizes = result.sizes().vec(); + if (selfSlicedSizes.size() > 0) { + selfSlicedSizes.erase(selfSlicedSizes.begin() + newDim); + } + auto sourceSlicedSizes = source.sizes().vec(); + if (sourceSlicedSizes.size() > 0) { + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + newDim); + } + if (selfSlicedSizes.size() != sourceSlicedSizes.size() || + !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), + sourceSlicedSizes.begin())) { + std::stringstream ss; + ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; + ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << newDim; + ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; + TORCH_CHECK(false, ss.str()); + } + TORCH_CHECK_INDEX(source.dim() == 0 || numIndices == source.size(newDim), + "index_copy_(): Number of indices (", numIndices, + ") should be equal to source.size(newDim) (", source.size(newDim), ")"); +} + +Tensor& index_copy_npu_impl( + const int64_t dim, + const Tensor& index, + const Tensor& source, + Tensor& result) { + index_copy_npu_par_check(dim, index, source, result); + int64_t numIndices = index.numel(); + int64_t i; + if (result.dim() > 1) { + Tensor des; + Tensor src; + for (i = 0; i < numIndices; i++) { + des = at::native::select(result, dim, index[i].item()); + src = at::native::select(source, dim, i); + at::native::copy_npu_(des, src); + } + } else { + for (i = 0; i < numIndices; i++) { + result[i] = source[index[i].item()]; + } + } + return result; +} + +Tensor index_copy_npu( + const Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& source) { + Tensor result(self.clone()); + return index_copy_npu_impl(dim, index, source, result); + +} + +Tensor index_copy_npu( + const Tensor& self, + const Dimname dim, + const Tensor& index, + const Tensor& source) { + Tensor result(self.clone()); + return index_copy_npu_impl(dimname_to_position(self, dim), index, source, result); +} + +Tensor& index_copy_npu_( + Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& source) { + Tensor contiguousSelf(self); + if (!NpuUtils::check_match(&self)) { + contiguousSelf = NpuUtils::format_contiguous(self); + } + Tensor result = index_copy_npu_impl(dim, index, source, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + + return self; +} + +Tensor& index_copy_npu_( + Tensor& self, + const Dimname dim, + const Tensor& index, + const Tensor& source) { + Tensor contiguousSelf(self); + if (!NpuUtils::check_match(&self)) { + contiguousSelf = NpuUtils::format_contiguous(self); + } + Tensor result = index_copy_npu_impl(dimname_to_position(self, dim), index, source, contiguousSelf); + NpuUtils::format_fresh_view(self, result); + + return self; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/HistcKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp similarity index 38% rename from src/aten/src/ATen/native/npu/HistcKernelNpu.cpp rename to src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp index 61624a31693ca083e6f3eb3f0e0ed19ddc405b90..492007773216401faeafdadc57ea23b482ea4e59 100644 --- a/src/aten/src/ATen/native/npu/HistcKernelNpu.cpp +++ 
b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp @@ -14,65 +14,48 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" -#include "ATen/native/npu/utils/OpTemplate.h" +#include "ATen/native/npu/utils/OpAdapter.h" namespace at { namespace native { using namespace at::native::npu; -#define FLT_EPSILON 1.19209290E-07F - -bool is_zero(float x) -{ - if(x > -FLT_EPSILON && x < FLT_EPSILON){ - return true; - } - else{ - return false; - } -} - -Tensor& histc_out_npu( +Tensor linear_backward_out_npu( Tensor& result, - const Tensor& self, - int64_t bins, - Scalar min, - Scalar max) { + const Tensor& input, + const Tensor& weight, + bool transpose_x1, + bool transpose_x2) { + int64_t offset_x = 0; OpCommand cmd; - float max_value = CalcuOpUtil::get_scalar_float_value(max); - float min_value = CalcuOpUtil::get_scalar_float_value(min); - - if(max_value == min_value && is_zero(max_value)){ - // Execute reduce_max_d and reduce_min_d to get the min and max value - Tensor res_max = at::max(self); - Tensor res_min = at::min(self); - - max_value = CalcuOpUtil::get_scalar_float_value(res_max.item()); - min_value = CalcuOpUtil::get_scalar_float_value(res_min.item()); - } - cmd.Name("HistogramD") - .Input(self) - .Attr("bins", bins) - .Attr("min", min_value) - .Attr("max", max_value) + cmd.Name("MatMulV2") + .Input(input) + .Input(weight) .Output(result) + .Attr("transpose_x1", transpose_x1) + .Attr("transpose_x2", transpose_x2) + .Attr("offset_x", offset_x) .Run(); - return result; } -Tensor histc_npu(const Tensor& self, int64_t bins, Scalar min, Scalar max) { - // construct the output tensor of the NPU - Tensor result = at::empty_with_format( - {bins}, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - // calculate the output result of the NPU - histc_out_npu(result, self, bins, min, max); - - return result; +tuple linear_backward_npu( + const Tensor& grad, + const Tensor& input, + const Tensor& weight) { + SmallVector inputGradOutputSize = { + grad.size(0), + weight.size(1)}; + SmallVector weightGradOutputSize = { + grad.size(1), + input.size(1)}; + Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize); + Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize); + + linear_backward_out_npu(inputGrad, grad, weight, false, false); + linear_backward_out_npu(weightGrad, grad, input, true, false); + + return std::tie(inputGrad, weightGrad); } } // namespace native diff --git a/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f35c5de9fa2450586a10e89843f7d653b99207ba --- /dev/null +++ b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp @@ -0,0 +1,48 @@ +// Copyright (c) 2020 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ATen/native/npu/utils/OpAdapter.h" + +namespace at { +namespace native { +using namespace at::native::npu; + +Tensor linear_npu( + const Tensor& input, + const Tensor& weight, + const Tensor& bias) { + SmallVector outputSize = {input.size(0), weight.size(0)}; + Tensor output = OpPreparation::ApplyTensor(input, outputSize); + + int64_t offset_x = 0; + OpCommand cmd; + cmd.Name("MatMulV2") + .Input(input) + .Input(weight); + if (bias.defined()) { + cmd.Input(bias); + } + cmd.Output(output) + .Attr("transpose_x1", false) + .Attr("transpose_x2", true) + .Attr("offset_x", offset_x) + .Run(); + + return output; +} + +} // namespace native +} // namespace at diff --git a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp index c158f0309e2672a911e3144e7e302fbce01bc82b..d526dd2b568ab939801da6cda83e4f0d81ee01ec 100644 --- a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp @@ -25,6 +25,7 @@ tuple lstm_npu( const Tensor& input, const Tensor& weight, const Tensor& bias, + const Tensor& seqMask, const Tensor& h, const Tensor& c, bool has_biases, @@ -32,7 +33,9 @@ tuple lstm_npu( double dropout, bool train, bool bidirectional, - bool batch_first) { + bool batch_first, + bool flagSeq, + bool flagDirection) { // calculate the output size int64_t numStep = input.size(0); int64_t batchSize = input.size(1); @@ -49,41 +52,83 @@ tuple lstm_npu( Tensor fOutput = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); Tensor oOutput = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); Tensor tanhc = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_FRACTAL_NZ); - + + string direction = flagDirection? "REDIRECTIONAL" : "UNIDIRECTIONAL"; OpCommand cmd; cmd.Name("DynamicRNN") - .Input(input) - .Input(weight) - .Input(bias) - .Input() - .Input(h) - .Input(c) - .Output(yOutput) - .Output(hOutput) - .Output(cOutput) - .Output(iOutput) - .Output(jOutput) - .Output(fOutput) - .Output(oOutput) - .Output(tanhc) - .Attr("cell_type", (string)"LSTM") - .Attr("direction", (string)"UNIDIRECTIONAL") - .Attr("cell_depth", (int64_t)1) - .Attr("use_peephole", (bool)false) - .Attr("keep_prob", (float)1.0) - .Attr("cell_clip", (float)-1.0) - .Attr("num_proj", (int64_t)0) - .Attr("time_major", (bool)true) - .Attr("activation", (string)"tanh") - .Attr("forget_bias", (float)0.0) - .Attr("is_training", train) - .Run(); + .Input(input, "x") + .Input(weight, "w") + .Input(bias, "b"); + + //if input is PackSequence, seqMask is not None, Otherwise, it is None. 
+ if (!flagSeq){ + cmd.Input(); + } else{ + cmd.Input(seqMask, "seq_length"); + } + cmd.Input(h, "init_h") + .Input(c, "init_c") + .Output(yOutput) + .Output(hOutput) + .Output(cOutput) + .Output(iOutput) + .Output(jOutput) + .Output(fOutput) + .Output(oOutput) + .Output(tanhc) + .Attr("cell_type", (string)"LSTM") + .Attr("direction", direction) + .Attr("cell_depth", (int64_t)1) + .Attr("use_peephole", (bool)false) + .Attr("keep_prob", (float)1.0) + .Attr("cell_clip", (float)-1.0) + .Attr("num_proj", (int64_t)0) + .Attr("time_major", (bool)true) + .Attr("activation", (string)"tanh") + .Attr("forget_bias", (float)0.0) + .Attr("is_training", train) + .Run(); + //std::cout<<"yOutput: "<( - yOutput, hOutput, cOutput, iOutput, jOutput, fOutput, oOutput, tanhc); + yOutput, hOutput, cOutput, iOutput, jOutput, fOutput, oOutput, tanhc); } -tuple lstm_npu( +tuple get_wb_single_layer_direc( + const Tensor& input, + TensorList params, + bool hasBiases) { + // get weight + Tensor ihWeight = params[0]; + Tensor hhWeight = params[1]; + + Tensor weight = at::cat({ihWeight, hhWeight}, 1).t().to(input.dtype()); + + // get bias + Tensor bias = at::zeros(weight.size(1), weight.options()); + if (hasBiases) { + bias = at::add(params[2], params[3]).to(input.dtype()); + } + return std::tie(weight, bias); +} + +tuple get_wb_double_layer_or_bidirec( + const Tensor& input, + TensorList params, + bool hasBiases) { + Tensor weight; + Tensor bias; + if (hasBiases) { + weight = at::cat({params[4], params[5]}, 1).t().to(input.dtype()); + bias = at::add(params[6], params[7]).to(input.dtype()); + } else { + weight = at::cat({params[2], params[3]}, 1).t().to(input.dtype()); + bias = at::zeros(weight.size(1), weight.options()); + } + return std::tie(weight, bias); +} + +tuple lstm_single_layer_direc_npu( const Tensor& input, TensorList hx, TensorList params, @@ -92,7 +137,8 @@ tuple lstm_npu( double dropout, bool train, bool bidirectional, - bool batchFirst) { + bool batchFirst, + bool direction) { int64_t numStep = input.size(0); // get weight @@ -110,51 +156,169 @@ tuple lstm_npu( // get init_h, init_c Tensor h = hx[0]; Tensor c = hx[1]; - if(numLayers == 2) - { - h = hx[0].slice(0, 0, 1); - c = hx[1].slice(0, 0, 1); - } - - auto results = at::npu_lstm( - input, weight, bias, h, c, hasBiases, numLayers, dropout, train, bidirectional, batchFirst); + + Tensor seqMask = at::empty({0}, input.options()); + auto results = at::npu_lstm(input, weight, bias, seqMask, h, c, hasBiases, numLayers, dropout, + train, bidirectional, batchFirst, false, direction); // get the last dimension of the T-axis Tensor thOutput = at::unsqueeze(std::get<1>(results)[numStep-1], 0); Tensor tcOutput = at::unsqueeze(std::get<2>(results)[numStep-1], 0); - - //double layer LSTM - if (numLayers == 2) { + + return std::tie(std::get<0>(results), thOutput, tcOutput); +} + +tuple lstm_single_layer_bidirec_npu( + const Tensor& input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + int64_t numStep = input.size(0); + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + //caculate forward direction, direction of attr is UNIDIRECTIONAL(npu_lstm need add the attr of direction) + auto resultsForward = lstm_single_layer_direc_npu(input, {h, c}, params, hasBiases, + numLayers, dropout, train, bidirectional, batchFirst, false); + + //get w/ b/ h/ c of backward direction + Tensor weightBack; + Tensor biasBack; 
+ Tensor hBack = hx[0].slice(0, 1, 2); + Tensor cBack = hx[1].slice(0, 1, 2); + std::tie(weightBack, biasBack) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + Tensor seqMask = at::empty({0}, input.options()); + //caculate forward direction, direction of attr is REDIRECTIONAL + auto resultsBackward = at::npu_lstm(input, weightBack, biasBack, seqMask, hBack, cBack, + hasBiases, numLayers, dropout, train, bidirectional, batchFirst, false, true); + + // get the first dimension of the T-axis when caculate reverse direction + Tensor thOutput = at::unsqueeze(std::get<1>(resultsBackward)[0], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(resultsBackward)[0], 0); + + Tensor y = at::cat({std::get<0>(resultsForward), std::get<0>(resultsBackward)}, 2); + Tensor hOut = at::cat({std::get<1>(resultsForward), thOutput}, 0); + Tensor cOut = at::cat({std::get<2>(resultsForward), tcOutput}, 0); + + return std::tie(y, hOut, cOut); +} + +tuple lstm_double_layer_direc_npu( + const Tensor& input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + int64_t numStep = input.size(0); + //get h and c of first layer + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + //caculate first layer + auto results = lstm_single_layer_direc_npu(input, {h, c}, params, hasBiases, + numLayers, dropout, train, bidirectional, batchFirst, false); + + //get w/ b/ h/ c of twice layer Tensor weight2Layer; Tensor bias2Layer; Tensor h2layer = hx[0].slice(0, 1, 2); Tensor c2layer = hx[1].slice(0, 1, 2); - if (hasBiases) { - weight2Layer = at::cat({params[4], params[5]}, 1).t().to(input.dtype()); - bias2Layer = at::add(params[6], params[7]).to(input.dtype()); - } else { - weight2Layer = at::cat({params[2], params[3]}, 1).t().to(input.dtype()); - bias2Layer = at::zeros(weight2Layer.size(1), weight2Layer.options()); - } + std::tie(weight2Layer, bias2Layer) = get_wb_double_layer_or_bidirec(input, params, hasBiases); //output of first layer as input of second layer Tensor input2Layer = std::get<0>(results); + Tensor seqMask = at::empty({0}, input.options()); //caculate output of second layer - auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, h2layer, c2layer, - hasBiases, numLayers, dropout, train, bidirectional, batchFirst); + auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, seqMask, h2layer, c2layer, + hasBiases, numLayers, dropout, train, bidirectional, batchFirst, false, false); Tensor thOutput2Layer = at::unsqueeze(std::get<1>(results2Layer)[numStep-1], 0); Tensor tcOutput2Layer = at::unsqueeze(std::get<2>(results2Layer)[numStep-1], 0); - Tensor th = at::cat({thOutput, thOutput2Layer}, 0); - Tensor tc = at::cat({tcOutput, tcOutput2Layer}, 0); + Tensor th = at::cat({std::get<1>(results), thOutput2Layer}, 0); + Tensor tc = at::cat({std::get<2>(results), tcOutput2Layer}, 0); - return std::tie(std::get<0>(results2Layer), th, tc); + return std::tie(std::get<0>(results2Layer), th, tc); +} + +tuple lstm_npu( + const Tensor& _input, + TensorList hx, + TensorList params, + bool hasBiases, + int64_t numLayers, + double dropout, + bool train, + bool bidirectional, + bool batchFirst) { + //The operator of DynamicRnn only supports the T axis as the first axis. + auto input = batchFirst ? 
_input.transpose(0, 1) : _input; + + Tensor y; + Tensor h; + Tensor c; + //single layer + if(numLayers == 1){ + if(!bidirectional){ + std::tie(y, h, c) = lstm_single_layer_direc_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst, false); + } else { + std::tie(y, h, c) = lstm_single_layer_bidirec_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst); + } + } + + //double layer + if((numLayers == 2) && (!bidirectional)) { + std::tie(y, h, c) = lstm_double_layer_direc_npu(input, hx, params, hasBiases, numLayers, + dropout, train, bidirectional, batchFirst); + } + return std::tie(y, h, c); +} + +Tensor get_mask(const Tensor& input, const Tensor& batchSizes, const Tensor& h, int64_t maxLen){ + //caculate lengths, but input expected to be sorted + std::vector lens; + for (int64_t i = 0; i < input.size(1); ++i){ + auto batchSizesTemp = at::sub(batchSizes , i); + auto batchSizesBool = at::gt(batchSizesTemp, 0); + auto batchSizesInt = batchSizesBool.to(ScalarType::Int); + auto coutLen = at::sum(batchSizesInt, ScalarType::Int); + int64_t len = coutLen.item().toInt(); + lens.emplace_back(len); } + Tensor length = CalcuOpUtil::copy_tensor_host_to_device( + from_blob(lens.data(), {lens.size()}, at::kLong)); - return std::tie(std::get<0>(results), thOutput, tcOutput); + SmallVector maskList; + //Slice by T axis + for (int64_t i = 0; i < maxLen; ++i) { + //cacl mask + Tensor maskTemp1 = at::gt(length, i); + Tensor maskTemp2 = maskTemp1.reshape({1, input.size(1), 1}); + + //mask need to be expanded to (1,batch_size,hidden_size) + Tensor maskExpand = maskTemp2.expand({1, input.size(1), h.size(2)}); + maskList.emplace_back(maskExpand); + } + + //mask mast be half + Tensor mask = at::cat(maskList, 0).to(ScalarType::Half); + + return mask; } -std::tuple lstm_npu( +std::tuple lstm_onelayer_direc_packseq( const Tensor& data, const Tensor& batchSizes, TensorList hx, TensorList params, bool hasBiases, int64_t numLayers, double dropoutP, bool train, bool bidirectional) { @@ -170,53 +334,153 @@ std::tuple lstm_npu( // get init_h, init_c Tensor h = hx[0]; Tensor c = hx[1]; + + int64_t numStep = input.size(0); + + // get weight + Tensor ihWeight = params[0]; + Tensor hhWeight = params[1]; + Tensor weight = at::cat({ihWeight, hhWeight}, 1).t().to(input.dtype()); + + // get bias + Tensor bias = at::zeros(weight.size(1), weight.options()); + if (hasBiases) { + bias = at::add(params[2], params[3]).to(input.dtype()); + } int64_t maxLen = input.size(0); - std::vector outputs; - std::vector hxPrev = {h, c}; - //caculate lengths, but input expected to be sorted - std::vector lens; - for (int64_t i = 0; i < input.size(1); ++i){ - auto batchSizesTemp = at::sub(batchSizes , i); - auto batchSizesBool = at::gt(batchSizesTemp, 0); - auto batchSizesInt = batchSizesBool.to(ScalarType::Int); - auto coutLen = at::sum(batchSizesInt, ScalarType::Int); - int64_t len = coutLen.item().toInt(); - lens.emplace_back(len); - } - Tensor length = CalcuOpUtil::copy_tensor_host_to_device( - from_blob(lens.data(), {lens.size()}, at::kLong)); - - //Slice by T axis - for (int64_t i = 0; i < maxLen; ++i) { - Tensor step = input.slice(0, i, i + 1).contiguous().reshape({1, input.size(1), input.size(2)}); + Tensor mask = get_mask(input, batchSizes, h, maxLen); + auto results = at::npu_lstm(input, weight, bias, mask, h, c, hasBiases, numLayers, + dropoutP, train, bidirectional, false, true, false); - //calculate output of each times - auto results = lstm_npu(step, hxPrev, params, 
hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst); - - //get previous result - Tensor outputTemp = std::get<0>(results); - std::vector hxCurr = {std::get<1>(results), std::get<2>(results)}; + Tensor thOutput = at::unsqueeze(std::get<1>(results)[numStep-1], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(results)[numStep-1], 0); + + return std::tuple(std::get<0>(results), thOutput, tcOutput); +} - //cacl mask - Tensor maskTemp = at::gt(length, i); - Tensor mask = maskTemp.reshape({1, input.size(1), 1}); +std::tuple lstm_onelayer_bidirec_packseq( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + //length of T axis + int64_t t_size = batchSizes.numel(); + + //T * B ** + Tensor input = data.reshape({t_size, data.size(0)/t_size, data.size(1)}); + + // batch_first is false + bool batchFirst = false; + + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); - //calculate real output of each times - Tensor maskNeg = at::logical_not(mask); - Tensor output = at::mul(outputTemp, mask); + auto resultsForward = lstm_onelayer_direc_packseq(data, batchSizes, {h, c}, params, hasBiases, + numLayers, dropoutP, train, bidirectional); - //updata hx - h = at::mul(mask, hxCurr[0]) + at::mul(maskNeg, hxPrev[0]); - c = at::mul(mask, hxCurr[1]) + at::mul(maskNeg, hxPrev[1]); - hxPrev = {h, c}; + //get w/ b/ h/ c of backward direction + Tensor hBack = hx[0].slice(0, 1, 2); + Tensor cBack = hx[1].slice(0, 1, 2); + + Tensor weightBack; + Tensor biasBack; + std::tie(weightBack, biasBack) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + int64_t maxLen = input.size(0); + + Tensor mask = get_mask(input, batchSizes, h, maxLen); + //caculate forward direction, direction of attr is REDIRECTIONAL + auto resultsBackward = at::npu_lstm(input, weightBack, biasBack, mask, hBack, cBack, + hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst, true, true); + + // get the first dimension of the T-axis when caculate reverse direction + Tensor thOutput = at::unsqueeze(std::get<1>(resultsBackward)[0], 0); + Tensor tcOutput = at::unsqueeze(std::get<2>(resultsBackward)[0], 0); + + Tensor y = at::cat({std::get<0>(resultsForward), std::get<0>(resultsBackward)}, 2); + Tensor hOut = at::cat({std::get<1>(resultsForward), thOutput}, 0); + Tensor cOut = at::cat({std::get<2>(resultsForward), tcOutput}, 0); - outputs.push_back(output); - } - Tensor result = at::cat(outputs, 0); + return std::tie(y, hOut, cOut); +} + +std::tuple lstm_double_layer_direc_packseq( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + //length of T axis + int64_t t_size = batchSizes.numel(); - return std::tie(result, h, c); + //T * B ** + Tensor input = data.reshape({t_size, data.size(0)/t_size, data.size(1)}); + + // batch_first is false + bool batchFirst = false; + + //get h and c of forward direction + Tensor h = hx[0].slice(0, 0, 1); + Tensor c = hx[1].slice(0, 0, 1); + + int64_t numStep = input.size(0); + + auto results = lstm_onelayer_direc_packseq(data, batchSizes, {h, c}, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + + //get w/ b/ h/ c of twice layer + Tensor weight2Layer; + Tensor bias2Layer; + Tensor h2layer = hx[0].slice(0, 1, 2); + Tensor c2layer = hx[1].slice(0, 1, 2); + std::tie(weight2Layer, 
bias2Layer) = get_wb_double_layer_or_bidirec(input, params, hasBiases); + + int64_t maxLen = input.size(0); + + Tensor mask = get_mask(input, batchSizes, h, maxLen); + + //output of first layer as input of second layer + Tensor input2Layer = std::get<0>(results); + + //caculate output of second layer + auto results2Layer = at::npu_lstm(input2Layer, weight2Layer, bias2Layer, mask, h2layer, c2layer, + hasBiases, numLayers, dropoutP, train, bidirectional, batchFirst, true, false); + Tensor thOutput2Layer = at::unsqueeze(std::get<1>(results2Layer)[numStep-1], 0); + Tensor tcOutput2Layer = at::unsqueeze(std::get<2>(results2Layer)[numStep-1], 0); + Tensor th = at::cat({std::get<1>(results), thOutput2Layer}, 0); + Tensor tc = at::cat({std::get<2>(results), tcOutput2Layer}, 0); + + return std::tie(std::get<0>(results2Layer), th, tc); +} + +std::tuple lstm_npu( + const Tensor& data, const Tensor& batchSizes, TensorList hx, + TensorList params, bool hasBiases, + int64_t numLayers, double dropoutP, bool train, bool bidirectional) { + Tensor y; + Tensor h; + Tensor c; + + // batch_first is false + bool batchFirst = false; + + //single layer + if(numLayers == 1){ + if(!bidirectional){ + std::tie(y, h, c) = lstm_onelayer_direc_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } else { + std::tie(y, h, c) = lstm_onelayer_bidirec_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } + } + + //double layer + if((numLayers == 2) && (!bidirectional)) { + std::tie(y, h, c) = lstm_double_layer_direc_packseq(data, batchSizes, hx, params, hasBiases, + numLayers, dropoutP, train, bidirectional); + } + return std::tie(y, h, c); } } // namespace native diff --git a/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp deleted file mode 100644 index a1a771322f09ac13f51b25d53e85eb821682a7e9..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/ModeKernelNpu.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2020 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
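For orientation, the LstmKernelNpu.cpp changes above replace the old two-branch lstm_npu with small helpers: lstm_single_layer_direc_npu, lstm_single_layer_bidirec_npu and lstm_double_layer_direc_npu for padded input, plus the *_packseq variants and get_mask for packed input, all of which end in a call to the widened at::npu_lstm that now also takes a sequence mask ("seq_length"), a flagSeq flag and a direction flag. The sketch below only restates the top-level dispatch so the collapsed hunk is easier to follow; it is a paraphrase rather than the patched source, and the std::tuple<Tensor, Tensor, Tensor> return type is an assumption, since the extracted diff text lost the template arguments.

// A minimal sketch, not the patched file: the helper names come from the
// hunk above; signatures are abbreviated and the tuple return type is assumed.
#include <ATen/ATen.h>
#include <tuple>

using at::Tensor;
using at::TensorList;
using LstmResult = std::tuple<Tensor, Tensor, Tensor>;  // assumed (y, h, c)

// Helpers introduced by the patch (declarations abbreviated here).
LstmResult lstm_single_layer_direc_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool, bool direction);
LstmResult lstm_single_layer_bidirec_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool);
LstmResult lstm_double_layer_direc_npu(const Tensor&, TensorList, TensorList,
    bool, int64_t, double, bool, bool, bool);

LstmResult lstm_npu_sketch(
    const Tensor& _input, TensorList hx, TensorList params,
    bool hasBiases, int64_t numLayers, double dropout,
    bool train, bool bidirectional, bool batchFirst) {
  // DynamicRNN only accepts the time axis first, so batch-first input is
  // transposed before any kernel call.
  Tensor input = batchFirst ? _input.transpose(0, 1) : _input;

  Tensor y, h, c;
  if (numLayers == 1 && !bidirectional) {
    // single npu_lstm call, forward direction
    std::tie(y, h, c) = lstm_single_layer_direc_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst, /*direction=*/false);
  } else if (numLayers == 1 && bidirectional) {
    // forward plus reverse npu_lstm calls; outputs are concatenated on the
    // hidden dimension, h/c on the direction dimension
    std::tie(y, h, c) = lstm_single_layer_bidirec_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst);
  } else if (numLayers == 2 && !bidirectional) {
    // the first layer's output feeds a second npu_lstm call for layer two
    std::tie(y, h, c) = lstm_double_layer_direc_npu(
        input, hx, params, hasBiases, numLayers, dropout,
        train, bidirectional, batchFirst);
  }
  // Other layer/direction combinations are not routed through this path
  // by the patch.
  return std::make_tuple(y, h, c);
}

The packed-sequence entry point follows the same split, with get_mask turning batch_sizes into the per-step mask that npu_lstm consumes as its "seq_length" input.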
- -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -SmallVector mode_npu_output_size( - const Tensor& self, - int64_t dim, - bool keepdim) { - SmallVector outputSize; - if(dim==0){ - outputSize={self.size(1)}; - }; - if(dim==-1 || dim==1){ - outputSize={self.size(0)}; - }; - return outputSize; -} - -SmallVector mode_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector mode_npu_output( - const SmallVector& outputTensor) { - auto outputs = CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); - - string indicesRealType = "int64"; - outputs[outputs.size() - 1].realDataType = indicesRealType; - return outputs; -} - -SmallVector mode_npu_attr( - int64_t dim, bool keepdim) { - NPUAttrDesc npuAttrDim = NPUAttrDesc("dim", dim); - NPUAttrDesc npuAttrKeepdim = NPUAttrDesc("keepdim", keepdim); - - SmallVector attrs = {npuAttrDim, - npuAttrKeepdim}; - return attrs; -} - - -tuple mode_out_npu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t dim, - bool keepdim) { - // constructs the input and output NPUTensorDesc - auto inputs = mode_npu_input({self}); - auto outputs = mode_npu_output({values, indices}); - - // constructs the attr of the NPUAttrDesc - auto attrs = mode_npu_attr(dim,keepdim); - - // executing the NPU operator - CalcuOpUtil::execute_npu_operate( - "Mode", inputs, outputs, attrs); - - return tuple(values, indices); -} - -tuple _mode_out_npu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t dim, - bool keepdim) { - - return mode_out_npu(values,indices,self, dim, keepdim); -} - -tuple mode_npu( - const Tensor& self, - int64_t dim, - bool keepdim -) { - // calculate the output size - auto outputSize = mode_npu_output_size(self, dim,keepdim); - - // construct the output tensor of the NPU - Tensor values= at::empty_with_format( - outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self)); - - Tensor indices = at::empty_with_format( - outputSize, self.options().dtype(at::kLong), CalcuOpUtil::get_tensor_npu_format(self)); - - - // calculate the output result of the NPU - mode_out_npu( - values, indices, self, dim,keepdim); - return tuple(values, indices); - -} - -tuple _mode_npu( - const Tensor& self, - int64_t dim, - bool keepdim -) { - return mode_npu(self,dim, keepdim); - } - -} // namespace native -} // namespace at \ No newline at end of file diff --git a/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp index ae5b445928d4be0289ec7b77c0387ec1a1ccc62e..5d0a319d9e6cecb1914424eb873c5919059b42e8 100644 --- a/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/NormalizeBatchKernelNpu.cpp @@ -37,7 +37,7 @@ static inline void normalize_batch_check( "self num ", self.size(0)); TORCH_CHECK( - 1 >= normalize_type >= 0, + normalize_type >= 0 && normalize_type <= 1, "normalize_type expected to be in range [0, 1], but got ", normalize_type); } diff --git a/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp deleted file mode 100644 index f85580541670bc9b139dc2f5cb7308af4e85dc90..0000000000000000000000000000000000000000 --- a/src/aten/src/ATen/native/npu/QuantizedMaxPool2dKernelNpu.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// 
Copyright (c) 2020, Huawei Technologies.All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ATen/native/npu/utils/CalcuOpUtil.h" -#include "ATen/native/npu/utils/KernelNpuOutputSize.h" -#include "ATen/native/npu/utils/NpuUtils.h" - -namespace at { -namespace native { -using namespace at::native::npu; - -SmallVector quantized_max_pool2d_npu_input( - const SmallVector& inputTensor) { - return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor); -} - -SmallVector quantized_max_pool2d_npu_output( - const SmallVector& outputTensor) { - auto outputs = CalcuOpUtil::create_npu_output_tensor_desc(outputTensor); - - return outputs; -} - -SmallVector quantized_max_pool2d_npu_attr( - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - int64_t strideH = 1; - int64_t strideW = 1; - if (stride.empty()) { - strideH = kernel_size[0]; - strideW = kernel_size[1]; - } else { - strideH = stride[0]; - strideW = stride[1]; - } - - SmallVector kernelSize_t = {kernel_size[0], kernel_size[1]}; - SmallVector strides_t = {strideH, strideW}; - SmallVector paddings_t = {padding[0], padding[0], padding[1], padding[1]}; - SmallVector dilations_t = {dilation[0], dilation[0], dilation[1], dilation[1]}; - - IntArrayRef kernelSize = IntArrayRef(kernelSize_t); - IntArrayRef strides = IntArrayRef(strides_t); - IntArrayRef paddings = IntArrayRef(paddings_t); - IntArrayRef dilations = IntArrayRef(dilations_t); - NPUAttrDesc npuAttrKsize = NPUAttrDesc("window", kernelSize); - NPUAttrDesc npuAttrStrides = NPUAttrDesc("stride", strides); - NPUAttrDesc npuAttrMode = NPUAttrDesc("mode", (int64_t) 0); - NPUAttrDesc npuAttrPadding = NPUAttrDesc("pad", paddings); - NPUAttrDesc npuAttrDilation = NPUAttrDesc("dilation", dilations); - NPUAttrDesc npuAttrGlobalPooling = NPUAttrDesc("global_pooling", false); - NPUAttrDesc npuAttrCeilmode = NPUAttrDesc("ceil_mode", (int64_t) !ceil_mode); - - SmallVector attrs = {npuAttrKsize, - npuAttrStrides, - npuAttrMode, - npuAttrPadding, - npuAttrDilation, - npuAttrGlobalPooling, - npuAttrCeilmode}; - - return attrs; -} - -Tensor& quantized_max_pool2d_out_npu( - Tensor& output, - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - // constructs the input and output NPUTensorDesc - auto inputs = quantized_max_pool2d_npu_input({self}); - auto outputs = quantized_max_pool2d_npu_output({output}); - - // constructs the attr of the NPUAttrDesc - auto attrs = quantized_max_pool2d_npu_attr( - kernel_size, stride, padding, dilation, ceil_mode); - - // executing the NPU operator - CalcuOpUtil::execute_npu_operate( - "Pooling", inputs, outputs, attrs); - - return output; -} - -Tensor quantized_max_pool2d_npu( - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - bool ceil_mode) { - // calculate the output size - auto outputSizes = quantized_max_pool2d_npu_output_size( 
- self, kernel_size, stride, padding, dilation, ceil_mode); - - // construct the output tensor of the NPU - Tensor output = at::empty_with_format( - outputSizes, self.options(), ACL_FORMAT_NC1HWC0); - - // calculate the output result of the NPU - quantized_max_pool2d_out_npu( - output, self, kernel_size, stride, padding, dilation, ceil_mode); - return output; -} - -} // namespace native -} // namespace at diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp index c919f9cd027da080114d99707dc58187f159e3a9..d299fde881a3b0778ed46ca6777ea039a7de41fd 100644 --- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp @@ -29,43 +29,31 @@ tuple std_mean_out_npu_nocheck( bool unbiased, bool keepdim) { // executing the NPU operator - if (!c10::npu::OptionsManager::CheckDynamicEnable()) { - OpCommand cmd; - cmd.Name("ReduceStd") - .Input(self) - .Output(resultStd) - .Output(resultMean) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); - } else { - OpCommand cmd1; - cmd1.Name("ReduceMeanD") - .Input(self) - .Output(resultMean) - .Attr("axes", dim) - .Attr("keep_dims", keepdim) - .Run(); - Tensor resultMeanCopy = resultMean; - if (resultMean.dim() != 0 && keepdim == false) { - auto dimVector = array_to_small_vector(dim); - std::sort(dimVector.begin(), dimVector.end()); - for (int64_t i = 0; i < dimVector.size(); i++) { - resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]); - } + OpCommand cmd1; + cmd1.Name("ReduceMeanD") + .Input(self) + .Output(resultMean) + .Attr("axes", dim) + .Attr("keep_dims", keepdim) + .Run(); + Tensor resultMeanCopy = resultMean; + if (resultMean.dim() != 0 && keepdim == false) { + auto dimVector = array_to_small_vector(dim); + std::sort(dimVector.begin(), dimVector.end()); + for (int64_t i = 0; i < dimVector.size(); i++) { + resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]); } - resultMeanCopy = resultMeanCopy.expand(self.sizes()); - OpCommand cmd2; - cmd2.Name("ReduceStdWithMean") - .Input(self) - .Input(resultMeanCopy) - .Output(resultStd) - .Attr("dim", dim) - .Attr("unbiased", unbiased) - .Attr("keepdim", keepdim) - .Run(); } + resultMeanCopy = resultMeanCopy.expand(self.sizes()); + OpCommand cmd2; + cmd2.Name("ReduceStdWithMean") + .Input(self) + .Input(resultMeanCopy) + .Output(resultStd) + .Attr("dim", dim) + .Attr("unbiased", unbiased) + .Attr("keepdim", keepdim) + .Run(); return std::tie(resultStd, resultMean); } diff --git a/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp index ff95b80072a315ec489e165e9f09c83a878437be..4f1944c839e8ee0bc90d8a9a221a190600d2938c 100644 --- a/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/YoloBoxesEncodeKernelNpu.cpp @@ -47,7 +47,7 @@ static inline void yolo_boxes_encode_check( "gt_bboxes num ", gt_bboxes.size(0)); TORCH_CHECK( - at::isIntegralType(stride.scalar_type()) && stride.scalar_type() != ScalarType::Long, + at::isIntegralType(stride.scalar_type(), true) && stride.scalar_type() != ScalarType::Long, "int32 strdie tensor expected but got a tensor with dtype: ", stride.scalar_type()); } diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp index 73dcdd8ec5cccb9633c455dee0a320dd05ea70cc..856903449f49cc5039d27cb643998ad2182845a4 100644 --- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp +++ 
b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp @@ -16,11 +16,13 @@ #include "OpParamMaker.h" #include #include "c10/npu/NPUQueue.h" +#include "c10/npu/NPUCachingAllocator.h" #include #include "ATen/native/npu/aoe/AutoTune.h" #include "ATen/native/npu/utils/DynamicShapeUtil.h" #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h" #include "ATen/native/npu/utils/CalcuOpUtil.h" +#include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/interface/EnvVariables.h" namespace at { @@ -161,19 +163,24 @@ aclError OpCommandImpl::InnerRun(string name, AclExecParam& params) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; } - auto ret = aclopCompileAndExecute( - name.c_str(), - inputSize, - params.inDesc.data(), - params.inBuffer.data(), - outputSize, - params.outDesc.data(), - params.outBuffer.data(), - params.attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream); + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( + name.c_str(), + inputSize, + params.inDesc.data(), + params.inBuffer.data(), + outputSize, + params.outDesc.data(), + params.outBuffer.data(), + params.attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); } @@ -194,7 +201,9 @@ int ExecFunc(void* in, aclrtStream stream) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); reset_flag = true; } - ret = aclopCompileAndExecute( + int index = 0; + do { + ret = aclopCompileAndExecute( (cur_paras->opType).c_str(), cur_paras->paras.input_num, cur_paras->paras.input_desc, @@ -207,6 +216,8 @@ int ExecFunc(void* in, aclrtStream stream) { ACL_COMPILE_SYS, nullptr, stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); } diff --git a/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp index 46084f0740dceb4b9058a045cc90bd67e076c264..8cb984be1193c1d52c55bf9737b38f5eeda55812 100644 --- a/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/normalization/BatchNormBackwardKernelNpu.cpp @@ -211,8 +211,8 @@ tuple batch_norm_backward_npu( // construct the output tensor of the NPU Tensor grad_input = OpPreparation::ApplyTensor(self_4d.sizes(), self_4d.options(), self_4d); - Tensor grad_weight = OpPreparation::ApplyTensor(weight_tensor.sizes(), weight_tensor.options(), weight_tensor); - Tensor grad_bias = OpPreparation::ApplyTensor(weight_tensor.sizes(), weight_tensor.options(), weight_tensor); + Tensor grad_weight = OpPreparation::ApplyTensor(weight_tensor, weight_tensor.options().dtype(ScalarType::Float)); + Tensor grad_bias = OpPreparation::ApplyTensor(weight_tensor, weight_tensor.options().dtype(ScalarType::Float)); // calculate the output result of the NPU batch_norm_backward_impl( diff --git a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp index aae01114039912bbbeb7f35270fc8362492c9264..0862fa5d7bda43a7dce5e3ac9bb115c58a29dc4b 100644 --- a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp +++ b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp @@ -20,8 +20,8 @@ namespace at { namespace native { using namespace 
at::native::npu; -Tensor& avg_pool2d_out_npu( - Tensor& out, +Tensor& avg_pool2d_out_npu_nocheck( + Tensor& result, const Tensor& self, IntArrayRef kernel_size, IntArrayRef stride, @@ -29,29 +29,74 @@ Tensor& avg_pool2d_out_npu( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - string padding_str = ceil_mode ? "SAME" : "VALID"; + if (padding.size() == 1) { + SmallVector paddings = {padding[0], padding[0]}; + padding = IntArrayRef(paddings); + } + // required attr int64_t strideH = 1; int64_t strideW = 1; - if (!stride.empty()) { strideH = stride[0]; strideW = stride[1]; } - SmallVector kernelSize = {1, 1, kernel_size[0], kernel_size[1]}; SmallVector stridesSize = {1, 1, strideH, strideW}; + SmallVector pads = {padding[0], padding[0], padding[1], padding[1]}; + OpCommand cmd; - cmd.Name("AvgPool") - .Input(self) - .Output(out) - .Attr("ksize", kernelSize) - .Attr("strides", stridesSize) - .Attr("padding", padding_str) - .Attr("data_format", (string)"NCHW") - .Run(); - - return out; + cmd.Name("AvgPoolV2") + .Input(self) + .Output(result) + .Attr("ksize", kernelSize) + .Attr("strides", stridesSize) + .Attr("padding_mode", (string)"CALCULATED") + .Attr("pads", pads) + .Attr("data_format", (string)"NCHW") + .Attr("global_pooling", false) + .Attr("ceil_mode", ceil_mode) + .Attr("exclusive", true) + .Run(); + + return result; +} + +Tensor& avg_pool2d_out_npu( + Tensor& result, + const Tensor& self, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override) { + auto outputSize = avg_pool2d_npu_output_size( + self, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override); + + OpPreparation::CheckOut( + {self}, + result, + self, + outputSize); + + avg_pool2d_out_npu_nocheck( + result, + self, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override); + + return result; } Tensor avg_pool2d_npu( diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp index c5e2eba6ecc3bfdcad270781a578eb2eb01461e2..412d1fc32b7bca4bb8f5d7bcac31eee8458a5bc8 100644 --- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp @@ -638,38 +638,46 @@ void CalcuOpUtil::execute_npu_operate( NPU_LOGD("Op %s aclopCompileAndExecute Run.", opName.c_str()); if (PyGILState_Check()) { Py_BEGIN_ALLOW_THREADS - ACL_REQUIRE_OK_OP( - aclopCompileAndExecute( - opName.c_str(), - params.input_num, - params.input_desc, - params.input_data_buf, - params.output_num, - params.output_desc, - params.output_data_buf, - attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream), - opName.c_str()); + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( + opName.c_str(), + params.input_num, + params.input_desc, + params.input_data_buf, + params.output_num, + params.output_desc, + params.output_data_buf, + attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + ACL_REQUIRE_OK_OP(ret, opName.c_str()); Py_END_ALLOW_THREADS } else { - ACL_REQUIRE_OK_OP( - aclopCompileAndExecute( - opName.c_str(), - params.input_num, - params.input_desc, - params.input_data_buf, - params.output_num, - params.output_desc, - params.output_data_buf, - attr, - ACL_ENGINE_SYS, - ACL_COMPILE_SYS, - NULL, - stream), - opName.c_str()); + aclError ret; + int index = 0; + 
do { + ret = aclopCompileAndExecute( + opName.c_str(), + params.input_num, + params.input_desc, + params.input_data_buf, + params.output_num, + params.output_desc, + params.output_data_buf, + attr, + ACL_ENGINE_SYS, + ACL_COMPILE_SYS, + NULL, + stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + ACL_REQUIRE_OK_OP(ret, opName.c_str()); } if (reset_flag) { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); diff --git a/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp b/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp index f4b42b2c2eb01a00efc247db0c1955f9e9e7609b..c972055dce6aa289c6a79813cd8823465fe56cfb 100644 --- a/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp +++ b/src/aten/src/ATen/native/npu/utils/DynamicShapeUtil.cpp @@ -17,6 +17,7 @@ #include "DynamicShapeUtil.h" #include #include +#include "ATen/native/npu/utils/NpuUtils.h" #include "ATen/native/npu/dynamicstrategy/Strategy.h" #include "ATen/native/npu/frame/OpDynamicCmdHelper.h" #include "ATen/native/npu/frame/OpDynamicParamMaker.h" @@ -331,7 +332,10 @@ aclError DynamicShapeUtil::ExecuteDynamic( ExecuteParas& cur_paras, aclrtStream stream) { auto params = OpDynamicCmdHelper::CreateDynamicRunParams(cur_paras); - return aclopExecuteV2( + aclError ret; + int index = 0; + do { + ret = aclopExecuteV2( std::get<0>(params).c_str(), std::get<1>(params), const_cast(std::get<2>(params)), @@ -341,6 +345,10 @@ aclError DynamicShapeUtil::ExecuteDynamic( std::get<6>(params), const_cast(std::get<7>(params)), stream); + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); + + return ret; } void DynamicShapeUtil::staticCompileAndExecute( @@ -349,9 +357,11 @@ void DynamicShapeUtil::staticCompileAndExecute( aclrtStream stream) { std::string opName = cur_paras.opType; NPU_LOGD(" Op %s aclopCompileAndExecute Run.", opName.c_str()); - aclError ret; logUtil.SetStartTime(); - ret = aclopCompileAndExecute( + aclError ret; + int index = 0; + do { + ret = aclopCompileAndExecute( opName.c_str(), cur_paras.paras.input_num, cur_paras.paras.input_desc, @@ -364,7 +374,8 @@ void DynamicShapeUtil::staticCompileAndExecute( ACL_COMPILE_SYS, NULL, stream); - + ++index; + } while(NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); if (ret != 0) { C10_NPU_SHOW_ERR_MSG(); logUtil.PrintLog(steps_, key, "Static Compile And Execute Failed"); diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp index 150711feecf70d648b02167e36b80eb5f29d18a1..773f25ab306403e43169c5a489625a18d9aea2a5 100644 --- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp +++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp @@ -603,7 +603,15 @@ SmallVector nonzero_npu_output_size(const Tensor& self){ int64_t dim = self.dim(); Tensor boolSelf = self.npu_dtype_cast(ScalarType::Bool); Tensor intSelf = boolSelf.npu_dtype_cast(ScalarType::Int); - Tensor coutNonzeroSelf = at::sum(intSelf, ScalarType::Int); + + Tensor coutNonzeroSelf = intSelf; + if (self.numel() > 10000000) { + //Ensure outputsize correctly in large shape case + coutNonzeroSelf = at::sum(intSelf, ScalarType::Long); + } else { + coutNonzeroSelf = at::sum(intSelf, ScalarType::Int); + } + int64_t nonzeroNum = coutNonzeroSelf.item().toInt(); SmallVector outputSize = {nonzeroNum, dim}; return outputSize; diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp 
b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp index daa10e3ab1c6e331dc08102e8d7894a3a2ba8f16..368b1478c02da837f59b247dd7cdc7373effc36a 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp @@ -16,7 +16,6 @@ #include #include "NpuUtils.h" -#include "c10/npu/NPUCachingAllocator.h" #include "c10/npu/register/OptionRegister.h" #include "CalcuOpUtil.h" @@ -312,7 +311,20 @@ Tensor NpuUtils::format_contiguous_add_copy_optimize(const Tensor& src) { return src; } - +bool NpuUtils::IsOomError(aclError ret, int index) +{ + if (ret == ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) { + int deviceId = 0; + // free devcie cached memory when return value of the first op execution is oom + if (index == 1) { + C10_NPU_CHECK(aclrtGetDevice(&deviceId)); + c10::npu::NPUCachingAllocator::FreeDeviceCachedMemory(deviceId); + return true; + } + AT_ERROR("NPU out of memory. device id: ", deviceId); + } + return false; +} } // namespace npu } // namespace native } // namespace at diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.h b/src/aten/src/ATen/native/npu/utils/NpuUtils.h index d9797e289977defac21ded7f2ed0793debf6ec5c..34849d55cad36c5333a20af8bc900313b8c8c2a4 100644 --- a/src/aten/src/ATen/native/npu/utils/NpuUtils.h +++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.h @@ -18,10 +18,12 @@ #define __NATIVE_NPU_UTILS_NUP_UTILS__ #include +#include "c10/npu/NPUCachingAllocator.h" #include #include #include #include +#include #include #include #include "ATen/ATen.h" @@ -41,6 +43,7 @@ const int SHAPE_SIZE = 8; // HALF_MAX and HALF_MIN of NPU support const int NPU_HALF_MAX = 65504; const int NPU_HALF_MIN = -65504; +const int NPU_MAX_OP_EXEC_TRY_NUM = 2; typedef enum MemoryType{ MEMORY_DEVICE, @@ -59,6 +62,7 @@ class NpuUtils { const Tensor& y); static bool check_5d_5d_match(const Tensor& tensor); + static bool IsOomError(aclError ret, int index); }; } // namespace npu } // namespace native diff --git a/src/c10/npu/NPUCachingAllocator.cpp b/src/c10/npu/NPUCachingAllocator.cpp index f179b6b23d50fbe7d6e31cc2c8109259305d78e9..cbda658019122640baf163d0297ae41e2f6819a6 100644 --- a/src/c10/npu/NPUCachingAllocator.cpp +++ b/src/c10/npu/NPUCachingAllocator.cpp @@ -441,6 +441,7 @@ struct THNCachingAllocator { void emptyCache() { std::lock_guard lock(mutex); synchronize_and_free_events(nullopt); + c10::npu::npuSynchronizeDevice(); free_blocks(large_blocks, large_blocks.begin(), large_blocks.end()); free_blocks(small_blocks, small_blocks.begin(), small_blocks.end()); } @@ -774,6 +775,7 @@ struct THNCachingAllocator { Block lower_bound(device, nullptr, 0); Block upper_bound(device + 1, nullptr, 0); + c10::npu::npuSynchronizeDevice(); free_blocks( large_blocks, large_blocks.lower_bound(&lower_bound), @@ -1195,6 +1197,11 @@ void raw_delete(void* ptr) { caching_allocator.free(ptr); } +void FreeDeviceCachedMemory(int device) +{ + caching_allocator.free_cached_blocks(device); + +} } // namespace NPUCachingAllocator } // namespace npu diff --git a/src/c10/npu/NPUCachingAllocator.h b/src/c10/npu/NPUCachingAllocator.h index 4c38309b4be18e723fda8353112af5d47510d2d8..5388f7bb5ecf20f7c81c54f87440f8b18eb107f8 100644 --- a/src/c10/npu/NPUCachingAllocator.h +++ b/src/c10/npu/NPUCachingAllocator.h @@ -143,6 +143,7 @@ C10_NPU_API std::mutex* getFreeMutex(); C10_NPU_API std::shared_ptr getIpcDevPtr(std::string handle); +C10_NPU_API void FreeDeviceCachedMemory(int device); } // namespace NPUCachingAllocator } // namespace npu diff --git 
a/src/third_party/acl/inc/ge/ge_error_codes.h b/src/third_party/acl/inc/ge/ge_error_codes.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0f8644463c9a7cc7b760ef184b5d37869e9d605
--- /dev/null
+++ b/src/third_party/acl/inc/ge/ge_error_codes.h
@@ -0,0 +1,76 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_EXTERNAL_GE_GE_ERROR_CODES_H_
+#define INC_EXTERNAL_GE_GE_ERROR_CODES_H_
+
+#if defined(_MSC_VER)
+#ifdef FUNC_VISIBILITY
+#define GE_FUNC_VISIBILITY _declspec(dllexport)
+#else
+#define GE_FUNC_VISIBILITY
+#endif
+#else
+#ifdef FUNC_VISIBILITY
+#define GE_FUNC_VISIBILITY __attribute__((visibility("default")))
+#else
+#define GE_FUNC_VISIBILITY
+#endif
+#endif
+
+#include <stdint.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000;
+static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007;
+static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012;
+static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013;
+static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014;
+static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015;
+static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016;
+static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017;
+static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018;
+static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019;
+static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020;
+static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021;
+static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022;
+static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000;
+static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001;
+static const uint32_t ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED = 245002;
+static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000;
+static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004;
+static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005;
+static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006;
+static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007;
+static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008;
+static const uint32_t
ACL_ERROR_GE_UNLOAD_MODEL = 545009; + +#ifdef __cplusplus +} // namespace ge +#endif +#endif // INC_EXTERNAL_GE_GE_ERROR_CODES_H_ diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml index 046aad5032c2ef0e38c53ab8b859e2703fd5cf9d..1db83b1c5a6a2870f5721b3d2483ec24b45e2ab3 100644 --- a/src/tools/autograd/derivatives.yaml +++ b/src/tools/autograd/derivatives.yaml @@ -1644,7 +1644,7 @@ - name: nonzero(Tensor self) -> Tensor output_differentiability: [False] -- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +- name: npu_lstm(Tensor input, Tensor weight, Tensor bias, Tensor seqMask, Tensor h, Tensor c, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, bool flagSeq, bool direction) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) output_differentiability: [True, True, True, False, False, False, False, False] input, weight, bias, h, c: npu_lstm_backward(grads[0], grads[1], grads[2], input, weight, bias, h, c, result0, result1, result2, result3, result4, result5, result6, result7) @@ -1687,4 +1687,8 @@ input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated) - name: npu_mish(Tensor self) -> Tensor - self: npu_mish_backward(grad, self) \ No newline at end of file + self: npu_mish_backward(grad, self) + +- name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor + input, weight: npu_linear_backward(grad, input, weight) + bias: maybe_multiply(grad, 1) \ No newline at end of file diff --git a/test/test_npu/test_avg_pool2d_backward.py b/test/test_npu/test_avg_pool2d_backward.py index 28beefd1f72eba3b0535bf268c5dd90074d724c8..62cf003d92d98adfb3c0a4e32e357b59ff34138e 100644 --- a/test/test_npu/test_avg_pool2d_backward.py +++ b/test/test_npu/test_avg_pool2d_backward.py @@ -56,7 +56,7 @@ class TestAvgPool2dBackward(TestCase): [np.float32, 0, (64, 10, 16, 14)], [np.float32, 3, (256, 2048, 8, 8)], [np.float32, 4, (32, 1, 2, 2)], - [np.float32, 29, (10, 128, 16, 16)] + [np.float32, 0, (10, 128, 16, 16)] ] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 1, 100) diff --git a/test/test_npu/test_histc.py b/test/test_npu/test_histc.py deleted file mode 100644 index 7a80dc5b7af52e3d1af051ffcac01f0e824602a2..0000000000000000000000000000000000000000 --- a/test/test_npu/test_histc.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
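The OpParamMaker.cpp, CalcuOpUtil.cpp and DynamicShapeUtil.cpp hunks further up all wrap their ACL execute call in the same loop: run once, and if the call returns ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED (245002 in the new ge_error_codes.h header above), NpuUtils::IsOomError releases the device's cached blocks through NPUCachingAllocator::FreeDeviceCachedMemory and the call is retried once, NPU_MAX_OP_EXEC_TRY_NUM being 2; an OOM on the retry is reported as an error. The standalone sketch below restates that pattern with stand-in types; run_with_oom_retry and is_oom_error are illustrative names, not functions from the patch.

// Hedged, self-contained sketch of the OOM-retry loop the patch inlines at
// each aclopCompileAndExecute / aclopExecuteV2 call site.
#include <functional>
#include <stdexcept>

using aclError = int;  // stand-in for the real ACL status type
constexpr aclError ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED = 245002;
constexpr int NPU_MAX_OP_EXEC_TRY_NUM = 2;  // value added to NpuUtils.h

// Simplified stand-in for NpuUtils::IsOomError: on the first OOM, release
// the caching allocator's cached blocks and ask the caller to retry; an OOM
// on the retry is fatal.
bool is_oom_error(aclError ret, int index,
                  const std::function<void()>& free_cached) {
  if (ret != ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) {
    return false;
  }
  if (index == 1) {
    free_cached();  // patch calls NPUCachingAllocator::FreeDeviceCachedMemory
    return true;
  }
  throw std::runtime_error("NPU out of memory");
}

// Shape of the loop added around each execute call.
aclError run_with_oom_retry(const std::function<aclError()>& exec,
                            const std::function<void()>& free_cached) {
  aclError ret;
  int index = 0;
  do {
    ret = exec();  // aclopCompileAndExecute / aclopExecuteV2 in the patch
    ++index;
  } while (is_oom_error(ret, index, free_cached) &&
           (index < NPU_MAX_OP_EXEC_TRY_NUM));
  return ret;
}

With NPU_MAX_OP_EXEC_TRY_NUM set to 2, the only recovery attempted is that single cache release; if the allocation still fails on the retry, the failure is surfaced as a real out-of-memory condition.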
- -import torch -import numpy as np -import sys -from common_utils import TestCase, run_tests -from common_device_type import dtypes, instantiate_device_type_tests -from util_test import create_common_tensor - -class TestHistc(TestCase): - def generate_single_data(self, min_d, max_d, shape, dtype): - input1 = np.random.uniform(min_d, max_d, shape).astype(dtype) - npu_input = torch.from_numpy(input1) - - return npu_input - - def cpu_op_exec(self, input1, bins=100, min=0, max=0): - output = torch.histc(input1, bins=bins, min=min, max=max) - output = output.numpy() - return output - - def npu_op_exec(self, input1, bins=100, min=0, max=0): - input1 = input1.to("npu") - output = torch.histc(input1, bins=bins, min=min, max=max) - output = output.to("cpu") - output = output.numpy() - return output - - def test_histc_int32_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_int32_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.int32) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - cpu_output = cpu_output.astype(np.int32) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), 
np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float32_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float32) - cpu_output = self.cpu_op_exec(npu_input1, bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_1(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_2(self, device): - npu_input1 = self.generate_single_data(0, 100, (20, 30, 2), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_3(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=50, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_4(self, device): - npu_input1 = self.generate_single_data(0, 100, (10000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=5000, min=0, max=100) - npu_output = self.npu_op_exec(npu_input1, bins=5000, min=0, max=100) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - def test_histc_float16_5(self, device): - npu_input1 = self.generate_single_data(0, 100, (1000,), np.float16) - cpu_output = self.cpu_op_exec(npu_input1.to(torch.float), bins=50) - npu_output = self.npu_op_exec(npu_input1, bins=50) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) - - -instantiate_device_type_tests(TestHistc, globals(), except_for='cpu') -if __name__ == '__main__': - torch.npu.set_device("npu:1") - run_tests() diff --git a/test/test_npu/test_mode.py b/test/test_npu/test_mode.py deleted file mode 100644 index be599e22abe599d2ab1a9338cfb7f8cebe0b81ea..0000000000000000000000000000000000000000 --- a/test/test_npu/test_mode.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2020, Huawei Technologies.All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import numpy as np -import torch -import torch.nn as nn -from common_device_type import dtypes, instantiate_device_type_tests -from common_utils import TestCase, run_tests -from util_test import create_common_tensor - -#pylint: disable=import-error -#pylint: disable=too-many-lines -#pylint: disable=too-many-arguments -#pylint: disable=unused-variable -#pylint: disable=unused-argument - -class TestMode(TestCase): - - def generate_data_1(self, dtype): - input = np.array([[10, 11, 12, 11, 10, 10, 10, 11], - [11, 11, 11, 10, 11, 10, 10, 11], - [12, 10, 10, 12, 10, 11, 10, 13], - [12, 10, 11, 12, 11, 11, 10, 13], - [14, 11, 11, 12, 10, 11, 10, 13]]).astype(np.float32) - - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - def generate_data_2(self, dtype): - input = np.array([[10, 11, 12, 11, 10], - [11, 10, 11, 10, 11], - [12, 10, 10, 12, 10], - [12, 10, 11, 13, 11], - [14, 11, 11, 12, 10]]).astype(np.float32) - - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - - def generate_data_3(self, dtype): - input = np.zeros((36,25)).astype(np.float32) - input[:,2]=1 - input[:,12]=2 - # modify from numpy.ndarray to torch.tensor - npu_input = torch.from_numpy(input) - return npu_input - - def generate_data_4(self, dtype): - input = np.zeros((12,12)).astype(np.float32) - # modify from numpy.ndarray to torch.tensor - input[:,2]=1 - input[:,10]=2 - npu_input = torch.from_numpy(input) - return npu_input - - def cpu_op_exec_0(self, input): - output1, output2 = torch.mode(input, 0, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def npu_op_exec_0(self, input): - input = input.to("npu") - output1, output2 = torch.mode(input, 0, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def cpu_op_exec_1(self, input): - output1, output2 = torch.mode(input, 1, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def npu_op_exec_1(self, input): - input = input.to("npu") - output1, output2 = torch.mode(input, 1, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _cpu_op_exec_0(self, input): - output1, output2 = torch._mode(input, 0, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _npu_op_exec_0(self, input): - input = input.to("npu") - output1, output2 = torch._mode(input, 0, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _cpu_op_exec_1(self, input): - output1, output2 = torch._mode(input, 1, keepdim=False) - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def _npu_op_exec_1(self, input): - input = input.to("npu") - output1, output2 = torch._mode(input, 1, keepdim=False) - output1 = output1.to("cpu") - output2 = output2.to("cpu") - output1 = output1.numpy() - output2 = output2.numpy() - - return output1, output2 - - def test_add_float32_0(self, device): - npu_input = self.generate_data_1(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, 
npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_1(self, device): - npu_input = self.generate_data_2(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_2(self, device): - npu_input = self.generate_data_3(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_3(self, device): - npu_input = self.generate_data_4(np.float32) - cpu_output1, cpu_output2 = self.cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self.npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_4(self, device): - npu_input = self.generate_data_2(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_5(self, device): - npu_input = self.generate_data_1(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_6(self, device): - npu_input = self.generate_data_3(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_0(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_0(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - - def test_add_float32_7(self, device): - npu_input = self.generate_data_4(np.float32) - cpu_output1, cpu_output2 = self._cpu_op_exec_1(npu_input) - npu_output1, npu_output2 = self._npu_op_exec_1(npu_input) - self.assertRtolEqual(cpu_output1, npu_output1) - self.assertRtolEqual(cpu_output2, npu_output2) - -instantiate_device_type_tests(TestMode, globals(), except_for="cpu") -if __name__ == "__main__": - torch.npu.set_device("npu:2") - run_tests() \ No newline at end of file diff --git a/test/test_npu/test_adaptive_avg_pool1d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py similarity index 88% rename from test/test_npu/test_adaptive_avg_pool1d.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py index 44f92176967bb2cf65207fe21085a76d7b12b592..662cae2af3231941d335f5aa27f24af512b06a66 100644 --- a/test/test_npu/test_adaptive_avg_pool1d.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py @@ -33,21 +33,21 @@ class TestAdaptiveAvgPool1d(TestCase): def test_AdaptiveAvgPool1d_shape_format_fp16(self, device): shape_format = [ [np.float16, 0, (64, 10, 16)], - [np.float16, 1, (256, 2048, 8)], + [np.float16, -1, (256, 2048, 8)], [np.float16, 3, (32, 16, 16)] ] - output_list = [(4), (3), (1)] + output_list = [(4), (3)] for item in shape_format: cpu_input, npu_input = create_common_tensor(item, 1, 10) for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) def 
test_AdaptiveAvgPool1d_shape_format_fp32(self, device): shape_format = [ [np.float32, 0, (64, 10, 16)], - [np.float32, 1, (256, 2048, 8)], + [np.float32, -1, (256, 2048, 8)], [np.float32, 3, (32, 16, 16)] ] output_list = [(4), (3), (1)] @@ -56,11 +56,8 @@ class TestAdaptiveAvgPool1d(TestCase): for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + self.assertRtolEqual(cpu_output, npu_output, 0.001) instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:6") run_tests() - - diff --git a/test/test_npu/test_adaptive_avg_pool3d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py similarity index 85% rename from test/test_npu/test_adaptive_avg_pool3d.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py index 937e3eac27ed984374ee53543791c616d3a86564..859cccf3cbb1470177b87472a518fe4d0c06f870 100644 --- a/test/test_npu/test_adaptive_avg_pool3d.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py @@ -32,10 +32,10 @@ class TestAdaptiveAvgPool3d(TestCase): def test_AdaptiveAvgPool3d_shape_format_fp16(self, device): shape_format = [ - [np.float16, 0, (64, 10, 16, 32)], - [np.float16, 0, (4, 16, 8, 4, 2)], - [np.float16, 29, (2, 16, 4, 32)], - [np.float16, 29, (4, 16, 8, 4, 16)] + [np.float16, -1, (64, 10, 16, 32)], + [np.float16, -1, (4, 16, 8, 4, 2)], + [np.float16, -1, (2, 16, 4, 32)], + [np.float16, -1, (4, 16, 8, 4, 16)] ] # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)] output_list = [(1, 1, 1)] @@ -50,10 +50,10 @@ class TestAdaptiveAvgPool3d(TestCase): def test_AdaptiveAvgPool3d_shape_format_fp32(self, device): shape_format = [ - [np.float32, 0, (64, 10, 16, 32)], - [np.float32, 0, (4, 2, 2, 4, 316)], - [np.float32, 29, (2, 16, 4, 32)], - [np.float32, 29, (4, 16, 8, 4, 16)] + [np.float32, -1, (64, 10, 16, 32)], + [np.float32, -1, (4, 2, 2, 4, 316)], + [np.float32, -1, (2, 16, 4, 32)], + [np.float32, -1, (4, 16, 8, 4, 16)] ] # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)] output_list = [(1, 1, 1)] @@ -67,5 +67,3 @@ class TestAdaptiveAvgPool3d(TestCase): instantiate_device_type_tests(TestAdaptiveAvgPool3d, globals(), except_for="cpu") if __name__ == "__main__": run_tests() - - diff --git a/test/test_npu/test_adaptive_avg_pool3d_backward.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py similarity index 98% rename from test/test_npu/test_adaptive_avg_pool3d_backward.py rename to test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py index 2948fa9549e0788a29530198e0e945079d91f51d..c3dc9a48430dbc337faa1ac4895b7563883584e2 100644 --- a/test/test_npu/test_adaptive_avg_pool3d_backward.py +++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py @@ -42,7 +42,7 @@ class TestAdaptiveAvgPool3dBackward(TestCase): def test_adaptiveAvgPool3d_backward(self, device): dtype_list = [np.float16, np.float32] - format_list = [0, 29] + format_list = [-1] shape_list = [ [2, 3, 7, 7], [1, 2, 3, 6, 6], diff --git a/test/test_npu/test_adaptive_max_pool2d.py b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py similarity index 91% rename from test/test_npu/test_adaptive_max_pool2d.py rename to test/test_npu/test_network_ops/test_adaptive_max_pool2d.py index b807a569a7987408728258d3912dc96d478a896a..877f50c11c26fb787a491cae9fcfc7b2957db0a9 100644 --- 
a/test/test_npu/test_adaptive_max_pool2d.py +++ b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py @@ -32,9 +32,9 @@ class TestAdaptiveMaxPool2d(TestCase): return output.cpu().numpy() def test_adaptiveMaxPool2d_shape_format_fp32_6(self, device): - format_list = [0, 3] - shape_list = [(1, 5, 9, 9), - (1, 8, 9)] + format_list = [-1] + # (1, 8, 9) IndexError + shape_list = [(1, 5, 9, 9)] shape_format = [ [np.float32, i, j] for i in format_list for j in shape_list ] @@ -44,7 +44,8 @@ class TestAdaptiveMaxPool2d(TestCase): for output_size in output_list: cpu_output = self.cpu_op_exec(cpu_input, output_size) npu_output = self.npu_op_exec(npu_input, output_size) - self.assertRtolEqual(cpu_output, npu_output) + + self.assertRtolEqual(cpu_output, npu_output, 0.0004) instantiate_device_type_tests(TestAdaptiveMaxPool2d, globals(), except_for="cpu") diff --git a/test/test_npu/test_quantized_max_pool2d.py b/test/test_npu/test_network_ops/test_avg_pool2d.py similarity index 32% rename from test/test_npu/test_quantized_max_pool2d.py rename to test/test_npu/test_network_ops/test_avg_pool2d.py index 4f4be2ee747ec92abc063b6475118a0e205a1175..6042069f8de3840e11a894f355da879a2319098a 100644 --- a/test/test_npu/test_quantized_max_pool2d.py +++ b/test/test_npu/test_network_ops/test_avg_pool2d.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import torch import torch.nn as nn import numpy as np @@ -20,47 +19,45 @@ from common_device_type import dtypes, instantiate_device_type_tests from util_test import create_common_tensor -class TestQuantizedMaxPool2d(TestCase): - def cpu_op_exec(self, input, kernel_size, stride, padding, dilation, ceil_mode): - output = nn.quantized.functional.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode) - return output.numpy() +class TestAvgPool2d(TestCase): + def cpu_op_exec(self, input, ceil_mode): + m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode) + output = m(input) + output = output.detach().numpy() + return output - def npu_op_exec(self, input, ksize, stride, padding, dilation, ceil_mode): - output = nn.quantized.functional.max_pool2d(input, ksize, stride, padding, dilation, ceil_mode) - return output.cpu().numpy() + def npu_op_exec(self, input, ceil_mode): + m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode).npu() + output = m(input) + output = output.to("cpu") + output = output.detach().numpy() + return output - def test_quantized_max_pool2d_shape_format_fp16(self, device): - format_list = [0] - shape_list = [(32, 16, 16, 16), - (16, 1024, 256, 20), - (1024, 464, 11, 9), - (1, 2048, 15, 15)] - ksize_list = [(2, 2), (3, 3)] - stride_list = [(1, 1), (2, 2)] - padding_list = [(0, 0), (1, 1)] - dilation_list = [1] - ceil_mode_list = [False, True] + def test_avg_pool2d_backward_shape_format_fp16(self, device): shape_format = [ - [np.float16, i, j, k, m, n, o, p] for i in format_list for j in shape_list for k in ksize_list for m in stride_list for n in padding_list for o in dilation_list for p in ceil_mode_list + [[np.float16, 0, (1, 3, 147, 147)], True], + [[np.float16, 0, (1, 3, 147, 147)], True] ] - # TODO(Ascend): tbe operator has problem in precision and (x, 1) case and so on. 
+ for item in shape_format: - cpu_input, npu_input = create_common_tensor(item, 0, 100) - #npu_input = cpu_input + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) cpu_input = cpu_input.to(torch.float32) - qu = torch.nn.quantized.Quantize(1.0, 50, torch.qint8) - cpu_input = qu(cpu_input) - npu_input = qu(npu_input) - #npu_input.to("npu") - cpu_output = self.cpu_op_exec(cpu_input, item[3], item[4], item[5], item[6], ceil_mode=item[7]) - print(item) - print(cpu_output.shape) - npu_output = self.npu_op_exec(npu_input, item[3], item[4], item[5], item[6], ceil_mode=item[7]) - cpu_output = cpu_output.astype(np.float16) - self.assertRtolEqual(cpu_output, npu_output) + cpu_output = self.cpu_op_exec(cpu_input.float(), item[1]).astype(np.float16) + npu_output = self.npu_op_exec(npu_input, item[1]) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.002) + def test_avg_pool2d_backward_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, 0, (1, 3, 147, 147)], True], + [[np.float32, 0, (1, 3, 147, 147)], True] + ] + + for item in shape_format: + cpu_input, npu_input = create_common_tensor(item[0], 0, 100) + cpu_output = self.cpu_op_exec(cpu_input, item[1]) + npu_output = self.npu_op_exec(npu_input, item[1]) + self.assertRtolEqual(cpu_output, npu_output, 0.0009) -instantiate_device_type_tests(TestQuantizedMaxPool2d, globals(), except_for="cpu") +instantiate_device_type_tests(TestAvgPool2d, globals(), except_for="cpu") if __name__ == "__main__": - torch.npu.set_device("npu:1") run_tests() diff --git a/test/test_npu/test_cummin.py b/test/test_npu/test_network_ops/test_cummin.py similarity index 99% rename from test/test_npu/test_cummin.py rename to test/test_npu/test_network_ops/test_cummin.py index 52938df20aa7cc3b3a225b7b5b41e603a089a811..7118dd940b84fb6a23cb06a7553aa457c3a95a74 100644 --- a/test/test_npu/test_cummin.py +++ b/test/test_npu/test_network_ops/test_cummin.py @@ -51,7 +51,7 @@ class TestCummin(TestCase): def npu_op_exec_out(self, input_x, dim, output_value, output_argmin): input_x = input_x.to("npu") output_value = output_value.to("npu") - output_argmin = output_argmin.to("npu") + output_argmin = output_argmin.to("npu").to(torch.long) torch.cummin(input_x, dim, out=(output_value, output_argmin)) output_value = output_value.to("cpu") output_value = output_value.numpy() diff --git a/test/test_npu/test_network_ops/test_gt.py b/test/test_npu/test_network_ops/test_gt.py index 1f55b3581022898e760e47024fac1daf41fcaecf..d3ec28991001811d22a6eda7da3cb86b7ee4aa02 100644 --- a/test/test_npu/test_network_ops/test_gt.py +++ b/test/test_npu/test_network_ops/test_gt.py @@ -142,6 +142,24 @@ class TestGt(TestCase): self.assertRtolEqual(cpu_output_out, npu_output_out) + def test_gt_bool(self, device): + format_list = [0] + shape_list = [(5, 3), (2, 3, 4)] + scalar_list = [True, False] + shape_format = [ + [[np.int32, i, j], k] for i in format_list for j in shape_list + for k in scalar_list + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100) + cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100) + cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1]) + npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1]) + cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50) + npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50) + self.assertEqual(cpu_output1, npu_output1) + self.assertEqual(cpu_output2, npu_output2) + def test_gt_tensor_out(self, device): shape_format = [ [[np.float16, 0, [128, 
116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]], diff --git a/test/test_npu/test_network_ops/test_index_copy.py b/test/test_npu/test_network_ops/test_index_copy.py new file mode 100644 index 0000000000000000000000000000000000000000..9b29510402dee189f77f237b9965dc546a24f791 --- /dev/null +++ b/test/test_npu/test_network_ops/test_index_copy.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from common_utils import TestCase, run_tests +from common_device_type import instantiate_device_type_tests + +class TestIndexCopy(TestCase): + def op_exec(self, npuflag, input1, dim, indices, updates): + output = torch.index_copy(input1, dim, indices, updates) + if npuflag: + output = output.to("cpu") + output = output.numpy() + return output + + def op_inp_exec(self, npuflag, input1, dim, indices, updates): + input1.index_copy_(dim, indices, updates) + if npuflag: + input1 = input1.to("cpu") + output = input1.numpy() + return output + + def op_inp_exec_(self, npuflag, input1, dim, indices, updates): + input1 = torch._index_copy_(input1, dim, indices, updates) + if npuflag: + input1 = input1.to("cpu") + output = input1.numpy() + return output + + def case_exec(self, input1, dim, indices, updates): + npu_input = input1.npu() + npu_indices = indices.npu() + npu_updates = updates.npu() + cpu_output = self.op_exec(0, input1, dim, indices, updates) + npu_output = self.op_exec(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + cpu_output = self.op_inp_exec(0, input1, dim, indices, updates) + npu_output = self.op_inp_exec(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + cpu_output = self.op_inp_exec_(0, input1, dim, indices, updates) + npu_output = self.op_inp_exec_(1, npu_input, dim, npu_indices, npu_updates) + self.assertEqual(cpu_output, npu_output) + + def test_index_copy_dim0_0(self, device): + a = torch.ones(5, dtype = torch.float32) + indices = torch.LongTensor([3, 2, 1, 0]) + updates = torch.tensor([1, 2, 3, 4], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim0_1(self, device): + a = torch.ones(5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1, 2]) + updates = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim0_2(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]], dtype = torch.float32) + self.case_exec(a, 0, indices, updates) + + def test_index_copy_dim1_0(self, device): + a = torch.ones(5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1]) + updates = torch.tensor([[1, 2], [5, 6], [8, 9], [3, 4], [0, 1]], dtype = torch.float32) + self.case_exec(a, 1, indices, updates) + + def test_index_copy_dim1_1(self, device): + a = torch.ones(2, 5, 3, dtype = 
torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype = torch.float32) + self.case_exec(a, 1, indices, updates) + + def test_index_copy_dim2_0(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0]) + updates = torch.tensor([[[1], [2], [3], [4], [5]], + [[6], [7], [8], [9], [0]]], dtype = torch.float32) + self.case_exec(a, 2, indices, updates) + + def test_index_copy_dim2_1(self, device): + a = torch.ones(2, 5, 3, dtype = torch.float32) + indices = torch.LongTensor([0, 1]) + updates = torch.tensor([[[3, 2], [1, 2], [1, 3], [1, 4], [1, 5]], + [[1, 6], [1, 7], [1, 8], [1, 9], [1, 0]]], dtype = torch.float32) + self.case_exec(a, 2, indices, updates) + +instantiate_device_type_tests(TestIndexCopy, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() diff --git a/test/test_npu/test_lerp.py b/test/test_npu/test_network_ops/test_lerp.py similarity index 97% rename from test/test_npu/test_lerp.py rename to test/test_npu/test_network_ops/test_lerp.py index fc577185b0493d0972db304db4d22f6007c9de42..eeb0b7250506026e724a90a5578ecc805cab80b1 100644 --- a/test/test_npu/test_lerp.py +++ b/test/test_npu/test_network_ops/test_lerp.py @@ -167,7 +167,7 @@ class TestLerp(TestCase): return output shape_format = [ - [[np.float16, -1, (100, 4, 5, 5)], 1,2], + [[np.float16, -1, (100, 4, 5, 5)], 1.2], [[np.float16, -1, (100, 5, 5, 4)], 1.2], ] @@ -178,7 +178,7 @@ class TestLerp(TestCase): npu_input3 = item[1] cpu_output = cpu_op_scalar_exec_fp16(cpu_input1, cpu_input2, cpu_input3) npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3) - self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.02) def test_lerp_scalar_out_common_shape_format(self, device): @@ -218,7 +218,7 @@ class TestLerp(TestCase): npu_input3 = item[1] cpu_output = cpu_op_scalar_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3) npu_output = self.npu_op_scalar_out_exec(npu_input1, npu_input2, npu_input3) - self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003) + self.assertRtolEqual(cpu_output, npu_output, prec16=0.02) instantiate_device_type_tests(TestLerp, globals(), except_for='cpu') if __name__ == '__main__': diff --git a/test/test_npu/test_network_ops/test_lstm.py b/test/test_npu/test_network_ops/test_lstm.py index ae03728ae62fabe9f71cb49307092bc6c73b789d..bf65ee9415b7584a518e3b35af5df8d39ed13e62 100644 --- a/test/test_npu/test_network_ops/test_lstm.py +++ b/test/test_npu/test_network_ops/test_lstm.py @@ -56,9 +56,12 @@ class TestLstm(TestCase): npu_input1 = torch.from_numpy(input1.astype(item[0][0])).npu() npu_output_y, (npu_output_h, npu_output_c) = npu_lstm(npu_input1) - self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-1) - self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-1) - self.assertRtolEqual(cpu_output_c.detach().numpy(), npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-1) + self.assertRtolEqual(cpu_output_y.detach().numpy(), + npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_h.detach().numpy(), + npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_c.detach().numpy(), + npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-3) def test_lstm_double_layer(self, device): # 
shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size, is_training] @@ -112,8 +115,52 @@ class TestLstm(TestCase): self.assertRtolEqual(hn.detach().cpu().numpy(), hnf.cpu().detach().numpy()) self.assertRtolEqual(cn.detach().cpu().numpy(), cnf.cpu().detach().numpy()) + def test_lstm_bidirection(self, device): + # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size, is_training] + shape_format = [ + [[np.float16, (16, 32, 64)], [np.float16, (1, 32, 32)], 64, 32, True], + [[np.float16, (5, 32, 64)], [np.float16, (1, 32, 32)], 64, 32, False], + [[np.float32, (5, 32, 64)], [np.float16, (1, 32, 64)],64, 64, True], + [[np.float32, (5, 32, 64)], [np.float16, (1, 32, 64)], 64, 64, False], + [[np.float32, (26, 2560, 512)], [np.float16, (1, 2560, 256)], 512, 256, False], + [[np.float32, (10, 33, 128)], [np.float32, (1, 33, 64)], 128, 64, False], + ] + + for item in shape_format: + cpu_lstm = torch.nn.LSTM(input_size=item[2], hidden_size=item[3], + num_layers=1, bidirectional=True, bias=False) + cpu_lstm.training = item[4] + npu_lstm = copy.deepcopy(cpu_lstm).npu() + + cut_value = item[3] + iw = cpu_lstm.weight_ih_l0.split(cut_value) + hw = cpu_lstm.weight_hh_l0.split(cut_value) + iwr = cpu_lstm.weight_ih_l0_reverse.split(cut_value) + hwr = cpu_lstm.weight_hh_l0_reverse.split(cut_value) + iwt = torch.cat([iw[0], iw[2], iw[1], iw[3]], 0) + hwt = torch.cat([hw[0], hw[2], hw[1], hw[3]], 0) + iwrt = torch.cat([iwr[0], iwr[2], iwr[1], iwr[3]], 0) + hwrt = torch.cat([hwr[0], hwr[2], hwr[1], hwr[3]], 0) + cpu_lstm.weight_ih_l0.data = iwt + cpu_lstm.weight_hh_l0.data = hwt + cpu_lstm.weight_ih_l0_reverse.data = iwrt + cpu_lstm.weight_hh_l0_reverse.data = hwrt + + input1 = np.random.uniform(0, 1, item[0][1]).astype(np.float32) + + cpu_input1 = torch.from_numpy(input1) + cpu_output_y, (cpu_output_h, cpu_output_c) = cpu_lstm(cpu_input1) + + npu_input1 = torch.from_numpy(input1.astype(item[0][0])).npu() + npu_output_y, (npu_output_h, npu_output_c) = npu_lstm(npu_input1) + + self.assertRtolEqual(cpu_output_y.detach().numpy(), + npu_output_y.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_h.detach().numpy(), + npu_output_h.cpu().to(torch.float).detach().numpy(), prec=1.e-3) + self.assertRtolEqual(cpu_output_c.detach().numpy(), + npu_output_c.cpu().to(torch.float).detach().numpy(), prec=1.e-3) - #如下 测试接口 lstm.data def test_lstm_sequence(self, device): max_len = 6 embedding_size = 2 @@ -178,8 +225,141 @@ class TestLstm(TestCase): pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) self.assertRtolEqual(pade_outputs.detach().numpy(), - pade_outputs_npu.cpu().to(torch.float).detach().numpy(), prec=1.e-1) + pade_outputs_npu.cpu().to(torch.float).detach().numpy(), prec=1.e-4) + + def test_lstm_sequence_bidirection(self, device): + max_len = 6 + embedding_size = 2 + hidden_size = 16 + vocab_size = 20 + input_seq = [[3, 5, 12, 7, 2, ], [4, 11, 14, ], [18, 7, 3, 8, 5, 4]] + lengths = [5, 3, 6] + + # embedding + embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=0) + + rnn = torch.nn.LSTM(embedding_size, hidden_size, num_layers=1, bidirectional=True, bias=False) + rnn_npu = copy.deepcopy(rnn).npu() + + iw = rnn.weight_ih_l0.split(hidden_size) + hw = rnn.weight_hh_l0.split(hidden_size) + iwr = rnn.weight_ih_l0_reverse.split(hidden_size) + hwr = rnn.weight_hh_l0_reverse.split(hidden_size) + iwt = torch.cat([iw[0], iw[2], iw[1], iw[3]], 0) + hwt = 
torch.cat([hw[0], hw[2], hw[1], hw[3]], 0) + iwrt = torch.cat([iwr[0], iwr[2], iwr[1], iwr[3]], 0) + hwrt = torch.cat([hwr[0], hwr[2], hwr[1], hwr[3]], 0) + rnn.weight_ih_l0.data = iwt + rnn.weight_hh_l0.data = hwt + rnn.weight_ih_l0_reverse.data = iwrt + rnn.weight_hh_l0_reverse.data = hwrt + + #Sorting from Large to Small + input_seq = sorted(input_seq, key = lambda tp: len(tp), reverse=True) + lengths = sorted(lengths, key = lambda tp: tp, reverse=True) + ''' + outputs: + input_seq: [[18, 7, 3, 8, 5, 4], [3, 5, 12, 7, 2], [4, 11, 14]] + lengths : [6, 5, 3] + ''' + + #The padding subscript is 0 + pad_token = 0 + def pad_seq(seq, seq_len, max_length): + seq += [pad_token for _ in range(max_length - seq_len)] + return seq + + #Data after padding + pad_seqs = [] + for i,j in zip(input_seq, lengths): + pad_seqs.append(pad_seq(i, j, max_len)) + + lengths = [6,5,3] + pad_seqs = torch.tensor(pad_seqs) + embeded = embedding(pad_seqs) + embeded = embeded.reshape(6,3,2) + + #cacl cpu + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded, lengths, batch_first=False) + pade_outputs, (hn, cn) = rnn(pack) + pade_outputs, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs, batch_first=False) + + #cacl npu + embeded_npu = embeded.npu() + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded_npu, lengths, batch_first=False) + pade_outputs_npu, (hn_n, cn_n) = rnn_npu(pack) + pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) + + self.assertRtolEqual(pade_outputs.detach().numpy(), + pade_outputs_npu.cpu().detach().numpy(), prec=1.e-4) + + def test_lstm_sequence_double_layer(self, device): + max_len = 6 + embedding_size = 2 + hidden_size = 16 + vocab_size = 20 + input_seq = [[3, 5, 12, 7, 2, ], [4, 11, 14, ], [18, 7, 3, 8, 5, 4]] + lengths = [5, 3, 6] + + # embedding + embedding = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=0) + + rnn = torch.nn.LSTM(embedding_size, hidden_size, num_layers=2, bidirectional=False, bias=False) + rnn_npu = copy.deepcopy(rnn).npu() + + iw0 = rnn.weight_ih_l0.split(hidden_size) + hw0 = rnn.weight_hh_l0.split(hidden_size) + iw1 = rnn.weight_ih_l1.split(hidden_size) + hw1 = rnn.weight_hh_l1.split(hidden_size) + iwt0 = torch.cat([iw0[0], iw0[2], iw0[1], iw0[3]], 0) + hwt0 = torch.cat([hw0[0], hw0[2], hw0[1], hw0[3]], 0) + iwt1 = torch.cat([iw1[0], iw1[2], iw1[1], iw1[3]], 0) + hwt1 = torch.cat([hw1[0], hw1[2], hw1[1], hw1[3]], 0) + + rnn.weight_ih_l0.data = iwt0 + rnn.weight_hh_l0.data = hwt0 + rnn.weight_ih_l1.data = iwt1 + rnn.weight_hh_l1.data = hwt1 + + #Sorting from Large to Small + input_seq = sorted(input_seq, key = lambda tp: len(tp), reverse=True) + lengths = sorted(lengths, key = lambda tp: tp, reverse=True) + ''' + outputs: + input_seq: [[18, 7, 3, 8, 5, 4], [3, 5, 12, 7, 2], [4, 11, 14]] + lengths : [6, 5, 3] + ''' + + #The padding subscript is 0 + pad_token = 0 + def pad_seq(seq, seq_len, max_length): + seq += [pad_token for _ in range(max_length - seq_len)] + return seq + + #Data after padding + pad_seqs = [] + for i,j in zip(input_seq, lengths): + pad_seqs.append(pad_seq(i, j, max_len)) + + lengths = [6,5,3] + pad_seqs = torch.tensor(pad_seqs) + embeded = embedding(pad_seqs) + embeded = embeded.reshape(6,3,2) + + #cacl cpu + pack = torch.nn.utils.rnn.pack_padded_sequence(embeded, lengths, batch_first=False) + pade_outputs, (hn, cn) = rnn(pack) + pade_outputs, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs, batch_first=False) + #cacl npu + embeded_npu = embeded.npu() + pack 
= torch.nn.utils.rnn.pack_padded_sequence(embeded_npu, lengths, batch_first=False) + pade_outputs_npu, (hn_n, cn_n) = rnn_npu(pack) + pade_outputs_npu, others = torch.nn.utils.rnn.pad_packed_sequence(pade_outputs_npu, batch_first=False) + + self.assertRtolEqual(pade_outputs.detach().numpy(), + pade_outputs_npu.cpu().detach().numpy(), prec=1.e-4) + instantiate_device_type_tests(TestLstm, globals(), except_for='cpu') if __name__ == "__main__": run_tests() diff --git a/test/test_npu/test_network_ops/test_mishbackward.py b/test/test_npu/test_network_ops/test_mish_backward.py similarity index 71% rename from test/test_npu/test_network_ops/test_mishbackward.py rename to test/test_npu/test_network_ops/test_mish_backward.py index 231188abb355c6d809c5bec7c3ed0f486b203f76..1240cb55b3ac30f375b59ba3cca882b1a2a0fd6d 100644 --- a/test/test_npu/test_network_ops/test_mishbackward.py +++ b/test/test_npu/test_network_ops/test_mish_backward.py @@ -31,14 +31,24 @@ class TestMishBackward(TestCase): output_grad = output_grad.detach().numpy() output = output.cpu().detach().numpy() return output_grad, output + + def cpu_op_exec(self, input1): + input1.requires_grad = True + output = input1 * (torch.tanh(F.softplus(input1))) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.to("cpu") + output_grad = output_grad.detach().numpy() + output = output.detach().numpy() + return output_grad, output def test_mish_fp32(self, device): npu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]).npu() - ep_output_grad = torch.tensor([1.0490363, 1.0693179, 1.021107, 1.0044329, 1.0008003, 1.0001341, 1.0000216, 1.0000033, 1.0000005, 1.0000001]) - ep_npu_output = torch.tensor([0.8652344, 1.9439697, 2.9865417, 3.9974136, 4.999552, 5.9999266, 6.9999886, 7.999998, 8.999999, 10.]) + cpu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]) output_grad, npu_output = self.npu_op_exec(npu_input) - self.assertRtolEqual(ep_output_grad.numpy(), output_grad) - self.assertRtolEqual(ep_npu_output.numpy(), npu_output) + ep_output_grad, ep_npu_output = self.cpu_op_exec(cpu_input) + self.assertRtolEqual(ep_output_grad, output_grad) + self.assertRtolEqual(ep_npu_output, npu_output) instantiate_device_type_tests(TestMishBackward, globals(), except_for='cpu') if __name__ == "__main__": diff --git a/test/test_npu/test_network_ops/test_nms_with_mask.py b/test/test_npu/test_network_ops/test_nms_with_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..39ee878cd77f69b10b2c882544839ecc3a4ef533 --- /dev/null +++ b/test/test_npu/test_network_ops/test_nms_with_mask.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import numpy as np +import sys +import copy +import torch.nn as nn +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + + +class TestNmsWithMask(TestCase): + def npu_op_exec(self, input1, iou_threshold): + npu_output1, npu_output2, npu_output3, = torch.npu_nms_with_mask(input1, iou_threshold) + npu_output1 = npu_output1.to("cpu") + npu_output2 = npu_output2.to("cpu") + npu_output3 = npu_output3.to("cpu") + + return npu_output1, npu_output2, npu_output3 + + def test_nms_with_mask_float32(self, device): + input1 = torch.tensor([[0.0, 1.0, 2.0, 3.0, 0.6], [6.0, 7.0, 8.0, 9.0, 0.4]]).npu() + iou_threshold = 0.5 + + eq_output1 = torch.tensor([[0.0000, 1.0000, 2.0000, 3.0000, 0.6001], + [6.0000, 7.0000, 8.0000, 9.0000, 0.3999]]) + eq_output2 = torch.tensor([0, 1], dtype=torch.int32) + eq_output3 = torch.tensor([1, 1], dtype=torch.uint8) + + npu_output1, npu_output2, npu_output3 = self.npu_op_exec(input1, iou_threshold) + + self.assertRtolEqual(eq_output1, npu_output1) + self.assertRtolEqual(eq_output2, npu_output2) + self.assertRtolEqual(eq_output3, npu_output3) + + +instantiate_device_type_tests(TestNmsWithMask, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/test/test_npu/test_network_ops/test_npu_linear.py b/test/test_npu/test_network_ops/test_npu_linear.py new file mode 100644 index 0000000000000000000000000000000000000000..ea9e7c2e2f507d69f4bcf3446babe2c4141cf6c0 --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_linear.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuLinear(TestCase): + def cpu_op_exec(self, x, weight, bias): + output = torch.nn.functional.linear(x, weight, bias) + output = output.numpy() + return output + + def npu_op_exec(self, x, weight, bias): + output = torch.npu_linear(x, weight, bias) + output = output.cpu().numpy() + return output + + def test_npu_linear_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]], + [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + npu_output = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output, 0.0002) + + def test_npu_linear_shape_format_fp16(self, device): + shape_format = [ + [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]], + [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16) + npu_output = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() + diff --git a/test/test_npu/test_network_ops/test_npu_linear_backward.py b/test/test_npu/test_network_ops/test_npu_linear_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..66f8a47f4143ac56fa0afe457ecbe0f9ebdc9268 --- /dev/null +++ b/test/test_npu/test_network_ops/test_npu_linear_backward.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020, Huawei Technologies.All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +import numpy as np +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestNpuLinearBackward(TestCase): + def cpu_op_exec(self, x, weight, bias): + x.requires_grad = True + weight.requires_grad = True + bias.requires_grad = True + output = torch.nn.functional.linear(x, weight, bias) + loss = output.sum() + loss.backward() + return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() + + def npu_op_exec(self, x, weight, bias): + x.requires_grad = True + weight.requires_grad = True + bias.requires_grad = True + output = torch.npu_linear(x, weight, bias) + loss = output.sum() + loss.backward() + return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy() + + def test_npu_linear_backward_shape_format_fp32(self, device): + shape_format = [ + [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]], + [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b) + npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output, npu_output, 0.0002) + self.assertRtolEqual(cpu_x_grad, npu_x_grad) + self.assertRtolEqual(cpu_w_grad, npu_w_grad) + self.assertRtolEqual(cpu_b_grad, npu_b_grad) + + def test_npu_linear_shape_format_fp16(self, device): + shape_format = [ + [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]], + [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]], + ] + + for item in shape_format: + cpu_x, npu_x = create_common_tensor(item[0], -2, 2) + cpu_w, npu_w = create_common_tensor(item[1], -2, 2) + cpu_b, npu_b = create_common_tensor(item[2], -2, 2) + cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec( + cpu_x.float(), cpu_w.float(), cpu_b.float()) + npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b) + self.assertRtolEqual(cpu_output.astype(np.float16), npu_output) + self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad) + self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad) + self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad) + +instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu") +if __name__ == "__main__": + run_tests() + diff --git a/test/test_npu/test_network_ops/test_soft_margin_loss.py b/test/test_npu/test_network_ops/test_soft_margin_loss.py index a83172e56db3bd4c6b6247a2b622b1f03bd277c0..9c8308738fee4d74c75dd8bcb07f848c68cc025a 100644 --- a/test/test_npu/test_network_ops/test_soft_margin_loss.py +++ b/test/test_npu/test_network_ops/test_soft_margin_loss.py @@ -92,7 +92,8 @@ class TestSoftMarginLoss(TestCase): self.assertRtolEqual(cpu_output, npu_output) def test_soft_margin_loss_float16_sum(self, device): - npu_input1, npu_input2 = self.generate_data(-2, 2, (37, 8, 20, 20, 5, 8, 10, 8), (37, 8, 20, 20, 1, 1, 1, 1), np.float16) + npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 8, 2, 2, 5, 8, 2, 8), + (1, 8, 2, 2, 1, 1, 1, 1), np.float16) cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 
"sum") npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum") self.assertRtolEqual(cpu_output, npu_output) diff --git a/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..f75f627040a738f9c4ee208c98458fb6f2966ba2 --- /dev/null +++ b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py @@ -0,0 +1,91 @@ +# Copyright (c) 2020 Huawei Technologies Co., Ltd +# Copyright (c) 2019, Facebook CORPORATION. +# All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import numpy as np +import sys +import copy +from common_utils import TestCase, run_tests +from common_device_type import dtypes, instantiate_device_type_tests +from util_test import create_common_tensor + +class TestUpsampleBicubic2dBackward(TestCase): + + def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.detach().numpy() + return output_grad + + def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.to("cpu").detach().numpy() + return output_grad + + + def test_upsample_bicubic2d_common_shape_format(self, device): + shape_format = [ + [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255], + [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255], + [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255], + [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255], + [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255], + [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0] + ] + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6]) + cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4]) + npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4]) + self.assertRtolEqual(cpu_output, npu_output) + + + def test_upsample_bicubic2d_float16_shape_format(self, device): + def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w): + input1 = input1.to(torch.float32) + input1.requires_grad = True + output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w) + output.backward(torch.ones_like(output)) + output_grad = input1.grad + output_grad = output_grad.detach().numpy() + output_grad = output_grad.astype(np.float16) + 
return output_grad + + shape_format = [ + [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255], + [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255], + [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0], + [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255], + [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255], + [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255], + [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0] + ] + + for item in shape_format: + cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6]) + cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4]) + npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4]) + self.assertRtolEqual(cpu_output, npu_output) + +instantiate_device_type_tests(TestUpsampleBicubic2dBackward, globals(), except_for='cpu') +if __name__ == "__main__": + run_tests()
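Note: the renamed and newly added tests above all follow one CPU-vs-NPU comparison pattern: build a matching CPU/NPU tensor pair with create_common_tensor, run the operator on both devices, and compare with assertRtolEqual under an explicit tolerance. Below is a minimal sketch of that pattern, assuming the same test harness modules used by these files (common_utils, common_device_type, util_test) and an available NPU device; the class name, test name, and the use of torch.add as the operator are illustrative placeholders only.

import torch
import numpy as np
from common_utils import TestCase, run_tests
from common_device_type import instantiate_device_type_tests
from util_test import create_common_tensor

class TestPatternExample(TestCase):
    def cpu_op_exec(self, input1, input2):
        # Reference result computed on the CPU.
        output = torch.add(input1, input2)
        return output.numpy()

    def npu_op_exec(self, input1, input2):
        # Same operator on the NPU; move the result back to the CPU for comparison.
        output = torch.add(input1, input2)
        return output.to("cpu").numpy()

    def test_add_shape_format_fp32(self, device):
        # Each entry is [dtype, npu format (-1 = default), shape], as in the tests above.
        shape_format = [
            [np.float32, -1, (64, 10, 16)],
            [np.float32, -1, (2, 3, 4, 5)],
        ]
        for item in shape_format:
            cpu_input1, npu_input1 = create_common_tensor(item, 1, 10)
            cpu_input2, npu_input2 = create_common_tensor(item, 1, 10)
            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
            npu_output = self.npu_op_exec(npu_input1, npu_input2)
            self.assertRtolEqual(cpu_output, npu_output, 0.001)

instantiate_device_type_tests(TestPatternExample, globals(), except_for="cpu")
if __name__ == "__main__":
    run_tests()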