diff --git a/patch/npu.patch b/patch/npu.patch
index 5c7fa69d23b0705a6b5dda56e75ef804a3ff2321..a46a27034ab9d7433a4e60c9d68d86898a340dab 100644
--- a/patch/npu.patch
+++ b/patch/npu.patch
@@ -1,6 +1,6 @@
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt
 --- pytorch-v1.5.0/aten/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/CMakeLists.txt	2021-06-25 16:37:35.486258833 +0800
++++ pytorch-develop/aten/CMakeLists.txt	2021-07-05 14:59:26.416336304 +0800
 @@ -22,8 +22,10 @@
  set(ATen_CPU_INCLUDE)
  set(ATen_THIRD_PARTY_INCLUDE)
@@ -49,9 +49,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE)
  set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)
  set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt
 --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/CMakeLists.txt	2021-06-25 16:37:35.486258833 +0800
++++ pytorch-develop/aten/src/ATen/CMakeLists.txt	2021-07-05 14:59:26.416336304 +0800
 @@ -67,6 +67,9 @@
  FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h")
  FILE(GLOB native_cpu_h "native/cpu/*.h")
@@ -127,9 +127,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE)
  set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE)
  set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h
 --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h	2021-06-25 16:37:35.494258894 +0800
++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h	2021-07-05 14:59:26.424336365 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -168,9 +168,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      }
      catchallKernel_ = std::move(kernel);
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py
 --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/function_wrapper.py	2021-06-25 16:37:35.502258955 +0800
++++ pytorch-develop/aten/src/ATen/function_wrapper.py	2021-07-05 14:59:26.432336426 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -353,9 +353,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
      for declaration in declarations:
          for option in declaration['options']:
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py
 --- pytorch-v1.5.0/aten/src/ATen/gen.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/gen.py	2021-06-25 16:37:35.502258955 +0800
++++ pytorch-develop/aten/src/ATen/gen.py	2021-07-05 14:59:26.432336426 +0800
 @@ -1,3 +1,18 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -511,9 +511,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +    npu_file_manager.write_outputs(options.output_dependencies + "-npu")
  else:
      generate_outputs()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp	2021-06-25 16:37:35.514259047 +0800
++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp	2021-07-05 14:59:26.444336518 +0800
 @@ -339,20 +339,20 @@
  
  void hardsigmoid_backward_kernel(TensorIterator& iter) {
@@ -539,9 +539,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
            return Vec::blendv(kZeroVec, grad_val * kOneSixthVec, gradNonZeroMask);
          });
    });
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/Memory.cpp	2021-06-25 16:37:35.506258986 +0800
++++ pytorch-develop/aten/src/ATen/native/Memory.cpp	2021-07-05 14:59:26.440336488 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -594,9 +594,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    auto storage = Storage(
        self.dtype(),
        detail::computeStorageSize(self.sizes(), self.strides()),
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml
 --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml	2021-06-25 16:37:35.526259138 +0800
++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml	2021-07-05 14:59:26.460336640 +0800
 @@ -1,6 +1,5 @@
  # See README.md in this directory for more guidance
  
@@ -998,7 +998,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
    variants: function
-@@ -503,13 +598,17 @@
+@@ -503,6 +598,8 @@
      CPU: bernoulli_tensor_cpu_
      CUDA: bernoulli_tensor_cuda_
    supports_named_tensor: True
@@ -1007,11 +1007,10 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
    variants: method
-   dispatch:
+@@ -510,6 +607,8 @@
      CPU: bernoulli_scalar_cpu_
      CUDA: bernoulli_scalar_cuda_
--  supports_named_tensor: True
-+  supports_named_tensor: True 
+   supports_named_tensor: True
 +  npu_dispatch:
 +    NPU: bernoulli_npu_
  
@@ -1394,7 +1393,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: cosh_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -825,13 +1030,17 @@
+@@ -825,12 +1030,16 @@
    dispatch:
      CPU: _cosh__cpu
      CUDA: _cosh__cuda
@@ -1406,13 +1405,11 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    dispatch:
      CPU: _cosh_out_cpu
      CUDA: _cosh_out_cuda
--
 +  npu_dispatch:
 +    NPU: cosh_out_npu
-+    
+ 
  - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
    use_c10_dispatcher: full
- 
 @@ -897,6 +1106,50 @@
    dispatch:
      CUDA: cudnn_convolution_transpose_backward_weight
@@ -1514,7 +1511,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
    supports_named_tensor: True
-@@ -976,25 +1245,33 @@
+@@ -976,20 +1245,28 @@
    supports_named_tensor: True
  
  - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
@@ -1544,12 +1541,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: det(Tensor self) -> Tensor
    use_c10_dispatcher: full
-   variants: function, method
--
-+    
- - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
-   use_c10_dispatcher: full
-   variants: function, method
 @@ -1013,6 +1290,8 @@
  
  - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)
@@ -1601,7 +1592,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: dot(Tensor self, Tensor tensor) -> Tensor
    use_c10_dispatcher: full
-@@ -1057,30 +1346,42 @@
+@@ -1057,29 +1346,41 @@
    dispatch:
      CPU: legacy::cpu::_th_dot
      CUDA: legacy::cuda::_th_dot
@@ -1638,13 +1629,11 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    dispatch:
      CPU: embedding_renorm_cpu_
      CUDA: embedding_renorm_cuda_
--
 +  npu_dispatch:
 +    NPU: embedding_renorm_npu_
-+  
+ 
  - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
    use_c10_dispatcher: full
- 
 @@ -1099,6 +1400,8 @@
    dispatch:
      CPU: _embedding_bag_cpu
@@ -1835,7 +1824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      SparseCPU: floor_divide_sparse
      SparseCUDA: floor_divide_sparse
    supports_named_tensor: True
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: floor_divide_npu
  
  - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -1844,7 +1833,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      SparseCPU: floor_divide_sparse_
      SparseCUDA: floor_divide_sparse_
    supports_named_tensor: True
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: floor_divide_npu_
  
  - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1853,13 +1842,13 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      SparseCPU: floor_divide_out_sparse_zerodim
      SparseCUDA: floor_divide_out_sparse_zerodim
    supports_named_tensor: True
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: floor_divide_out_npu
  
  - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
    variants: function, method
    supports_named_tensor: True
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: floor_divide_npu
  
  - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -1915,7 +1904,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
    use_c10_dispatcher: full
-@@ -1390,32 +1768,53 @@
+@@ -1390,23 +1768,39 @@
    dispatch:
      CPU: grid_sampler_3d_cpu
      CUDA: grid_sampler_3d_cuda
@@ -1942,23 +1931,20 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +    NPU: hamming_window_npu
  
  - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
--
 +  npu_dispatch:
 +    NPU: hamming_window_npu
-+     
+ 
  - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
--
 +  npu_dispatch:
 +    NPU: hamming_window_npu
-+     
+ 
  - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
--
 +  npu_dispatch:
 +    NPU: hamming_window_npu
-+    
+ 
  - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
    use_c10_dispatcher: full
- 
+@@ -1414,8 +1808,13 @@
  - func: ger(Tensor self, Tensor vec2) -> Tensor
    use_c10_dispatcher: full
    variants: function, method
@@ -2072,7 +2058,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    dispatch:
      CPU: kthvalue_out_cpu
      CUDA: kthvalue_out_cuda
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: kthvalue_out_npu
  
  - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -2083,7 +2069,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
    supports_named_tensor: True
-+  npu_dispatch: 
++  npu_dispatch:
 +    NPU: kthvalue_out_npu
  
  - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
@@ -2104,7 +2090,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
    python_module: nn
-@@ -1622,26 +2055,36 @@
+@@ -1622,46 +2055,64 @@
    use_c10_dispatcher: full
  
  - func: linspace(Scalar start, Scalar end, int steps=100, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2141,7 +2127,26 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: log10(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -1662,6 +2105,8 @@
+   supports_named_tensor: True
+   variants: function, method
++  npu_dispatch:
++    NPU: log10_npu
+ 
+ - func: log10_(Tensor(a!) self) -> Tensor(a!)
+   supports_named_tensor: True
+   variants: function, method
++  npu_dispatch:
++    NPU: log10_npu_
+ 
+ - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+   supports_named_tensor: True
+   dispatch:
+     CPU: log10_out
+     CUDA: log10_out
++  npu_dispatch:
++    NPU: log10_out_npu
+ 
+ - func: log1p(Tensor self) -> Tensor
    use_c10_dispatcher: full
    supports_named_tensor: True
    variants: function, method
@@ -2150,7 +2155,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: log1p_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -1671,6 +2116,8 @@
+@@ -1671,6 +2122,8 @@
      CUDA: log1p_
      SparseCPU: log1p_sparse_
      SparseCUDA: log1p_sparse_
@@ -2159,7 +2164,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -1679,67 +2126,95 @@
+@@ -1679,67 +2132,95 @@
      CUDA: log1p_out
      SparseCPU: log1p_out_sparse
      SparseCUDA: log1p_out_sparse
@@ -2255,7 +2260,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
    use_c10_dispatcher: full
-@@ -1748,9 +2223,13 @@
+@@ -1748,9 +2229,13 @@
    use_c10_dispatcher: full
    variants: function, method
    supports_named_tensor: True
@@ -2269,7 +2274,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor
    use_c10_dispatcher: full
-@@ -1761,26 +2240,40 @@
+@@ -1761,26 +2246,40 @@
  - func: matrix_power(Tensor self, int n) -> Tensor
    use_c10_dispatcher: full
    variants: function, method
@@ -2310,7 +2315,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
-@@ -1791,6 +2284,8 @@
+@@ -1791,6 +2290,8 @@
  
  - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
    supports_named_tensor: True
@@ -2319,7 +2324,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
    requires_tensor: True
-@@ -1801,6 +2296,8 @@
+@@ -1801,6 +2302,8 @@
    requires_tensor: True
    dispatch:
      QuantizedCPU: quantized_max_pool2d
@@ -2328,7 +2333,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
    supports_named_tensor: True
-@@ -1814,6 +2311,8 @@
+@@ -1814,6 +2317,8 @@
      CPU: mean_cpu_gpu
      CUDA: mean_cpu_gpu
      QuantizedCPU: quantized_mean_cpu
@@ -2337,7 +2342,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
    variants: function, method
-@@ -1822,6 +2321,8 @@
+@@ -1822,6 +2327,8 @@
      CPU: mean_cpu_gpu
      CUDA: mean_cpu_gpu
      QuantizedCPU: quantized_mean_cpu
@@ -2346,7 +2351,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mean.out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -1829,47 +2330,73 @@
+@@ -1829,47 +2336,73 @@
      CPU: mean_out_cpu_gpu
      CUDA: mean_out_cpu_gpu
      QuantizedCPU: quantized_mean_out_cpu
@@ -2420,7 +2425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
  
-@@ -1958,6 +2485,8 @@
+@@ -1958,6 +2491,8 @@
      CUDA: legacy::cuda::_th_mm
      SparseCPU: _sparse_mm
      SparseCUDA: _sparse_mm
@@ -2429,7 +2434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
-@@ -1966,6 +2495,8 @@
+@@ -1966,6 +2501,8 @@
      CUDA: legacy::cuda::_th_mm_out
      SparseCPU: _sparse_mm_out
      SparseCUDA: _sparse_mm_out
@@ -2438,7 +2443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
-@@ -1974,9 +2505,13 @@
+@@ -1974,9 +2511,13 @@
  - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
    supports_named_tensor: True
    variants: function, method
@@ -2452,7 +2457,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
    variants: function, method
-@@ -1994,6 +2529,8 @@
+@@ -1994,6 +2535,8 @@
      SparseCPU: mul_sparse
      SparseCUDA: mul_sparse
      MkldnnCPU: mkldnn_mul
@@ -2461,7 +2466,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-@@ -2004,6 +2541,8 @@
+@@ -2004,6 +2547,8 @@
      SparseCPU: mul_sparse_
      SparseCUDA: mul_sparse_
      MkldnnCPU: mkldnn_mul_
@@ -2470,7 +2475,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
-@@ -2013,15 +2552,21 @@
+@@ -2013,15 +2558,21 @@
      SparseCPU: mul_out_sparse_cpu
      SparseCUDA: mul_out_sparse_cuda
      MkldnnCPU: mkldnn_mul_out
@@ -2492,7 +2497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mv(Tensor self, Tensor vec) -> Tensor
    use_c10_dispatcher: full
-@@ -2030,12 +2575,16 @@
+@@ -2030,12 +2581,16 @@
      CPU: mv_cpu
      CUDA: legacy::cuda::_th_mv
    supports_named_tensor: True
@@ -2509,7 +2514,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mvlgamma(Tensor self, int p) -> Tensor
    use_c10_dispatcher: full
-@@ -2052,6 +2601,8 @@
+@@ -2052,6 +2607,8 @@
      CUDA: narrow_copy_dense
      SparseCPU: narrow_copy_sparse
      SparseCUDA: narrow_copy_sparse
@@ -2518,7 +2523,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
    variants: function, method
-@@ -2068,6 +2619,8 @@
+@@ -2068,6 +2625,8 @@
      CPU: batch_norm_cpu
      CUDA: batch_norm_cuda
      MkldnnCPU: mkldnn_batch_norm
@@ -2527,7 +2532,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
    dispatch:
-@@ -2098,6 +2651,8 @@
+@@ -2098,6 +2657,8 @@
    dispatch:
      CPU: batch_norm_backward_cpu
      CUDA: batch_norm_backward_cuda
@@ -2536,7 +2541,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
    dispatch:
-@@ -2117,6 +2672,8 @@
+@@ -2117,6 +2678,8 @@
  
  - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
    variants: function
@@ -2545,7 +2550,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
    variants: function
-@@ -2129,42 +2686,60 @@
+@@ -2129,42 +2692,60 @@
  
  - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
    device_guard: False
@@ -2591,10 +2596,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _pdist_forward(Tensor self, float p=2) -> Tensor
    use_c10_dispatcher: full
--
 +  npu_dispatch:
 +    NPU: _pdist_forward_npu
-+    
+ 
  - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
    use_c10_dispatcher: full
  
@@ -2602,33 +2606,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- func: cosine_similarity(Tensor input, Tensor input2, int dim=1, float eps=1e-08) -> Tensor
    use_c10_dispatcher: full
    variants: function
--
-+  
+ 
  - func: permute(Tensor(a) self, int[] dims) -> Tensor(a)
 -  variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
 +  variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
  
  # Only exposed from C++ -- in Python,
  # we expose it as an attribute `T`, not a function.
-@@ -2178,7 +2753,7 @@
- 
- - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
-   use_c10_dispatcher: full
--
-+    
- - func: is_pinned(Tensor self) -> bool
-   use_c10_dispatcher: full
-   variants: method
-@@ -2195,7 +2770,7 @@
- - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
-   use_c10_dispatcher: full
-   variants: function
--
-+ 
- - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
- 
- - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
-@@ -2253,54 +2828,82 @@
+@@ -2253,54 +2834,82 @@
    supports_named_tensor: True
  
  - func: randperm(int n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2712,7 +2697,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor
    use_c10_dispatcher: full
-@@ -2316,6 +2919,8 @@
+@@ -2316,6 +2925,8 @@
  - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor
    use_c10_dispatcher: full
    variants: function, method
@@ -2721,7 +2706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: reshape(Tensor self, int[] shape) -> Tensor
    variants: function, method
-@@ -2337,16 +2942,22 @@
+@@ -2337,16 +2948,22 @@
    use_c10_dispatcher: full
    supports_named_tensor: True
    variants: function, method
@@ -2744,7 +2729,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
  
-@@ -2360,6 +2971,8 @@
+@@ -2360,6 +2977,8 @@
      CUDA: relu
      MkldnnCPU: mkldnn_relu
      QuantizedCPU: quantized_relu
@@ -2753,7 +2738,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: relu_(Tensor(a!) self) -> Tensor(a!)
-@@ -2370,6 +2983,8 @@
+@@ -2370,6 +2989,8 @@
      CUDA: relu_
      MkldnnCPU: mkldnn_relu_
      QuantizedCPU: quantized_relu_
@@ -2762,7 +2747,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: prelu(Tensor self, Tensor weight) -> Tensor
    use_c10_dispatcher: full
-@@ -2377,12 +2992,16 @@
+@@ -2377,12 +2998,16 @@
    dispatch:
      CPU: prelu_cpu
      CUDA: prelu_cuda
@@ -2779,17 +2764,16 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gelu(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -2390,36 +3009,50 @@
+@@ -2390,6 +3015,8 @@
    dispatch:
      CPU: gelu_cpu
      CUDA: gelu_cuda
--
 +  npu_dispatch:
 +     NPU: gelu_npu
-+     
+ 
  - func: gelu_backward(Tensor grad, Tensor self) -> Tensor
    use_c10_dispatcher: full
-   python_module: nn
+@@ -2397,29 +3024,41 @@
    dispatch:
      CPU: gelu_backward_cpu
      CUDA: gelu_backward_cuda
@@ -2831,7 +2815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
    variants: function, method
-@@ -2433,15 +3066,18 @@
+@@ -2433,14 +3072,21 @@
  
  - func: selu(Tensor self) -> Tensor
    use_c10_dispatcher: full
@@ -2844,14 +2828,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
    use_c10_dispatcher: full
++  npu_dispatch:
++    NPU: celu_npu
  
  - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
- 
 -
++  npu_dispatch:
++    NPU: celu_npu_
+ 
  - func: sigmoid(Tensor self) -> Tensor
    use_c10_dispatcher: full
-   supports_named_tensor: True
-@@ -2451,6 +3087,8 @@
+@@ -2451,6 +3097,8 @@
      CUDA: sigmoid
      QuantizedCPU: quantized_sigmoid
      MkldnnCPU: mkldnn_sigmoid
@@ -2860,7 +2847,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -2459,36 +3097,52 @@
+@@ -2459,36 +3107,52 @@
      CPU: sigmoid_
      CUDA: sigmoid_
      MkldnnCPU: mkldnn_sigmoid_
@@ -2913,7 +2900,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Returns a copy of this `Variable` that is detached from its autograd graph.
  # This method is OK to call if the `Variable` is a view.
-@@ -2533,6 +3187,8 @@
+@@ -2533,6 +3197,8 @@
  
  - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
    variants: function, method
@@ -2922,7 +2909,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: smm(Tensor self, Tensor mat2) -> Tensor
    use_c10_dispatcher: full
-@@ -2542,10 +3198,14 @@
+@@ -2542,10 +3208,14 @@
  - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
    variants: function, method
    supports_named_tensor: True
@@ -2937,7 +2924,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
    use_c10_dispatcher: full
-@@ -2553,12 +3213,16 @@
+@@ -2553,12 +3223,16 @@
      CPU: softmax_cpu
      CUDA: softmax_cuda
      MkldnnCPU: mkldnn_softmax
@@ -2954,7 +2941,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
    variants: function, method
-@@ -2609,8 +3273,12 @@
+@@ -2609,8 +3283,12 @@
      SparseCUDA: _sspaddmm_out_cuda
  
  - func: stack(Tensor[] tensors, int dim=0) -> Tensor
@@ -2967,7 +2954,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # The signature is designed to be consistent with librosa except that it is
  # missing the `pad_mode` and `center` arguments, which are taken care of at
-@@ -2633,20 +3301,30 @@
+@@ -2633,20 +3311,30 @@
  - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
    variants: function, method
    supports_named_tensor: True
@@ -2998,7 +2985,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: sum_to_size(Tensor self, int[] size) -> Tensor
    variants: method
-@@ -2656,13 +3334,19 @@
+@@ -2656,13 +3344,19 @@
    use_c10_dispatcher: full
    supports_named_tensor: True
    variants: function, method
@@ -3018,7 +3005,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: square(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -2677,51 +3361,81 @@
+@@ -2677,51 +3371,81 @@
    use_c10_dispatcher: full
    variants: function, method
    supports_named_tensor: True
@@ -3081,19 +3068,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
--
 +  npu_dispatch:
 +    NPU: prod_out_npu
 +    #NPU: prod_out_npu_ext
-+    
+ 
  - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
    variants: function, method
    supports_named_tensor: True
--
 +  npu_dispatch:
 +    NPU: prod_npu
 +    #NPU: prod_npu_ext
-+    
+ 
  - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
 -
@@ -3103,7 +3088,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: t(Tensor(a) self) -> Tensor(a)
    device_guard: False
-@@ -2736,6 +3450,8 @@
+@@ -2736,6 +3460,8 @@
    use_c10_dispatcher: full
    supports_named_tensor: True
    variants: function, method
@@ -3112,7 +3097,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: tan_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -2743,12 +3459,16 @@
+@@ -2743,12 +3469,16 @@
    dispatch:
      CPU: _tan__cpu
      CUDA: _tan__cuda
@@ -3129,7 +3114,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: tanh(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -2758,6 +3478,8 @@
+@@ -2758,6 +3488,8 @@
      CPU: tanh
      CUDA: tanh
      QuantizedCPU: quantized_tanh
@@ -3138,7 +3123,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: tanh_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -2765,12 +3487,16 @@
+@@ -2765,12 +3497,16 @@
    dispatch:
      CPU: _tanh__cpu
      CUDA: _tanh__cuda
@@ -3155,7 +3140,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
    variants: function
-@@ -2783,6 +3509,8 @@
+@@ -2783,6 +3519,8 @@
    dispatch:
      CPU: threshold
      CUDA: threshold_cuda
@@ -3164,7 +3149,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)
    variants: function
-@@ -2790,12 +3518,16 @@
+@@ -2790,12 +3528,16 @@
    dispatch:
      CPU: threshold_
      CUDA: threshold__cuda
@@ -3181,7 +3166,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
    use_c10_dispatcher: full
-@@ -2803,6 +3535,8 @@
+@@ -2803,6 +3545,8 @@
    dispatch:
      CPU: threshold_backward
      CUDA: threshold_backward_cuda
@@ -3190,7 +3175,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
    variants: function, method
-@@ -2835,18 +3569,24 @@
+@@ -2835,18 +3579,24 @@
    use_c10_dispatcher: full
    python_module: nn
    variants: function
@@ -3215,7 +3200,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # default int[] value [0,1] should not add space after comma, since native_parse.py uses ', ' to split args
  
-@@ -2872,6 +3612,8 @@
+@@ -2872,6 +3622,8 @@
      CUDA: true_divide
      SparseCPU: true_divide_sparse
      SparseCUDA: true_divide_sparse
@@ -3224,7 +3209,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-@@ -2881,6 +3623,8 @@
+@@ -2881,6 +3633,8 @@
      CUDA: true_divide_
      SparseCPU: true_divide_sparse_
      SparseCUDA: true_divide_sparse_
@@ -3233,7 +3218,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
-@@ -2889,31 +3633,43 @@
+@@ -2889,31 +3643,43 @@
      CUDA: true_divide_out
      SparseCPU: true_divide_out_sparse_zerodim
      SparseCUDA: true_divide_out_sparse_zerodim
@@ -3277,7 +3262,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: type_as(Tensor self, Tensor other) -> Tensor
    use_c10_dispatcher: full
-@@ -2956,6 +3712,8 @@
+@@ -2956,6 +3722,8 @@
    dispatch:
      CPU: _unique2_cpu
      CUDA: _unique2_cuda
@@ -3286,7 +3271,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _unsafe_view(Tensor self, int[] size) -> Tensor
  
-@@ -2971,32 +3729,48 @@
+@@ -2971,32 +3739,48 @@
    use_c10_dispatcher: full
    variants: function, method
    supports_named_tensor: True
@@ -3335,7 +3320,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: view_as(Tensor self, Tensor other) -> Tensor
    use_c10_dispatcher: full
-@@ -3009,17 +3783,23 @@
+@@ -3009,13 +3793,19 @@
  - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
    use_c10_dispatcher: full
    variants: function, method
@@ -3355,12 +3340,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
    variants: function
--
-+  
- # VariableType::_weight_norm does not want to be given a gap in the autograd graph,
- # so we don't define "dispatch" variants for it.
- - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
-@@ -3041,13 +3821,21 @@
+@@ -3041,13 +3831,21 @@
  
  - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
    device_guard: False
@@ -3382,7 +3362,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor
    use_c10_dispatcher: full
-@@ -3100,25 +3888,37 @@
+@@ -3100,25 +3898,37 @@
  
  - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor
    dispatch:
@@ -3422,7 +3402,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
    variants: function, method
-@@ -3162,12 +3962,16 @@
+@@ -3162,12 +3972,16 @@
      SparseCUDA: clone_sparse
      MkldnnCPU: mkldnn_clone
      QuantizedCPU: quantized_clone
@@ -3439,7 +3419,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -3176,6 +3980,8 @@
+@@ -3176,6 +3990,8 @@
      CUDA: pow_out
      SparseCPU: pow_out_sparse_scalar
      SparseCUDA: pow_out_sparse_scalar
@@ -3448,7 +3428,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
    use_c10_dispatcher: full
-@@ -3186,6 +3992,8 @@
+@@ -3186,6 +4002,8 @@
      CUDA: pow
      SparseCPU: pow_sparse_scalar
      SparseCUDA: pow_sparse_scalar
@@ -3457,7 +3437,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: zero_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -3196,6 +4004,14 @@
+@@ -3196,6 +4014,14 @@
      SparseCPU: zero_sparse_
      SparseCUDA: zero_sparse_
      MkldnnCPU: mkldnn_zero_
@@ -3472,7 +3452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
    dispatch:
-@@ -3204,6 +4020,8 @@
+@@ -3204,6 +4030,8 @@
      SparseCPU: sub_out_sparse
      SparseCUDA: sub_out_sparse
    supports_named_tensor: True
@@ -3481,7 +3461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
    use_c10_dispatcher: full
-@@ -3213,6 +4031,8 @@
+@@ -3213,6 +4041,8 @@
      CUDA: sub
      SparseCPU: sub_sparse
      SparseCUDA: sub_sparse
@@ -3490,7 +3470,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
-@@ -3222,6 +4042,8 @@
+@@ -3222,6 +4052,8 @@
      CUDA: sub_
      SparseCPU: sub_sparse_
      SparseCUDA: sub_sparse_
@@ -3499,7 +3479,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  # For C++ only, until we have conversion from C++ numbers to Tensor
-@@ -3229,21 +4051,29 @@
+@@ -3229,21 +4061,29 @@
    use_c10_dispatcher: full
    variants: function, method
    supports_named_tensor: True
@@ -3529,7 +3509,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Functionally the same as addmm, but we give it a different derivative formula
  # that doesn't propagate gradients to non-present entries on sparse.
-@@ -3257,6 +4087,8 @@
+@@ -3257,6 +4097,8 @@
      CUDA: legacy::cuda::_th_addmm_out
      SparseCPU: addmm_out_sparse_dense_cpu
      SparseCUDA: addmm_out_sparse_dense_cuda
@@ -3538,7 +3518,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
-@@ -3267,6 +4099,8 @@
+@@ -3267,6 +4109,8 @@
      CUDA: legacy::cuda::_th_addmm
      SparseCPU: addmm_sparse_dense_cpu
      SparseCUDA: addmm_sparse_dense_cuda
@@ -3547,7 +3527,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
-@@ -3278,9 +4112,10 @@
+@@ -3278,9 +4122,10 @@
      # broadcasting
      SparseCPU: s_addmm_sparse_dense_cpu_
      SparseCUDA: s_addmm_sparse_dense_cuda_
@@ -3559,7 +3539,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # NOTE [ Sparse: autograd and API ]
  #
  #
-@@ -3396,7 +4231,6 @@
+@@ -3396,7 +4241,6 @@
  # shared. In other words, their outputs are non-differentiable views of the
  # sparse tensor.
  
@@ -3567,7 +3547,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given
  # the default would never make sense.
  - func: sparse_coo_tensor.size(int[] size, *, ScalarType dtype, Layout layout, Device device, bool pin_memory=False) -> Tensor
-@@ -3433,7 +4267,6 @@
+@@ -3433,7 +4277,6 @@
      SparseCUDA: sparse_resize_and_clear_
    requires_tensor: True
  
@@ -3575,7 +3555,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: sparse_mask(Tensor self, Tensor mask) -> Tensor
    use_c10_dispatcher: full
    variants: method
-@@ -3442,7 +4275,6 @@
+@@ -3442,7 +4285,6 @@
      SparseCUDA: sparse_mask_cuda
    requires_tensor: True
  
@@ -3583,7 +3563,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: to_dense(Tensor self) -> Tensor
    use_c10_dispatcher: full
    variants: method
-@@ -3474,7 +4306,6 @@
+@@ -3474,7 +4316,6 @@
    requires_tensor: True
    device_guard: False
  
@@ -3591,7 +3571,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: dense_dim(Tensor self) -> int
    use_c10_dispatcher: full
    variants: method
-@@ -3494,7 +4325,6 @@
+@@ -3494,7 +4335,6 @@
    requires_tensor: True
    device_guard: False
  
@@ -3599,7 +3579,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: _nnz(Tensor self) -> int
    use_c10_dispatcher: full
    variants: method
-@@ -3504,7 +4334,6 @@
+@@ -3504,7 +4344,6 @@
    requires_tensor: True
    device_guard: False
  
@@ -3607,7 +3587,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: coalesce(Tensor self) -> Tensor
    use_c10_dispatcher: full
    variants: method
-@@ -3513,7 +4342,6 @@
+@@ -3513,7 +4352,6 @@
      SparseCUDA: coalesce_sparse_cuda
    requires_tensor: True
  
@@ -3615,7 +3595,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: is_coalesced(Tensor self) -> bool
    use_c10_dispatcher: full
    variants: method
-@@ -3524,7 +4352,6 @@
+@@ -3524,7 +4362,6 @@
    device_guard: False
    supports_named_tensor: True
  
@@ -3623,7 +3603,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: _indices(Tensor(a) self) -> Tensor(a)
    variants: method
    dispatch:
-@@ -3568,7 +4395,6 @@
+@@ -3568,7 +4405,6 @@
    requires_tensor: True
    device_guard: False
  
@@ -3631,7 +3611,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
    dispatch:
      SparseCPU: hspmm_out_sparse_cpu
-@@ -3630,11 +4456,15 @@
+@@ -3630,11 +4466,15 @@
    variants: function
    dispatch:
      CPU: quantize_per_tensor_cpu
@@ -3647,7 +3627,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: dequantize(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -3713,20 +4543,28 @@
+@@ -3713,20 +4553,28 @@
    variants: method
    device_guard: False
    supports_named_tensor: True
@@ -3676,7 +3656,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: meshgrid(Tensor[] tensors) -> Tensor[]
  
-@@ -3765,6 +4603,8 @@
+@@ -3765,6 +4613,8 @@
    dispatch:
      CPU: _local_scalar_dense_cpu
      CUDA: _local_scalar_dense_cuda
@@ -3685,7 +3665,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    variants: function
    supports_named_tensor: True
  
-@@ -3791,10 +4631,16 @@
+@@ -3791,10 +4641,16 @@
  
  # RNN cells and layers
  - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
@@ -3702,7 +3682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
  
-@@ -3839,10 +4685,14 @@
+@@ -3839,10 +4695,14 @@
  
  # PackedSequence utilities
  - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)
@@ -3717,7 +3697,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # wrappers for legacy TH methods
  
-@@ -3852,6 +4702,8 @@
+@@ -3852,6 +4712,8 @@
    dispatch:
      CPU: set_
      CUDA: set_
@@ -3726,7 +3706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!)
    variants: method
-@@ -3860,6 +4712,8 @@
+@@ -3860,6 +4722,8 @@
      CPU: legacy::cpu::_th_set_
      CUDA: legacy::cuda::_th_set_
      QuantizedCPU: set_storage
@@ -3735,7 +3715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
    variants: method
-@@ -3867,12 +4721,16 @@
+@@ -3867,12 +4731,16 @@
    dispatch:
      CPU: set_tensor_
      CUDA: set_tensor_
@@ -3752,7 +3732,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: set_quantizer_(Tensor(a!) self, ConstQuantizerPtr quantizer) -> Tensor(a!)
    variants: method
-@@ -3892,6 +4750,8 @@
+@@ -3892,6 +4760,8 @@
    dispatch:
      CPU: masked_fill__cpu
      CUDA: masked_fill__cuda
@@ -3761,7 +3741,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
-@@ -3904,6 +4764,8 @@
+@@ -3904,6 +4774,8 @@
    dispatch:
      CPU: masked_fill__cpu
      CUDA: masked_fill__cuda
@@ -3770,7 +3750,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
-@@ -3916,6 +4778,8 @@
+@@ -3916,6 +4788,8 @@
    dispatch:
      CPU: masked_scatter__cpu
      CUDA: masked_scatter__cuda
@@ -3779,7 +3759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
    use_c10_dispatcher: full
-@@ -3929,25 +4793,35 @@
+@@ -3929,25 +4803,35 @@
      CUDA: view
      MkldnnCPU: mkldnn_view
      QuantizedCPU: view
@@ -3815,7 +3795,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
    variants: method
-@@ -3955,11 +4829,15 @@
+@@ -3955,11 +4839,15 @@
    dispatch:
      CPU: legacy::cpu::_th_index_fill_
      CUDA: legacy::cuda::_th_index_fill_
@@ -3831,7 +3811,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
    variants: method
-@@ -3967,11 +4845,15 @@
+@@ -3967,11 +4855,15 @@
      CPU: index_fill_
      CUDA: index_fill_
    supports_named_tensor: True
@@ -3847,7 +3827,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)
    variants: method
-@@ -3994,6 +4876,8 @@
+@@ -3994,6 +4886,8 @@
    dispatch:
      CPU: scatter_cpu_
      CUDA: legacy::cuda::_th_scatter_
@@ -3856,7 +3836,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
    use_c10_dispatcher: full
-@@ -4004,6 +4888,8 @@
+@@ -4004,6 +4898,8 @@
    dispatch:
      CPU: scatter_fill_cpu_
      CUDA: legacy::cuda::_th_scatter_
@@ -3865,7 +3845,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
    use_c10_dispatcher: full
-@@ -4020,81 +4906,127 @@
+@@ -4020,81 +4916,127 @@
    dispatch:
      CPU: scatter_add_cpu_
      CUDA: legacy::cuda::_th_scatter_add_
@@ -3966,16 +3946,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
    variants: method, function
--
 +  npu_dispatch:
 +    NPU: bitwise_and_npu
-+  
+ 
  - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
    variants: method
--
 +  npu_dispatch:
 +    NPU: bitwise_and_npu_
-+  
+ 
  - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
    variants: method
 +  npu_dispatch:
@@ -3995,7 +3973,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
    variants: method
-@@ -4107,70 +5039,106 @@
+@@ -4107,70 +5049,106 @@
    dispatch:
      CPU: bitwise_or_out
      CUDA: bitwise_or_out
@@ -4102,7 +4080,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
    variants: method
-@@ -4240,18 +5208,24 @@
+@@ -4240,18 +5218,24 @@
  - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
    supports_named_tensor: True
    variants: method
@@ -4127,7 +4105,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: digamma_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4266,6 +5240,8 @@
+@@ -4266,6 +5250,8 @@
    dispatch:
      CPU: legacy::cpu::_th_renorm_
      CUDA: legacy::cuda::_th_renorm_
@@ -4136,7 +4114,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4273,6 +5249,8 @@
+@@ -4273,6 +5259,8 @@
    dispatch:
      CPU: pow_
      CUDA: pow_
@@ -4145,7 +4123,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4280,53 +5258,71 @@
+@@ -4280,53 +5268,71 @@
    dispatch:
      CPU: pow_
      CUDA: pow_
@@ -4206,23 +4184,23 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      CPU: legacy::cpu::_th_addbmm_
      CUDA: legacy::cuda::_th_addbmm_
 +  npu_dispatch:
-+    NPU: addbmm_npu_  
++    NPU: addbmm_npu_
  
  - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
    dispatch:
      CPU: legacy::cpu::_th_addbmm_out
      CUDA: legacy::cuda::_th_addbmm_out
 +  npu_dispatch:
-+    NPU: addbmm_out_npu  
++    NPU: addbmm_out_npu
  
  - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
    use_c10_dispatcher: full
-@@ -4334,28 +5330,40 @@
+@@ -4334,28 +5340,40 @@
    dispatch:
      CPU: legacy::cpu::_th_addbmm
      CUDA: legacy::cuda::_th_addbmm
 +  npu_dispatch:
-+    NPU: addbmm_npu  
++    NPU: addbmm_npu
  
  - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
    variants: method
@@ -4258,7 +4236,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
-@@ -4380,6 +5388,8 @@
+@@ -4380,6 +5398,8 @@
    dispatch:
      CPU: legacy::cpu::_th_diag_out
      CUDA: legacy::cuda::_th_diag_out
@@ -4267,7 +4245,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: diag(Tensor self, int diagonal=0) -> Tensor
    use_c10_dispatcher: full
-@@ -4387,30 +5397,44 @@
+@@ -4387,30 +5407,44 @@
    dispatch:
      CPU: legacy::cpu::_th_diag
      CUDA: legacy::cuda::_th_diag
@@ -4312,7 +4290,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
    dispatch:
-@@ -4435,6 +5459,8 @@
+@@ -4435,6 +5469,8 @@
      CPU: ne_out
      CUDA: ne_out
      QuantizedCPU: ne_out_quantized_cpu
@@ -4321,7 +4299,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ne.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4444,6 +5470,8 @@
+@@ -4444,6 +5480,8 @@
      CPU: ne
      CUDA: ne
      QuantizedCPU: ne_quantized_cpu
@@ -4330,7 +4308,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4451,6 +5479,8 @@
+@@ -4451,6 +5489,8 @@
      CPU: ne_out
      CUDA: ne_out
      QuantizedCPU: ne_out_quantized_cpu
@@ -4339,7 +4317,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ne.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4460,6 +5490,8 @@
+@@ -4460,6 +5500,8 @@
      CPU: ne
      CUDA: ne
      QuantizedCPU: ne_quantized_cpu
@@ -4348,7 +4326,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4467,6 +5499,8 @@
+@@ -4467,6 +5509,8 @@
      CPU: eq_out
      CUDA: eq_out
      QuantizedCPU: eq_out_quantized_cpu
@@ -4357,7 +4335,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: eq.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4476,6 +5510,8 @@
+@@ -4476,6 +5520,8 @@
      CPU: eq
      CUDA: eq
      QuantizedCPU: eq_quantized_cpu
@@ -4366,7 +4344,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4483,6 +5519,8 @@
+@@ -4483,6 +5529,8 @@
      CPU: eq_out
      CUDA: eq_out
      QuantizedCPU: eq_out_quantized_cpu
@@ -4375,7 +4353,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: eq.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4492,6 +5530,8 @@
+@@ -4492,6 +5540,8 @@
      CPU: eq
      CUDA: eq
      QuantizedCPU: eq_quantized_cpu
@@ -4384,7 +4362,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4499,6 +5539,8 @@
+@@ -4499,6 +5549,8 @@
      CPU: ge_out
      CUDA: ge_out
      QuantizedCPU: ge_out_quantized_cpu
@@ -4393,7 +4371,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ge.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4508,6 +5550,8 @@
+@@ -4508,6 +5560,8 @@
      CPU: ge
      CUDA: ge
      QuantizedCPU: ge_quantized_cpu
@@ -4402,7 +4380,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4515,6 +5559,8 @@
+@@ -4515,6 +5569,8 @@
      CPU: ge_out
      CUDA: ge_out
      QuantizedCPU: ge_out_quantized_cpu
@@ -4411,7 +4389,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: ge.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4524,6 +5570,8 @@
+@@ -4524,6 +5580,8 @@
      CPU: ge
      CUDA: ge
      QuantizedCPU: ge_quantized_cpu
@@ -4420,7 +4398,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4531,6 +5579,8 @@
+@@ -4531,6 +5589,8 @@
      CPU: le_out
      CUDA: le_out
      QuantizedCPU: le_out_quantized_cpu
@@ -4429,7 +4407,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: le.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4540,6 +5590,8 @@
+@@ -4540,6 +5600,8 @@
      CPU: le
      CUDA: le
      QuantizedCPU: le_quantized_cpu
@@ -4438,7 +4416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4547,6 +5599,8 @@
+@@ -4547,6 +5609,8 @@
      CPU: le_out
      CUDA: le_out
      QuantizedCPU: le_out_quantized_cpu
@@ -4447,7 +4425,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: le.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4556,6 +5610,8 @@
+@@ -4556,6 +5620,8 @@
      CPU: le
      CUDA: le
      QuantizedCPU: le_quantized_cpu
@@ -4456,7 +4434,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4563,6 +5619,8 @@
+@@ -4563,6 +5629,8 @@
      CPU: gt_out
      CUDA: gt_out
      QuantizedCPU: gt_out_quantized_cpu
@@ -4465,7 +4443,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gt.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4572,6 +5630,8 @@
+@@ -4572,6 +5640,8 @@
      CPU: gt
      CUDA: gt
      QuantizedCPU: gt_quantized_cpu
@@ -4474,7 +4452,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4579,6 +5639,8 @@
+@@ -4579,6 +5649,8 @@
      CPU: gt_out
      CUDA: gt_out
      QuantizedCPU: gt_out_quantized_cpu
@@ -4483,7 +4461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gt.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4588,6 +5650,8 @@
+@@ -4588,6 +5660,8 @@
      CPU: gt
      CUDA: gt
      QuantizedCPU: gt_quantized_cpu
@@ -4492,7 +4470,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4595,6 +5659,8 @@
+@@ -4595,6 +5669,8 @@
      CPU: lt_out
      CUDA: lt_out
      QuantizedCPU: lt_out_quantized_cpu
@@ -4501,7 +4479,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lt.Scalar(Tensor self, Scalar other) -> Tensor
    supports_named_tensor: True
-@@ -4604,6 +5670,8 @@
+@@ -4604,6 +5680,8 @@
      CPU: lt
      CUDA: lt
      QuantizedCPU: lt_quantized_cpu
@@ -4510,7 +4488,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4611,6 +5679,8 @@
+@@ -4611,6 +5689,8 @@
      CPU: lt_out
      CUDA: lt_out
      QuantizedCPU: lt_out_quantized_cpu
@@ -4519,7 +4497,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lt.Tensor(Tensor self, Tensor other) -> Tensor
    supports_named_tensor: True
-@@ -4620,11 +5690,16 @@
+@@ -4620,11 +5700,16 @@
      CPU: lt
      CUDA: lt
      QuantizedCPU: lt_quantized_cpu
@@ -4536,7 +4514,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: take(Tensor self, Tensor index) -> Tensor
    use_c10_dispatcher: full
-@@ -4632,11 +5707,16 @@
+@@ -4632,11 +5717,16 @@
    dispatch:
      CPU: legacy::cpu::_th_take
      CUDA: legacy::cuda::_th_take
@@ -4553,7 +4531,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: index_select(Tensor self, int dim, Tensor index) -> Tensor
    use_c10_dispatcher: full
-@@ -4646,17 +5726,25 @@
+@@ -4646,17 +5736,25 @@
      CUDA: legacy::cuda::_th_index_select
      SparseCPU: index_select_sparse
      SparseCUDA: index_select_sparse
@@ -4579,7 +4557,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: masked_select(Tensor self, Tensor mask) -> Tensor
    use_c10_dispatcher: full
-@@ -4665,11 +5753,15 @@
+@@ -4665,11 +5763,15 @@
      CPU: masked_select_cpu
      CUDA: masked_select_cuda
    supports_named_tensor: True
@@ -4595,7 +4573,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: nonzero(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -4677,6 +5769,8 @@
+@@ -4677,6 +5779,8 @@
    dispatch:
      CPU: legacy::cpu::_th_nonzero
      CUDA: legacy::cuda::_th_nonzero
@@ -4604,7 +4582,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: nonzero_numpy(Tensor self) -> Tensor[]
    variants: method, function
-@@ -4685,6 +5779,8 @@
+@@ -4685,6 +5789,8 @@
    dispatch:
      CPU: gather_out_cpu
      CUDA: gather_out_cuda
@@ -4613,7 +4591,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
    use_c10_dispatcher: full
-@@ -4692,34 +5788,50 @@
+@@ -4692,34 +5798,50 @@
    dispatch:
      CPU: gather_cpu
      CUDA: gather_cuda
@@ -4621,10 +4599,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +    NPU: gather_npu
  
  - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
--
 +  npu_dispatch:
 +    NPU: gather_out_npu
-+    
+ 
  - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
    variants: method, function
 +  npu_dispatch:
@@ -4665,7 +4642,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR)
    dispatch:
-@@ -4826,9 +5938,13 @@
+@@ -4826,9 +5948,13 @@
      CUDA: legacy::cuda::_th_potri
  
  - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
@@ -4679,7 +4656,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor)
    variants: function
-@@ -4891,12 +6007,16 @@
+@@ -4891,12 +6017,16 @@
    dispatch:
      CPU: multinomial_out
      CUDA: multinomial_out
@@ -4696,7 +4673,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor)
    variants: function
-@@ -4947,6 +6067,8 @@
+@@ -4947,6 +6077,8 @@
    dispatch:
      CPU: erfinv
      CUDA: erfinv
@@ -4705,7 +4682,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: erfinv_(Tensor(a!) self) -> Tensor(a!)
    supports_named_tensor: True
-@@ -4954,26 +6076,36 @@
+@@ -4954,26 +6086,36 @@
    dispatch:
      CPU: _erfinv__cpu
      CUDA: _erfinv__cuda
@@ -4742,7 +4719,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
    use_c10_dispatcher: full
-@@ -4981,21 +6113,29 @@
+@@ -4981,21 +6123,29 @@
  
  - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
    supports_named_tensor: True
@@ -4772,7 +4749,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
    use_c10_dispatcher: full
-@@ -5003,6 +6143,8 @@
+@@ -5003,6 +6153,8 @@
    dispatch:
      CPU: lerp_cpu_scalar
      CUDA: lerp_cuda_scalar
@@ -4781,7 +4758,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
    use_c10_dispatcher: full
-@@ -5010,11 +6152,15 @@
+@@ -5010,11 +6162,15 @@
    dispatch:
      CPU: lerp_cpu_tensor
      CUDA: lerp_cuda_tensor
@@ -4797,7 +4774,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
    use_c10_dispatcher: full
-@@ -5022,11 +6168,15 @@
+@@ -5022,11 +6178,15 @@
    dispatch:
      CPU: legacy::cpu::_th_histc
      CUDA: _histc_cuda
@@ -4813,7 +4790,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor
    use_c10_dispatcher: full
-@@ -5034,11 +6184,15 @@
+@@ -5034,11 +6194,15 @@
    dispatch:
      CPU: fmod
      CUDA: legacy::cuda::_th_fmod
@@ -4829,7 +4806,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
    use_c10_dispatcher: full
-@@ -5046,11 +6200,15 @@
+@@ -5046,11 +6210,15 @@
    dispatch:
      CPU: fmod
      CUDA: legacy::cuda::_th_fmod
@@ -4845,7 +4822,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor
    use_c10_dispatcher: full
-@@ -5058,11 +6216,15 @@
+@@ -5058,11 +6226,15 @@
    dispatch:
      CPU: remainder
      CUDA: remainder
@@ -4861,7 +4838,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
    use_c10_dispatcher: full
-@@ -5070,12 +6232,18 @@
+@@ -5070,12 +6242,18 @@
    dispatch:
      CPU: remainder
      CUDA: remainder
@@ -4880,7 +4857,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: min(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -5084,13 +6252,19 @@
+@@ -5084,13 +6262,19 @@
      CPU: min
      CUDA: legacy::cuda::_th_min
      QuantizedCPU: min_quant
@@ -4900,7 +4877,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: max(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -5099,6 +6273,8 @@
+@@ -5099,6 +6283,8 @@
      CPU: max
      CUDA: legacy::cuda::_th_max
      QuantizedCPU: max_quant
@@ -4909,7 +4886,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: median(Tensor self) -> Tensor
-@@ -5107,12 +6283,16 @@
+@@ -5107,12 +6293,16 @@
    dispatch:
      CPU: median_cpu
      CUDA: median_cuda
@@ -4926,7 +4903,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
    variants: method, function
-@@ -5120,23 +6300,45 @@
+@@ -5120,23 +6310,45 @@
      CPU: legacy::cpu::_th_sort
      CUDA: legacy::cuda::_th_sort
      QuantizedCPU: sort_quant
@@ -4972,7 +4949,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
    variants: method, function
-@@ -5144,11 +6346,15 @@
+@@ -5144,11 +6356,15 @@
      CPU: topk
      CUDA: topk
      QuantizedCPU: quantized_topk_cpu
@@ -4988,7 +4965,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: any(Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -5159,11 +6365,15 @@
+@@ -5159,11 +6375,15 @@
      CUDA: any
      SparseCPU: any_sparse
      SparseCUDA: any_sparse
@@ -5004,7 +4981,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
    use_c10_dispatcher: full
-@@ -5171,6 +6381,8 @@
+@@ -5171,6 +6391,8 @@
    dispatch:
      CPU: legacy::cpu::_th_renorm
      CUDA: legacy::cuda::_th_renorm
@@ -5013,7 +4990,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
    variants: method
-@@ -5178,6 +6390,8 @@
+@@ -5178,6 +6400,8 @@
    dispatch:
      CPU: unfold
      CUDA: unfold
@@ -5022,7 +4999,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: equal(Tensor self, Tensor other) -> bool
    use_c10_dispatcher: full
-@@ -5186,6 +6400,8 @@
+@@ -5186,6 +6410,8 @@
      CPU: legacy::cpu::_th_equal
      CUDA: legacy::cuda::_th_equal
      QuantizedCPU: quantized_equal
@@ -5031,7 +5008,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
-@@ -5193,6 +6409,8 @@
+@@ -5193,6 +6419,8 @@
    dispatch:
      CPU: pow_out
      CUDA: pow_out
@@ -5040,7 +5017,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
    use_c10_dispatcher: full
-@@ -5201,12 +6419,16 @@
+@@ -5201,12 +6429,16 @@
    dispatch:
      CPU: pow
      CUDA: pow
@@ -5057,7 +5034,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
    use_c10_dispatcher: full
-@@ -5214,6 +6436,8 @@
+@@ -5214,6 +6446,8 @@
    dispatch:
      CPU: pow
      CUDA: pow
@@ -5066,7 +5043,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
    variants: method
-@@ -5221,40 +6445,58 @@
+@@ -5221,40 +6455,58 @@
      CPU: normal_cpu_
      CUDA: normal_cuda_
    supports_named_tensor: True
@@ -5125,7 +5102,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: alias(Tensor(a) self) -> Tensor(a)
    variants: method, function
-@@ -5265,16 +6507,22 @@
+@@ -5265,16 +6517,22 @@
    dispatch:
      CPU: legacy::cpu::_th_addr
      CUDA: legacy::cuda::_th_addr
@@ -5148,7 +5125,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
    dispatch:
-@@ -5286,22 +6534,30 @@
+@@ -5286,22 +6544,30 @@
    dispatch:
      CPU: _cumsum_cpu
      CUDA: legacy::cuda::_th_cumsum
@@ -5179,7 +5156,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _var(Tensor self, bool unbiased=True) -> Tensor
    use_c10_dispatcher: full
-@@ -5309,6 +6565,8 @@
+@@ -5309,6 +6575,8 @@
      CPU: legacy::cpu::_th_var
      CUDA: legacy::cuda::_th_var
    supports_named_tensor: True
@@ -5188,7 +5165,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _std(Tensor self, bool unbiased=True) -> Tensor
    use_c10_dispatcher: full
-@@ -5321,6 +6579,8 @@
+@@ -5321,6 +6589,8 @@
    variants: function
    dispatch:
      CUDA: _amp_non_finite_check_and_unscale_cuda_
@@ -5197,7 +5174,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor
    variants: function
-@@ -5332,12 +6592,16 @@
+@@ -5332,12 +6602,16 @@
      CPU: _cat_cpu
      CUDA: cat_cuda
      QuantizedCPU: quantized_cat
@@ -5214,7 +5191,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor)
    dispatch:
-@@ -5353,36 +6617,50 @@
+@@ -5353,36 +6627,50 @@
    dispatch:
      CPU: legacy::cpu::_th_max
      CUDA: legacy::cuda::_th_max
@@ -5265,7 +5242,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
    use_c10_dispatcher: full
-@@ -5390,23 +6668,33 @@
+@@ -5390,23 +6678,33 @@
    dispatch:
      CPU: mse_loss_backward
      CUDA: mse_loss_backward
@@ -5299,7 +5276,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5434,28 +6722,38 @@
+@@ -5434,22 +6732,30 @@
  
  - func: multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
@@ -5330,20 +5307,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-   dispatch:
-     CPU: multilabel_margin_loss_backward_cpu_out
-     CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out
-+  npu_dispatch:
-+    NPU: multilabel_margin_loss_backward_npu_out
- 
- - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor
-   use_c10_dispatcher: full
-@@ -5463,100 +6761,142 @@
-   dispatch:
-     CPU: multilabel_margin_loss_backward_cpu
-     CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward
-+  npu_dispatch:
-+    NPU: multilabel_margin_loss_backward_npu
+@@ -5466,97 +6772,137 @@
  
  - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
@@ -5481,7 +5445,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5564,6 +6904,8 @@
+@@ -5564,6 +6910,8 @@
      CPU: elu_out
      CUDA: elu_out
      QuantizedCPU: quantized_elu_out
@@ -5490,7 +5454,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
    use_c10_dispatcher: full
-@@ -5572,16 +6914,22 @@
+@@ -5572,16 +6920,22 @@
      CPU: elu
      CUDA: elu
      QuantizedCPU: quantized_elu
@@ -5513,7 +5477,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)
    python_module: nn
-@@ -5589,12 +6937,16 @@
+@@ -5589,12 +6943,16 @@
      CPU: elu_
      CUDA: elu_
      QuantizedCPU: quantized_elu_
@@ -5530,7 +5494,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: glu(Tensor self, int dim=-1) -> Tensor
    use_c10_dispatcher: full
-@@ -5602,12 +6954,16 @@
+@@ -5602,12 +6960,16 @@
    dispatch:
      CPU: glu
      CUDA: legacy::cuda::_thnn_glu_forward
@@ -5547,7 +5511,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor
    use_c10_dispatcher: full
-@@ -5615,20 +6971,30 @@
+@@ -5615,20 +6977,30 @@
    dispatch:
      CPU: glu_backward
      CUDA: legacy::cuda::_thnn_glu_backward
@@ -5578,7 +5542,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5636,6 +7002,8 @@
+@@ -5636,6 +7008,8 @@
      CPU: hardtanh_out
      CUDA: hardtanh_out
      QuantizedCPU: quantized_hardtanh_out
@@ -5587,7 +5551,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
    use_c10_dispatcher: full
-@@ -5644,16 +7012,22 @@
+@@ -5644,16 +7018,22 @@
      CPU: hardtanh
      CUDA: hardtanh
      QuantizedCPU: quantized_hardtanh
@@ -5610,7 +5574,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
    python_module: nn
-@@ -5661,6 +7035,8 @@
+@@ -5661,6 +7041,8 @@
      CPU: hardtanh_
      CUDA: hardtanh_
      QuantizedCPU: quantized_hardtanh_
@@ -5619,7 +5583,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5668,6 +7044,8 @@
+@@ -5668,6 +7050,8 @@
      CPU: leaky_relu_out
      CUDA: leaky_relu_out
      QuantizedCPU: quantized_leaky_relu_out
@@ -5628,7 +5592,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
    use_c10_dispatcher: full
-@@ -5676,10 +7054,14 @@
+@@ -5676,10 +7060,14 @@
      CPU: leaky_relu
      CUDA: leaky_relu
      QuantizedCPU: quantized_leaky_relu
@@ -5643,7 +5607,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
    python_module: nn
-@@ -5687,31 +7069,44 @@
+@@ -5687,31 +7075,44 @@
      CPU: leaky_relu_
      CUDA: leaky_relu_
      QuantizedCPU: quantized_leaky_relu_
@@ -5688,7 +5652,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
    use_c10_dispatcher: full
-@@ -5719,6 +7114,8 @@
+@@ -5719,6 +7120,8 @@
    dispatch:
      CPU: log_sigmoid_backward_cpu
      CUDA: legacy::cuda::_thnn_log_sigmoid_backward
@@ -5697,7 +5661,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5744,37 +7141,53 @@
+@@ -5744,37 +7147,53 @@
  
  - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
@@ -5751,7 +5715,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -5782,9 +7195,13 @@
+@@ -5782,9 +7201,13 @@
      CPU: adaptive_avg_pool2d_out_cpu
      CUDA: adaptive_avg_pool2d_out_cuda
      MkldnnCPU: mkldnn_adaptive_avg_pool2d_out
@@ -5765,7 +5729,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
    dispatch:
-@@ -5796,6 +7213,8 @@
+@@ -5796,6 +7219,8 @@
      CPU: adaptive_avg_pool2d_cpu
      CUDA: adaptive_avg_pool2d_cuda
      QuantizedCPU: quantized_adaptive_avg_pool2d
@@ -5774,7 +5738,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -5803,24 +7222,32 @@
+@@ -5803,24 +7228,32 @@
    dispatch:
      CPU: adaptive_avg_pool2d_backward_cpu
      CUDA: adaptive_avg_pool2d_backward_cuda
@@ -5807,7 +5771,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor
    use_c10_dispatcher: full
-@@ -5828,6 +7255,8 @@
+@@ -5828,6 +7261,8 @@
    dispatch:
      CPU: adaptive_avg_pool3d_backward_cpu
      CUDA: adaptive_avg_pool3d_backward_cuda
@@ -5816,7 +5780,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
-@@ -5835,6 +7264,8 @@
+@@ -5835,6 +7270,8 @@
    dispatch:
      CPU: adaptive_max_pool2d_out_cpu
      CUDA: adaptive_max_pool2d_out_cuda
@@ -5825,7 +5789,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
-@@ -5842,12 +7273,16 @@
+@@ -5842,12 +7279,16 @@
    dispatch:
      CPU: adaptive_max_pool2d_cpu
      CUDA: adaptive_max_pool2d_cuda
@@ -5842,7 +5806,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
    use_c10_dispatcher: full
-@@ -5855,6 +7290,8 @@
+@@ -5855,6 +7296,8 @@
    dispatch:
      CPU: adaptive_max_pool2d_backward_cpu
      CUDA: adaptive_max_pool2d_backward_cuda
@@ -5851,7 +5815,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
-@@ -5889,6 +7326,8 @@
+@@ -5889,6 +7332,8 @@
      CPU: avg_pool2d_out_cpu
      CUDA: avg_pool2d_out_cuda
      MkldnnCPU: mkldnn_avg_pool2d_out
@@ -5860,7 +5824,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
    python_module: nn
-@@ -5897,24 +7336,32 @@
+@@ -5897,24 +7342,32 @@
      CUDA: avg_pool2d_cuda
      MkldnnCPU: mkldnn_avg_pool2d
      QuantizedCPU: quantized_avg_pool2d
@@ -5893,7 +5857,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
    python_module: nn
-@@ -5922,18 +7369,24 @@
+@@ -5922,18 +7375,24 @@
      CPU: avg_pool3d_cpu
      CUDA: avg_pool3d_cuda
      QuantizedCPU: quantized_avg_pool3d
@@ -5918,7 +5882,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
-@@ -5993,6 +7446,8 @@
+@@ -5993,6 +7452,8 @@
    dispatch:
      CPU: max_pool2d_with_indices_out_cpu
      CUDA: max_pool2d_with_indices_out_cuda
@@ -5927,7 +5891,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
-@@ -6000,6 +7455,8 @@
+@@ -6000,6 +7461,8 @@
    dispatch:
      CPU: max_pool2d_with_indices_cpu
      CUDA: max_pool2d_with_indices_cuda
@@ -5936,7 +5900,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
-@@ -6007,12 +7464,16 @@
+@@ -6007,12 +7470,16 @@
    dispatch:
      CPU: max_pool2d_with_indices_backward_out_cpu
      CUDA: max_pool2d_with_indices_backward_out_cuda
@@ -5953,7 +5917,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
-@@ -6020,6 +7481,8 @@
+@@ -6020,6 +7487,8 @@
    dispatch:
      CPU: max_pool3d_with_indices_out_cpu
      CUDA: max_pool3d_with_indices_out_cuda
@@ -5962,7 +5926,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Return: (Tensor output, Tensor indices)
  - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
-@@ -6027,6 +7490,8 @@
+@@ -6027,6 +7496,8 @@
    dispatch:
      CPU: max_pool3d_with_indices_cpu
      CUDA: max_pool3d_with_indices_cuda
@@ -5971,7 +5935,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    supports_named_tensor: True
  
  - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
-@@ -6034,12 +7499,17 @@
+@@ -6034,12 +7505,17 @@
    dispatch:
      CPU: max_pool3d_with_indices_backward_out_cpu
      CUDA: max_pool3d_with_indices_backward_out_cuda
@@ -5989,7 +5953,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6118,12 +7588,16 @@
+@@ -6118,12 +7594,16 @@
    dispatch:
      CPU: reflection_pad2d_out_cpu
      CUDA: reflection_pad2d_out_cuda
@@ -6006,7 +5970,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-@@ -6166,12 +7640,16 @@
+@@ -6166,12 +7646,16 @@
    dispatch:
      CPU: replication_pad2d_out_cpu
      CUDA: replication_pad2d_out_cuda
@@ -6023,7 +5987,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-@@ -6214,12 +7692,16 @@
+@@ -6214,12 +7698,16 @@
    dispatch:
      CPU: upsample_linear1d_out_cpu
      CUDA: upsample_linear1d_out_cuda
@@ -6040,7 +6004,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-@@ -6232,12 +7714,16 @@
+@@ -6232,12 +7720,16 @@
    dispatch:
      CPU: upsample_linear1d_backward_cpu
      CUDA: upsample_linear1d_backward_cuda
@@ -6057,7 +6021,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
-@@ -6245,42 +7731,56 @@
+@@ -6245,96 +7737,128 @@
      CPU: upsample_bilinear2d_cpu
      CUDA: upsample_bilinear2d_cuda
      QuantizedCPU: quantized_upsample_bilinear2d_cpu
@@ -6085,19 +6049,17 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    dispatch:
      CPU: upsample_bicubic2d_out_cpu
      CUDA: upsample_bicubic2d_out_cuda
--
 +  npu_dispatch:
 +    NPU: upsample_bicubic2d_out_npu
-+  
+ 
  - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
    dispatch:
      CPU: upsample_bicubic2d_cpu
      CUDA: upsample_bicubic2d_cuda
--
 +  npu_dispatch:
 +    NPU: upsample_bicubic2d_npu
-+  
+ 
  - func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
    dispatch:
@@ -6116,7 +6078,38 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_trilinear3d.out(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6311,30 +7811,40 @@
+   dispatch:
+     CPU: upsample_trilinear3d_out_cpu
+     CUDA: upsample_trilinear3d_out_cuda
++  npu_dispatch:
++    NPU: upsample_trilinear3d_out_npu
+ 
+ - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+   python_module: nn
+   dispatch:
+     CPU: upsample_trilinear3d_cpu
+     CUDA: upsample_trilinear3d_cuda
++  npu_dispatch:
++    NPU: upsample_trilinear3d_npu
+ 
+ - func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+   python_module: nn
+   dispatch:
+     CPU: upsample_trilinear3d_backward_out_cpu
+     CUDA: upsample_trilinear3d_backward_out_cuda
++  npu_dispatch:
++    NPU: upsample_trilinear3d_backward_out_npu
+ 
+ - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+   python_module: nn
+   dispatch:
+     CPU: upsample_trilinear3d_backward_cpu
+     CUDA: upsample_trilinear3d_backward_cuda
++  npu_dispatch:
++    NPU: upsample_trilinear3d_backward_npu
+ 
+ - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+   python_module: nn
    dispatch:
      CPU: upsample_nearest1d_out_cpu
      CUDA: upsample_nearest1d_out_cuda
@@ -6157,7 +6150,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
-@@ -6342,18 +7852,24 @@
+@@ -6342,24 +7866,32 @@
      CPU: upsample_nearest2d_cpu
      CUDA: upsample_nearest2d_cuda
      QuantizedCPU: quantized_upsample_nearest2d_cpu
@@ -6182,7 +6175,39 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_nearest3d.out(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6385,20 +7901,28 @@
+   dispatch:
+     CPU: upsample_nearest3d_out_cpu
+     CUDA: upsample_nearest3d_out_cuda
++  npu_dispatch:
++    NPU: upsample_nearest3d_out_npu
+ 
+ - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+   python_module: nn
+@@ -6367,38 +7899,52 @@
+     CPU: upsample_nearest3d_cpu
+     CUDA: upsample_nearest3d_cuda
+     QuantizedCPU: quantized_upsample_nearest3d_cpu
++  npu_dispatch:
++    NPU: upsample_nearest3d_npu
+ 
+ - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+   python_module: nn
+   dispatch:
+     CPU: upsample_nearest3d_backward_out_cpu
+     CUDA: upsample_nearest3d_backward_out_cuda
++  npu_dispatch:
++    NPU: upsample_nearest3d_backward_out_npu
+ 
+ - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+   python_module: nn
+   dispatch:
+     CPU: upsample_nearest3d_backward_cpu
+     CUDA: upsample_nearest3d_backward_cuda
++  npu_dispatch:
++    NPU: upsample_nearest3d_backward_npu
+ 
+ - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+   python_module: nn
    dispatch:
      CPU: sigmoid_backward_out
      CUDA: sigmoid_backward_out
@@ -6211,7 +6236,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # What's a thnn_conv_ versus a slow_conv_?
  #
-@@ -6423,24 +7947,32 @@
+@@ -6423,24 +7969,32 @@
    dispatch:
      CPU: slow_conv_transpose2d_out_cpu
      CUDA: slow_conv_transpose2d_out_cuda
@@ -6244,7 +6269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6468,21 +8000,29 @@
+@@ -6468,21 +8022,29 @@
  
  - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
@@ -6274,7 +6299,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
    python_module: nn
-@@ -6495,32 +8035,46 @@
+@@ -6495,32 +8057,46 @@
    dispatch:
      CPU: slow_conv2d_backward_cpu
      CUDA: legacy::cuda::_thnn_conv2d_backward
@@ -6321,7 +6346,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6553,12 +8107,16 @@
+@@ -6553,12 +8129,16 @@
    dispatch:
      CPU: slow_conv_dilated2d_cpu
      CUDA: slow_conv_dilated2d_cuda
@@ -6338,7 +6363,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
    python_module: nn
-@@ -6577,57 +8135,396 @@
+@@ -6577,57 +8157,393 @@
    dispatch:
      CPU: col2im_out_cpu
      CUDA: col2im_out_cuda
@@ -6540,7 +6565,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  npu_dispatch_only:
 +    NPU: ptiou_npu
 +
-+- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) 
++- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor)
 +  variants: function
 +  npu_dispatch_only:
 +    NPU: nms_with_mask_npu
@@ -6613,7 +6638,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  variants: function, method
 +  npu_dispatch_only:
 +    NPU: indexing_npu
-+  
++
 +- func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, *, Tensor(a!) out) -> Tensor(a!)
 +  npu_dispatch_only:
 +    NPU: indexing_out_npu
@@ -6642,7 +6667,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- func: npu_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar beta1_power, Scalar beta2_power, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, bool? use_locking, bool? use_nesterov) -> (Tensor(a!), Tensor(b!), Tensor(c!))
 +  npu_dispatch_only:
 +    NPU: apply_adam_npu
-+    
++
 +- func: npu_layer_norm_eval(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05) -> Tensor
 +  npu_dispatch_only:
 +    NPU: layer_norm_eval_npu
@@ -6671,7 +6696,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  npu_dispatch_only:
 +    NPU: confusion_transpose_backward_npu
 +
-+- func: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor
++- func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
 +  variants: function, method
 +  npu_dispatch_only:
 +    NPU: bmm_v2_npu
@@ -6719,14 +6744,6 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  npu_dispatch_only:
 +    NPU: grid_assign_positive_npu
 +
-+- func: global_step_inc() -> ()
-+  variants: function
-+  use_c10_dispatcher: full
-+
-+- func: set_start_fuzz_compile_step(int step) -> ()
-+  variants: function
-+  use_c10_dispatcher: full
-+
 +- func: npu_mish_backward(Tensor grad, Tensor input) -> Tensor
 +  npu_dispatch_only:
 +    NPU: mish_backward_npu
@@ -6735,10 +6752,15 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  variants: function, method
 +  npu_dispatch_only:
 +    NPU: normalize_batch_npu
++
++- func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor
++  variants: function, method
++  npu_dispatch_only:
++    NPU: masked_fill_range_npu
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S
 --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-06-25 16:37:35.566259444 +0800
++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-07-05 14:59:26.496336915 +0800
 @@ -659,14 +659,14 @@
  
      SUB x1, x1, 4
@@ -6762,9 +6784,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  5:
      CMP x1, 2
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp	2021-06-25 16:37:35.510259016 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp	2021-07-05 14:59:26.440336488 +0800
 @@ -64,7 +64,7 @@
  
  Tensor isinf(const Tensor &self) {
@@ -6774,9 +6796,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      return at::zeros_like(self, at::kBool, at::MemoryFormat::Preserve);
    }
    return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp	2021-06-25 16:37:35.510259016 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp	2021-07-05 14:59:26.444336518 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6819,9 +6841,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    } else {
      allocator = at::getCPUAllocator();
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp	2021-06-25 16:37:35.510259016 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp	2021-07-05 14:59:26.444336518 +0800
 @@ -87,6 +87,7 @@
    if (self.is_contiguous(memory_format)) {
      return self;
@@ -6830,9 +6852,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    TORCH_CHECK(
        memory_format != MemoryFormat::Preserve,
        "preserve memory format is unsupported by the contiguous operator");
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-06-25 16:37:35.514259047 +0800
++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-07-05 14:59:26.444336518 +0800
 @@ -26,7 +26,7 @@
          const scalar_t* in = &idata[output_y * input_width + output_x];
          scalar_t* out = &odata[output_y * output_width + output_x];
@@ -6842,9 +6864,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
            out[0] = in[0];
            in += input_width * input_height;
            out += output_width * output_height;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py
 --- pytorch-v1.5.0/aten/src/ATen/native_parse.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native_parse.py	2021-06-25 16:37:35.582259566 +0800
++++ pytorch-develop/aten/src/ATen/native_parse.py	2021-07-05 14:59:26.512337037 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6880,9 +6902,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  declarations.append(declaration)
              except Exception as e:
                  msg = '''Exception raised in processing function:
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py
 --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py	2021-06-25 16:37:35.582259566 +0800
++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py	2021-07-05 14:59:26.512337037 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6912,9 +6934,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  default_backends = ['CPU', 'CUDA']
  
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h
 --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h	2021-06-25 16:37:35.582259566 +0800
++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h	2021-07-05 14:59:26.512337037 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6945,9 +6967,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    /// Returns if a `Tensor` has HIP backend.
    bool is_hip() const;
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h
 --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h	2021-06-25 16:37:35.582259566 +0800
++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h	2021-07-05 14:59:26.512337037 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6979,9 +7001,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  inline NamedTensorMeta* Tensor::get_named_tensor_meta() {
    return static_cast<NamedTensorMeta*>(impl_->named_tensor_meta());
  }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt
 --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/CMakeLists.txt	2021-06-25 16:37:35.586259596 +0800
++++ pytorch-develop/aten/src/TH/CMakeLists.txt	2021-07-05 14:59:26.516337067 +0800
 @@ -48,6 +48,11 @@
    ${CMAKE_CURRENT_SOURCE_DIR}
  PARENT_SCOPE)
@@ -6994,9 +7016,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
  
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp
 --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp	2021-06-25 16:37:35.586259596 +0800
++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp	2021-07-05 14:59:26.520337098 +0800
 @@ -1,9 +1,32 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7103,9 +7125,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    return THStorage_(data)(self)[idx];
  }
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h
 --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/generic/THStorage.h	2021-06-25 16:37:35.586259596 +0800
++++ pytorch-develop/aten/src/TH/generic/THStorage.h	2021-07-05 14:59:26.520337098 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7142,9 +7164,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  TH_API THStorage* THStorage_(newWithSize1)(scalar_t);
  TH_API THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags);
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt
 --- pytorch-v1.5.0/c10/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/CMakeLists.txt	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/CMakeLists.txt	2021-07-05 14:59:26.532337189 +0800
 @@ -63,6 +63,14 @@
    message(STATUS "don't use NUMA")
  endif()
@@ -7171,9 +7193,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  if(USE_ROCM)
    # NB: This directory is generated by the HIPIFY script; it's
    # not checked in
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h
 --- pytorch-v1.5.0/c10/core/Backend.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Backend.h	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/Backend.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7266,9 +7288,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      default:
        return "UNKNOWN_BACKEND";
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp
 --- pytorch-v1.5.0/c10/core/Device.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Device.cpp	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/Device.cpp	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7306,9 +7328,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }};
    auto device = std::find_if(
        types.begin(),
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h
 --- pytorch-v1.5.0/c10/core/Device.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Device.h	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/Device.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7341,9 +7363,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    /// Return true if the device is of CPU type.
    bool is_cpu() const noexcept {
      return type_ == DeviceType::CPU;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp
 --- pytorch-v1.5.0/c10/core/DeviceType.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DeviceType.cpp	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/DeviceType.cpp	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7381,9 +7403,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        return true;
      default:
        return false;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h
 --- pytorch-v1.5.0/c10/core/DeviceType.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DeviceType.h	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/DeviceType.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7424,9 +7446,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  constexpr DeviceType kHIP = DeviceType::HIP;
  constexpr DeviceType kMSNPU = DeviceType::MSNPU;
  constexpr DeviceType kXLA = DeviceType::XLA;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp
 --- pytorch-v1.5.0/c10/core/DispatchKey.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DispatchKey.cpp	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/DispatchKey.cpp	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7456,9 +7478,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      case DispatchKey::BackendSelect:
        return "BackendSelect";
      case DispatchKey::TESTING_ONLY_GenericModeTensorId:
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h
 --- pytorch-v1.5.0/c10/core/DispatchKey.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DispatchKey.h	2021-06-25 16:37:35.598259688 +0800
++++ pytorch-develop/c10/core/DispatchKey.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7488,9 +7510,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    PrivateUse2_TensorId,
    PrivateUse3_TensorId,
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h
 --- pytorch-v1.5.0/c10/core/Storage.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Storage.h	2021-06-25 16:37:35.602259718 +0800
++++ pytorch-develop/c10/core/Storage.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7522,9 +7544,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
   protected:
    c10::intrusive_ptr<StorageImpl> storage_impl_;
  };
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h
 --- pytorch-v1.5.0/c10/core/StorageImpl.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/StorageImpl.h	2021-06-25 16:37:35.602259718 +0800
++++ pytorch-develop/c10/core/StorageImpl.h	2021-07-05 14:59:26.532337189 +0800
 @@ -1,12 +1,39 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7579,9 +7601,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    bool received_cuda() {
      return received_cuda_;
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h
 --- pytorch-v1.5.0/c10/core/TensorImpl.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/TensorImpl.h	2021-06-25 16:37:35.602259718 +0800
++++ pytorch-develop/c10/core/TensorImpl.h	2021-07-05 14:59:26.536337219 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7649,9 +7671,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    inline void set_pyobj(PyObject* pyobj) noexcept {
      pyobj_ = pyobj;
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h
 --- pytorch-v1.5.0/c10/core/TensorOptions.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/TensorOptions.h	2021-06-25 16:37:35.602259718 +0800
++++ pytorch-develop/c10/core/TensorOptions.h	2021-07-05 14:59:26.536337219 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7690,9 +7712,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    } else {
      AT_ASSERTM(false, "Unknown DispatchKey: ", tid);
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h
 --- pytorch-v1.5.0/c10/macros/Export.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/macros/Export.h	2021-06-25 16:37:35.602259718 +0800
++++ pytorch-develop/c10/macros/Export.h	2021-07-05 14:59:26.536337219 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7726,7 +7748,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #if defined(TORCH_HIP_BUILD_MAIN_LIB)
  #define TORCH_HIP_API C10_EXPORT
  #else
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/caffe2/.clang-format pytorch-develop/caffe2/.clang-format
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/.clang-format pytorch-develop/caffe2/.clang-format
 --- pytorch-v1.5.0/caffe2/.clang-format	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/caffe2/.clang-format	1970-01-01 08:00:00.000000000 +0800
 @@ -1,87 +0,0 @@
@@ -7817,9 +7839,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -TabWidth:        8
 -UseTab:          Never
 -...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt
 --- pytorch-v1.5.0/caffe2/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/caffe2/CMakeLists.txt	2021-06-25 16:37:35.610259779 +0800
++++ pytorch-develop/caffe2/CMakeLists.txt	2021-07-05 14:59:26.544337280 +0800
 @@ -32,6 +32,7 @@
    # Add source, includes, and libs to lists
    list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
@@ -7964,9 +7986,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # ---[ Caffe2 HIP sources.
  if(USE_ROCM)
    # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs.
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format
 --- pytorch-v1.5.0/.clang-format	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/.clang-format	2021-06-25 16:37:35.478258772 +0800
++++ pytorch-develop/.clang-format	2021-07-05 14:59:26.412336274 +0800
 @@ -84,5 +84,4 @@
  SpacesInSquareBrackets: false
  Standard:        Cpp11
@@ -7975,9 +7997,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -...
 +UseTab:          Never
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake
 --- pytorch-v1.5.0/cmake/BuildVariables.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/BuildVariables.cmake	2021-06-25 16:37:35.722260634 +0800
++++ pytorch-develop/cmake/BuildVariables.cmake	2021-07-05 14:59:26.652338104 +0800
 @@ -11,6 +11,7 @@
  # CMakeLists.txt files under each folder respectively.
  set(Caffe2_CPU_SRCS)
@@ -8002,9 +8024,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # This variable contains dependency libraries of Caffe2 which requires whole
  # symbol linkage. One example is the onnx lib where we need all its schema
  # symbols. However, if the lib is whole linked in caffe2 lib, we don't want
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake
 --- pytorch-v1.5.0/cmake/Codegen.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Codegen.cmake	2021-06-25 16:37:35.722260634 +0800
++++ pytorch-develop/cmake/Codegen.cmake	2021-07-05 14:59:26.656338135 +0800
 @@ -191,13 +191,14 @@
    file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
    file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
@@ -8033,9 +8055,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
 +  add_dependencies(ATEN_NPU_FILES_GEN_LIB ATEN_NPU_FILES_GEN_TARGET)
  endif()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake
 --- pytorch-v1.5.0/cmake/Dependencies.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Dependencies.cmake	2021-06-25 16:37:35.722260634 +0800
++++ pytorch-develop/cmake/Dependencies.cmake	2021-07-05 14:59:26.656338135 +0800
 @@ -1509,6 +1509,13 @@
    ENDIF(NOT C_HAS_THREAD)
  endif()
@@ -8050,9 +8072,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #
  # End ATen checks
  #
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake
 --- pytorch-v1.5.0/cmake/Summary.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Summary.cmake	2021-06-25 16:37:35.722260634 +0800
++++ pytorch-develop/cmake/Summary.cmake	2021-07-05 14:59:26.656338135 +0800
 @@ -134,6 +134,7 @@
    if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
      message(STATUS "  SELECTED_OP_LIST    : ${SELECTED_OP_LIST}")
@@ -8061,9 +8083,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    message(STATUS "  Public Dependencies  : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
    message(STATUS "  Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
  endfunction()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in
 --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/TorchConfig.cmake.in	2021-06-25 16:37:35.722260634 +0800
++++ pytorch-develop/cmake/TorchConfig.cmake.in	2021-07-05 14:59:26.656338135 +0800
 @@ -112,6 +112,11 @@
    list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES})
  endif()
@@ -8076,9 +8098,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # When we build libtorch with the old GCC ABI, dependent libraries must too.
  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
    set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@")
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt
 --- pytorch-v1.5.0/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/CMakeLists.txt	2021-06-25 16:37:35.482258803 +0800
++++ pytorch-develop/CMakeLists.txt	2021-07-05 14:59:26.412336274 +0800
 @@ -205,6 +205,10 @@
  option(USE_TBB "Use TBB" OFF)
  option(ONNX_ML "Enable traditional ONNX ML API." ON)
@@ -8143,9 +8165,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  if (APPLE)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-private-field")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces")
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore
 --- pytorch-v1.5.0/.dockerignore	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/.dockerignore	2021-06-25 16:37:35.478258772 +0800
++++ pytorch-develop/.dockerignore	2021-07-05 14:59:26.412336274 +0800
 @@ -1,257 +1 @@
 -# READ THIS BEFORE YOU REFACTOR ME
 -#
@@ -8406,9 +8428,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -.clangd/
 +.gitignore
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat
 --- pytorch-v1.5.0/docs/make.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/docs/make.bat	2021-06-25 16:37:35.730260695 +0800
++++ pytorch-develop/docs/make.bat	2021-07-05 14:59:26.660338165 +0800
 @@ -1,36 +1,36 @@
 -@ECHO OFF
 -
@@ -8482,7 +8504,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +
 +:end
 +popd
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/ios/TestApp/.clang-format pytorch-develop/ios/TestApp/.clang-format
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/ios/TestApp/.clang-format pytorch-develop/ios/TestApp/.clang-format
 --- pytorch-v1.5.0/ios/TestApp/.clang-format	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/ios/TestApp/.clang-format	1970-01-01 08:00:00.000000000 +0800
 @@ -1,8 +0,0 @@
@@ -8495,9 +8517,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -ColumnLimit: 100
 -PointerBindsToType: false
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt
 --- pytorch-v1.5.0/requirements.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/requirements.txt	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/requirements.txt	2021-07-05 14:59:26.676338287 +0800
 @@ -4,4 +4,12 @@
  requests
  setuptools
@@ -8514,9 +8536,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +Pillow>=5.3.0
 +torchvision
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat
 --- pytorch-v1.5.0/scripts/appveyor/install.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/appveyor/install.bat	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/scripts/appveyor/install.bat	2021-07-05 14:59:26.676338287 +0800
 @@ -1,10 +1,10 @@
 -:: Installation scripts for appveyor.
 -
@@ -8538,9 +8560,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +set PATH=C:\Miniconda-x64;C:\Miniconda-x64\Scripts;%PATH%
 +:: Install numpy
 +conda install -y numpy
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat
 --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/appveyor/install_cuda.bat	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/scripts/appveyor/install_cuda.bat	2021-07-05 14:59:26.676338287 +0800
 @@ -1,22 +1,22 @@
 -@echo on
 -
@@ -8586,9 +8608,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +
 +:: Make sure that nvcc is working correctly.
 +nvcc -V || exit /b
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat
 --- pytorch-v1.5.0/scripts/build_windows.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/build_windows.bat	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/scripts/build_windows.bat	2021-07-05 14:59:26.676338287 +0800
 @@ -1,84 +1,84 @@
 -:: #############################################################################
 -:: Example command to build on Windows.
@@ -8758,9 +8780,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +cd %ORIGINAL_DIR%
 +endlocal
 +exit /b 1
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1
 --- pytorch-v1.5.0/scripts/proto.ps1	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/proto.ps1	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/scripts/proto.ps1	2021-07-05 14:59:26.676338287 +0800
 @@ -1,17 +1,17 @@
 -param(
 -  [string]$protoc,
@@ -8796,9 +8818,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +
 +$cmd = "$protoc -I${dir} --cpp_out=$out $processed"
 +Invoke-Expression $cmd
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/setup.py pytorch-develop/setup.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py
 --- pytorch-v1.5.0/setup.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/setup.py	2021-06-25 16:37:35.742260786 +0800
++++ pytorch-develop/setup.py	2021-07-05 14:59:26.676338287 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -8866,17 +8888,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  'include/caffe2/utils/*.h',
                  'include/caffe2/utils/**/*.h',
                  'include/c10/*.h',
-@@ -811,6 +838,9 @@
+@@ -811,6 +838,10 @@
                  'include/c10/cuda/impl/*.h',
                  'include/c10/hip/*.h',
                  'include/c10/hip/impl/*.h',
 +                'include/c10/npu/*.h',
++                'include/c10/npu/interface/*.h',
 +                'include/c10/npu/impl/*.h',
 +                'include/c10/npu/sys_ctrl/*.h',
                  'include/caffe2/**/*.h',
                  'include/torch/*.h',
                  'include/torch/csrc/*.h',
-@@ -862,6 +892,9 @@
+@@ -862,6 +893,9 @@
                  'include/THH/*.cuh',
                  'include/THH/*.h*',
                  'include/THH/generic/*.h',
@@ -8886,7 +8909,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  'share/cmake/ATen/*.cmake',
                  'share/cmake/Caffe2/*.cmake',
                  'share/cmake/Caffe2/public/*.cmake',
-@@ -870,6 +903,7 @@
+@@ -870,6 +904,7 @@
                  'share/cmake/Caffe2/Modules_CUDA_fix/upstream/FindCUDA/*.cmake',
                  'share/cmake/Gloo/*.cmake',
                  'share/cmake/Torch/*.cmake',
@@ -8894,9 +8917,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
              ],
              'caffe2': [
                  'python/serialized_test/data/operator_test/*.zip',
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml
 --- pytorch-v1.5.0/tools/autograd/derivatives.yaml	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/derivatives.yaml	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/derivatives.yaml	2021-07-05 14:59:27.812346954 +0800
 @@ -107,6 +107,10 @@
  #
  # NB: The parameter names here MUST be consistent with the parameter names
@@ -8993,9 +9016,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- name: npu_confusion_transpose(Tensor self, int[] perm, int[] shape, bool transpose_first) -> Tensor
 +  self: npu_confusion_transpose_backward(grad, perm, self.sizes(), !transpose_first)
 +
-+- name: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor
-+  self: grad.npu_bmmV2(mat2.transpose(-2, -1))
-+  mat2: npu_bmmV2_mat2_backward(grad, self, mat2.sizes())
++- name: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
++  self: npu_bmm_v2_mat1_backward(grad, self, mat2, self.sizes())
++  mat2: npu_bmm_v2_mat2_backward(grad, self, mat2, mat2.sizes())
 +
 +- name: npu_deformable_conv2d(Tensor input, Tensor weight, Tensor offset, Tensor? bias, int[2] kernel_size, int[] stride, int[] padding, int[] dilation=[1,1,1,1], int groups=1, int deformable_groups=1, bool modulated=True) -> (Tensor, Tensor)
 +  input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated)
@@ -9003,9 +9026,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- name: npu_mish(Tensor self) -> Tensor
 +  self: npu_mish_backward(grad, self)
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py
 --- pytorch-v1.5.0/tools/autograd/dump_utils.py	1970-01-01 08:00:00.000000000 +0800
-+++ pytorch-develop/tools/autograd/dump_utils.py	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/dump_utils.py	2021-07-05 14:59:27.812346954 +0800
 @@ -0,0 +1,112 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# All rights reserved.
@@ -9119,9 +9142,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  "pin_memory",
 +  "to_device"
 +]
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py
 --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_autograd_functions.py	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/gen_autograd_functions.py	2021-07-05 14:59:27.812346954 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9305,9 +9328,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  def uses_single_grad(func):
      return uses_ident(func, 'grad')
 +
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py
 --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_python_functions.py	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/gen_python_functions.py	2021-07-05 14:59:27.816346984 +0800
 @@ -1,3 +1,20 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9347,9 +9370,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          # and add to op arg map
          argmap['options'] = {
              'value': argname,
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py
 --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_variable_type.py	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/gen_variable_type.py	2021-07-05 14:59:27.816346984 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9520,9 +9543,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      return body
  
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/Functions.cpp	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/templates/Functions.cpp	2021-07-05 14:59:27.816346984 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2021 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9563,25 +9586,46 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    } else if (min) {
      return grad * (self >= *min).type_as(grad);
    } else if (max) {
-@@ -572,6 +592,15 @@
+@@ -572,6 +592,36 @@
    }
  }
  
++Tensor npu_bmm_v2_mat1_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, IntArrayRef sizes) {
++  // d(mat1) = grad * mat2^T; `sizes` is passed through to npu_bmmV2 as the requested output sizes.
++  // grad is viewed with a size-1 dim inserted (before the last dim if mat1 is 1-D, at the end if mat2 is 1-D).
 +
-+Tensor npu_bmmV2_mat2_backward(const Tensor & grad, const Tensor & mat1, IntArrayRef sizes) {
-+  if (sizes.size() == 2) {
-+    return mat1.reshape({-1, mat1.size(-1)}).t().mm(grad.reshape({-1, grad.size(-1)}));
-+  } else {
-+    return mat1.transpose(-2, -1).npu_bmmV2(grad);
++  std::vector<int64_t> axis_reshape(grad.sizes().begin(), grad.sizes().end());
++  if (mat1.dim() == 1) {
++    axis_reshape.insert(axis_reshape.begin() + axis_reshape.size() - 1, 1);
++  } else if (mat2.dim() == 1) {
++    axis_reshape.insert(axis_reshape.end(), 1);
++  }
++  return grad.view(axis_reshape).npu_bmmV2(mat2.dim() == 1 ? mat2.view({1, mat2.size(0)}) : mat2.transpose(-2, -1), sizes);
++}
++
++Tensor npu_bmm_v2_mat2_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, IntArrayRef sizes) {
++  // d(mat2) = mat1^T * grad; a 1-D mat1 is used as a {mat1.size(0), 1} column instead of being transposed.
++  // grad is viewed with a size-1 dim inserted (before the last dim if mat1 is 1-D, at the end if mat2 is 1-D).
++
++  std::vector<int64_t> axis_reshape(grad.sizes().begin(), grad.sizes().end());
++  if (mat1.dim() == 1) {
++    axis_reshape.insert(axis_reshape.begin() + axis_reshape.size() - 1, 1);
++  } else if (mat2.dim() == 1) {
++    axis_reshape.insert(axis_reshape.end(), 1);
++  }
++
++  if (mat1.dim() == 1) {
++    return mat1.view({mat1.size(0), 1}).npu_bmmV2(grad.view(axis_reshape), sizes);
 +  }
++  return mat1.transpose(-2, -1).npu_bmmV2(grad.view(axis_reshape), sizes);
 +}
 +
  Tensor _sparse_addmm_sparse_backward(const Tensor& grad, const Tensor& sparse_, const Tensor& dense, const Scalar& alpha) {
    AT_ASSERT(sparse_.is_sparse());
    auto sparse = sparse_.coalesce();
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp	2021-07-05 14:59:27.816346984 +0800
 @@ -22,7 +22,7 @@
  #include "torch/csrc/autograd/generated/variable_factories.h"
  #include "torch/csrc/utils/structseq.h"
@@ -9663,9 +9707,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    pybind11::gil_scoped_release no_gil;
    return torch::randint(low, high, size, options);
  }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp	2021-07-05 14:59:27.816346984 +0800
 @@ -15,7 +15,13 @@
  #include "torch/csrc/cuda/Stream.h"
  #include "torch/csrc/cuda/Event.h"
@@ -9750,9 +9794,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"data_ptr", (PyCFunction)THPVariable_data_ptr, METH_NOARGS, NULL},
    {"dim", (PyCFunction)THPVariable_dim, METH_NOARGS, NULL},
    {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL},
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/VariableType.cpp	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/templates/VariableType.cpp	2021-07-05 14:59:27.816346984 +0800
 @@ -1,7 +1,27 @@
 +// Copyright (c) 2021 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9781,9 +9825,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  // ${generated_comment}
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h
 --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/VariableType.h	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/autograd/templates/VariableType.h	2021-07-05 14:59:27.816346984 +0800
 @@ -1,3 +1,20 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9813,9 +9857,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
    at::Tensor & unpack(Tensor & t, const char * name, int pos);
    const at::Tensor & unpack(const Tensor & t, const char * name, int pos);
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl
 --- pytorch-v1.5.0/tools/build_variables.bzl	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/build_variables.bzl	2021-06-25 16:37:36.894269574 +0800
++++ pytorch-develop/tools/build_variables.bzl	2021-07-05 14:59:27.816346984 +0800
 @@ -46,6 +46,7 @@
      "torch/csrc/autograd/functions/utils.cpp",
      "torch/csrc/autograd/input_buffer.cpp",
@@ -9824,7 +9868,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      "torch/csrc/autograd/record_function.cpp",
      "torch/csrc/autograd/record_function_ops.cpp",
      "torch/csrc/autograd/saved_variable.cpp",
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/grad_mode.pyi pytorch-develop/torch/autograd/grad_mode.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/grad_mode.pyi pytorch-develop/torch/autograd/grad_mode.pyi
 --- pytorch-v1.5.0/torch/autograd/grad_mode.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/autograd/grad_mode.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,21 +0,0 @@
@@ -9849,7 +9893,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, mode: bool) -> None: ...
 -    def __enter__(self) -> None: ...
 -    def __exit__(self, *args: Any) -> bool: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/__init__.pyi pytorch-develop/torch/autograd/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/__init__.pyi pytorch-develop/torch/autograd/__init__.pyi
 --- pytorch-v1.5.0/torch/autograd/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/autograd/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,46 +0,0 @@
@@ -9899,9 +9943,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -_TensorOrTensors = Union[Tensor, Sequence[Tensor]]
 -def backward(tensors: _TensorOrTensors, grad_tensors: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=...) -> None: ...
 -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py
 --- pytorch-v1.5.0/torch/autograd/profiler.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/autograd/profiler.py	2021-06-25 16:37:36.902269635 +0800
++++ pytorch-develop/torch/autograd/profiler.py	2021-07-05 14:59:27.820347015 +0800
 @@ -1,8 +1,25 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -10372,9 +10416,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      if use_cuda:
          append("CUDA time total: {}".format(format_time(cuda_time_total)))
      return ''.join(result)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt
 --- pytorch-v1.5.0/torch/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/CMakeLists.txt	2021-06-25 16:37:36.898269605 +0800
++++ pytorch-develop/torch/CMakeLists.txt	2021-07-05 14:59:27.816346984 +0800
 @@ -97,6 +97,7 @@
      ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp
      ${TORCH_SRC_DIR}/csrc/utils.cpp
@@ -10404,9 +10448,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  if (USE_NUMPY)
      list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NUMPY)
  endif()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/engine.cpp	2021-06-25 16:37:36.910269696 +0800
++++ pytorch-develop/torch/csrc/autograd/engine.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10527,9 +10571,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      const auto default_stream = guard.getDefaultStream(leaf_stream.device());
      if (leaf_stream != default_stream) {
        auto event = c10::Event{c10::DeviceType::CUDA};
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10559,9 +10603,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        grad_inputs[1] = grad.to(
            src_options,
            /*non_blocking=*/false,
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/init.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/init.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10602,9 +10646,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        .def("shapes", &Event::shapes);
  
    m.def("_enable_profiler", enableProfiler);
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10654,9 +10698,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
  
    auto& old_var = buffer[pos];
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/profiler.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/profiler.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10850,9 +10894,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  }
  
  CUDAStubs::~CUDAStubs() = default;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h
 --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/profiler.h	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/profiler.h	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10975,9 +11019,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  TORCH_API void pushRange(std::string name);
  TORCH_API void popRange();
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp	2021-07-05 14:59:27.836347137 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11029,9 +11073,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr},
    {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr},
    {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr},
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp	2021-07-05 14:59:27.836347137 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11070,9 +11114,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    } else {
      value = valueToTensor(self_.options(), py_value, self_device);
    }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h
 --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h	2021-06-25 16:37:36.914269727 +0800
++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h	2021-07-05 14:59:27.836347137 +0800
 @@ -168,6 +168,45 @@
    return r.release();
  }
@@ -11119,9 +11163,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  inline PyObject* wrap(at::TensorList tl) {
    auto r = THPObjectPtr{PyTuple_New(tl.size())};
    if (!r) throw python_error();
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp	2021-06-25 16:37:36.910269696 +0800
++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp	2021-07-05 14:59:27.832347106 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11153,9 +11197,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  namespace {
  const Variable & checked_cast_variable(const Tensor & t, const char * name, int pos) {
    if (!t.defined()) {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp	2021-07-05 14:59:27.836347137 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11259,9 +11303,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
  
    while (!in_flight.empty()) {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp	2021-07-05 14:59:27.836347137 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11316,9 +11360,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work")
        .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted)
        .def("is_success", &::c10d::ProcessGroup::Work::isSuccess)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp	2021-07-05 14:59:27.836347137 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11441,9 +11485,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      }
    }
  }
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp
 --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/DynamicTypes.cpp	2021-06-25 16:37:36.902269635 +0800
++++ pytorch-develop/torch/csrc/DynamicTypes.cpp	2021-07-05 14:59:27.824347045 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11490,9 +11534,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    auto it = attype_to_py_storage_type.find(attype);
    if (it != attype_to_py_storage_type.end()) {
      return it->second;
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp
 --- pytorch-v1.5.0/torch/csrc/Generator.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/Generator.cpp	2021-06-25 16:37:36.902269635 +0800
++++ pytorch-develop/torch/csrc/Generator.cpp	2021-07-05 14:59:27.824347045 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11558,9 +11602,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #else 
      TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA");
  #endif 
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/serialization.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/generic/serialization.cpp	2021-07-05 14:59:27.840347168 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11658,9 +11702,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    return storage.release();
  }
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/Storage.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/generic/Storage.cpp	2021-07-05 14:59:27.840347168 +0800
 @@ -1,7 +1,25 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11737,9 +11781,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      THPObjectPtr item;
      try {
        for (Py_ssize_t i = 0; i < length; i++) {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp	2021-06-25 16:37:36.918269757 +0800
++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp	2021-07-05 14:59:27.840347168 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11785,9 +11829,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"data_ptr", (PyCFunction)THPStorage_(dataPtr), METH_NOARGS, nullptr},
    {"is_pinned", (PyCFunction)THPStorage_(isPinned), METH_NOARGS, nullptr},
    {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr},
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp
 --- pytorch-v1.5.0/torch/csrc/Module.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/Module.cpp	2021-06-25 16:37:36.902269635 +0800
++++ pytorch-develop/torch/csrc/Module.cpp	2021-07-05 14:59:27.824347045 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11929,9 +11973,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #endif
  
    auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp
 --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp	2021-07-05 14:59:27.860347320 +0800
 @@ -1,18 +1,35 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12306,9 +12350,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -}} // namespace torch::tensors
 +} // namespace tensors
 +} // namespace torch
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/init.cpp	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/init.cpp	2021-07-05 14:59:27.860347320 +0800
 @@ -1,6 +1,10 @@
  #include <ATen/core/ivalue.h>
  #include <torch/csrc/utils/init.h>
@@ -12394,9 +12438,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  }
 +}
  } // namespace torch
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h
 --- pytorch-v1.5.0/torch/csrc/utils/init.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/init.h	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/init.h	2021-07-05 14:59:27.860347320 +0800
 @@ -8,4 +8,7 @@
  void initThroughputBenchmarkBindings(PyObject* module);
  
@@ -12405,9 +12449,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  PyMethodDef* python_functions();
 +}
  } // namespace torch
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h
 --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h	2021-07-05 14:59:27.864347350 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12440,9 +12484,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
    const std::string &device_str = THPUtils_unpackString(args[i]);
    return at::Device(device_str);
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp	2021-07-05 14:59:27.864347350 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12471,9 +12515,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    registerLayoutObject((THPLayout*)strided_layout, at::Backend::MSNPU);
    registerLayoutObject((THPLayout*)strided_layout, at::Backend::XLA);
    registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU);
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp	2021-07-05 14:59:27.864347350 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12607,9 +12651,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  " or ", c10::DispatchKey::XLATensorId,
                  " but got: ", dispatch_key);
    } else if(expected_layout == c10::kSparse) {
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp	2021-06-25 16:37:36.942269941 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp	2021-07-05 14:59:27.864347350 +0800
 @@ -1,58 +1,91 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12775,7 +12819,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -}} // namespace torch::utils
 +} // namespace utils
 +} // namespace torch
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/cuda/__init__.pyi pytorch-develop/torch/cuda/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/cuda/__init__.pyi pytorch-develop/torch/cuda/__init__.pyi
 --- pytorch-v1.5.0/torch/cuda/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/cuda/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,41 +0,0 @@
@@ -12820,9 +12864,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -def reset_max_memory_cached(device: Optional[_device_t]=...) -> None: ...
 -def set_rng_state(new_state): ...
 -def get_rng_state(): ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py
 --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/distributed/distributed_c10d.py	2021-06-25 16:37:36.946269971 +0800
++++ pytorch-develop/torch/distributed/distributed_c10d.py	2021-07-05 14:59:27.864347350 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12901,9 +12945,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          else:
              raise RuntimeError("Unsupported distributed backend by group")
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py
 --- pytorch-v1.5.0/torch/distributions/von_mises.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/distributions/von_mises.py	2021-06-25 16:37:36.946269971 +0800
++++ pytorch-develop/torch/distributions/von_mises.py	2021-07-05 14:59:27.868347381 +0800
 @@ -1,140 +1,140 @@
 -from __future__ import absolute_import, division, print_function
 -
@@ -13185,9 +13229,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +        """
 +        return 1 - (_log_modified_bessel_fn(self.concentration, order=1) -
 +                    _log_modified_bessel_fn(self.concentration, order=0)).exp()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py
 --- pytorch-v1.5.0/torch/__init__.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/__init__.py	2021-06-25 16:37:36.898269605 +0800
++++ pytorch-develop/torch/__init__.py	2021-07-05 14:59:27.816346984 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13228,9 +13272,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +#register npu shutdown hook on exit
 +atexit.register(_npu_shutdown)
 \ No newline at end of file
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt
 --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt	2021-06-25 16:37:36.950270002 +0800
++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt	2021-07-05 14:59:27.868347381 +0800
 @@ -28,6 +28,10 @@
    option(USE_C10D_NCCL "USE C10D NCCL" ON)
  endif()
@@ -13281,9 +13325,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  if(USE_C10D_MPI)
    target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
    copy_header(ProcessGroupMPI.hpp)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt
 --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt	2021-06-25 16:37:36.950270002 +0800
++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt	2021-07-05 14:59:27.872347411 +0800
 @@ -37,8 +37,11 @@
  SET_TARGET_PROPERTIES(shm PROPERTIES
    PREFIX "lib"
@@ -13297,7 +13341,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  if(UNIX AND NOT APPLE)
    include(CheckLibraryExists)
    # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/common_types.pyi pytorch-develop/torch/nn/common_types.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/common_types.pyi pytorch-develop/torch/nn/common_types.pyi
 --- pytorch-v1.5.0/torch/nn/common_types.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/common_types.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,37 +0,0 @@
@@ -13338,9 +13382,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -# With the proposed 'Literal' feature to Python typing, it might be possible to
 -# eventually eliminate this.
 -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor]
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py
 --- pytorch-v1.5.0/torch/nn/functional.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/functional.py	2021-06-25 16:37:36.954270032 +0800
++++ pytorch-develop/torch/nn/functional.py	2021-07-05 14:59:27.872347411 +0800
 @@ -1611,7 +1611,7 @@
      else:
          output = input.matmul(weight.t())
@@ -13350,7 +13394,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          ret = output
      return ret
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/__init__.pyi pytorch-develop/torch/nn/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/__init__.pyi pytorch-develop/torch/nn/__init__.pyi
 --- pytorch-v1.5.0/torch/nn/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,7 +0,0 @@
@@ -13361,9 +13405,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -from . import utils as utils
 -from . import functional as functional
 -from . import parallel as parallel
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py
 --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/batchnorm.py	2021-06-25 16:37:36.954270032 +0800
++++ pytorch-develop/torch/nn/modules/batchnorm.py	2021-07-05 14:59:27.872347411 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13393,9 +13437,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          else:
              self.register_parameter('running_mean', None)
              self.register_parameter('running_var', None)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py
 --- pytorch-v1.5.0/torch/nn/modules/module.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/module.py	2021-06-25 16:37:36.954270032 +0800
++++ pytorch-develop/torch/nn/modules/module.py	2021-07-05 14:59:27.876347442 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13536,9 +13580,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          def convert(t):
              if convert_to_format is not None and t.dim() == 4:
                  return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py
 --- pytorch-v1.5.0/torch/nn/modules/normalization.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/normalization.py	2021-06-25 16:37:36.954270032 +0800
++++ pytorch-develop/torch/nn/modules/normalization.py	2021-07-05 14:59:27.876347442 +0800
 @@ -128,13 +128,14 @@
      """
      __constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
@@ -13569,9 +13613,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
      def extra_repr(self):
          return '{normalized_shape}, eps={eps}, ' \
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in
 --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/transformer.pyi.in	2021-06-25 16:37:36.954270032 +0800
++++ pytorch-develop/torch/nn/modules/transformer.pyi.in	2021-07-05 14:59:27.876347442 +0800
 @@ -1,60 +1,60 @@
 -from ..init import xavier_uniform_
 -from .activation import MultiheadAttention
@@ -13693,7 +13737,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +    activation: Any = ...
 +    def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ...
 +    def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/common_types.pyi pytorch-develop/torch/nn/parallel/common_types.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/common_types.pyi pytorch-develop/torch/nn/parallel/common_types.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/common_types.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/common_types.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -13702,7 +13746,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -_device_t = Union[int, device]
 -_devices_t = Sequence[_device_t]
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi pytorch-develop/torch/nn/parallel/data_parallel.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi pytorch-develop/torch/nn/parallel/data_parallel.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/data_parallel.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/data_parallel.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,23 +0,0 @@
@@ -13729,9 +13773,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -def data_parallel(module: Module, inputs: Any, device_ids: Optional[_devices_t] = ...,
 -                  output_device: Optional[_device_t] = ..., dim: int = ...,
 -                  module_kwargs: Optional[Any] = ...) -> Tensor: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py
 --- pytorch-v1.5.0/torch/nn/parallel/distributed.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/parallel/distributed.py	2021-06-25 16:37:36.958270063 +0800
++++ pytorch-develop/torch/nn/parallel/distributed.py	2021-07-05 14:59:27.876347442 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13795,7 +13839,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +                    assert self.is_cuda or self.is_npu, "SyncBatchNorm layers only work with CUDA or NPU modules"
                      layer._specify_ddp_gpu_num(
                          len(self.device_ids) if self.device_ids else 1)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/distributed.pyi pytorch-develop/torch/nn/parallel/distributed.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.pyi pytorch-develop/torch/nn/parallel/distributed.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/distributed.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/distributed.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,27 +0,0 @@
@@ -13826,7 +13870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def forward(self, *inputs: Any, **kwargs: Any) -> T_co: ...
 -
 -    def __call__(self, *inputs: Any, **kwargs: Any) -> T_co: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/__init__.pyi pytorch-develop/torch/nn/parallel/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/__init__.pyi pytorch-develop/torch/nn/parallel/__init__.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -13835,7 +13879,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -from .parallel_apply import parallel_apply as parallel_apply
 -from .replicate import replicate as replicate
 -from .scatter_gather import gather as gather, scatter as scatter
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi pytorch-develop/torch/nn/parallel/parallel_apply.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi pytorch-develop/torch/nn/parallel/parallel_apply.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/parallel_apply.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/parallel_apply.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,7 +0,0 @@
@@ -13846,7 +13890,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -def parallel_apply(modules: Sequence[Module], inputs: Sequence[Any], kwargs_tup: Optional[Any] = ...,
 -                   devices: Optional[_devices_t] = ...) -> List[Any]: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/replicate.pyi pytorch-develop/torch/nn/parallel/replicate.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/replicate.pyi pytorch-develop/torch/nn/parallel/replicate.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/replicate.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/replicate.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,9 +0,0 @@
@@ -13859,7 +13903,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -def replicate(network: Module[T], devices: Union[_devices_t, Sequence[_devices_t]], detach: bool = ...) -> List[
 -    Module[T]]: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi pytorch-develop/torch/nn/parallel/scatter_gather.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi pytorch-develop/torch/nn/parallel/scatter_gather.pyi
 --- pytorch-v1.5.0/torch/nn/parallel/scatter_gather.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parallel/scatter_gather.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,24 +0,0 @@
@@ -13887,7 +13931,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -
 -def gather(outputs: Any, target_device: _device_t, dim: int = ...) -> Any: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/parameter.pyi pytorch-develop/torch/nn/parameter.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parameter.pyi pytorch-develop/torch/nn/parameter.pyi
 --- pytorch-v1.5.0/torch/nn/parameter.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/parameter.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,7 +0,0 @@
@@ -13898,7 +13942,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, data: Tensor=..., requires_grad: builtins.bool=...): ...
 -
 -    ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi pytorch-develop/torch/nn/utils/clip_grad.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi pytorch-develop/torch/nn/utils/clip_grad.pyi
 --- pytorch-v1.5.0/torch/nn/utils/clip_grad.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/clip_grad.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,10 +0,0 @@
@@ -13912,7 +13956,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -
 -def clip_grad_value_(parameters: _tensor_or_tensors, clip_value: float): ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi pytorch-develop/torch/nn/utils/convert_parameters.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi pytorch-develop/torch/nn/utils/convert_parameters.pyi
 --- pytorch-v1.5.0/torch/nn/utils/convert_parameters.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/convert_parameters.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,8 +0,0 @@
@@ -13924,7 +13968,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -
 -def vector_to_parameters(vec: Tensor, parameters: Iterable[Tensor]) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/__init__.pyi pytorch-develop/torch/nn/utils/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/__init__.pyi pytorch-develop/torch/nn/utils/__init__.pyi
 --- pytorch-v1.5.0/torch/nn/utils/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -13933,7 +13977,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    vector_to_parameters as vector_to_parameters
 -from .spectral_norm import remove_spectral_norm as remove_spectral_norm, spectral_norm as spectral_norm
 -from .weight_norm import remove_weight_norm as remove_weight_norm, weight_norm as weight_norm
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/rnn.pyi pytorch-develop/torch/nn/utils/rnn.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/rnn.pyi pytorch-develop/torch/nn/utils/rnn.pyi
 --- pytorch-v1.5.0/torch/nn/utils/rnn.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/rnn.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,74 +0,0 @@
@@ -14011,7 +14055,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -def get_packed_sequence(data: Tensor, batch_sizes: Optional[Tensor], sorted_indices: Optional[Tensor],
 -                        unsorted_indices: Optional[Tensor]) -> PackedSequence: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi pytorch-develop/torch/nn/utils/spectral_norm.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi pytorch-develop/torch/nn/utils/spectral_norm.pyi
 --- pytorch-v1.5.0/torch/nn/utils/spectral_norm.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/spectral_norm.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,33 +0,0 @@
@@ -14048,7 +14092,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -
 -def remove_spectral_norm(module: T_module, name: str = ...) -> T_module: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi pytorch-develop/torch/nn/utils/weight_norm.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi pytorch-develop/torch/nn/utils/weight_norm.pyi
 --- pytorch-v1.5.0/torch/nn/utils/weight_norm.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/nn/utils/weight_norm.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,28 +0,0 @@
@@ -14080,9 +14124,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -
 -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py
 --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/onnx/symbolic_opset9.py	2021-06-25 16:37:36.958270063 +0800
++++ pytorch-develop/torch/onnx/symbolic_opset9.py	2021-07-05 14:59:27.880347472 +0800
 @@ -1621,14 +1621,23 @@
          slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals]
          return g.op('Concat', *slices, axis_i=0)
@@ -14140,7 +14184,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
              state_indices = 2 * i, 2 * i + 2
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adadelta.pyi pytorch-develop/torch/optim/adadelta.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adadelta.pyi pytorch-develop/torch/optim/adadelta.pyi
 --- pytorch-v1.5.0/torch/optim/adadelta.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/adadelta.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14149,7 +14193,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class Adadelta(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., rho: float=..., eps: float=..., weight_decay: float=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adagrad.pyi pytorch-develop/torch/optim/adagrad.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adagrad.pyi pytorch-develop/torch/optim/adagrad.pyi
 --- pytorch-v1.5.0/torch/optim/adagrad.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/adagrad.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14158,9 +14202,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class Adagrad(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=...,  eps: float=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py
 --- pytorch-v1.5.0/torch/optim/adamax.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/optim/adamax.py	2021-06-25 16:37:36.958270063 +0800
++++ pytorch-develop/torch/optim/adamax.py	2021-07-05 14:59:27.880347472 +0800
 @@ -80,8 +80,8 @@
                      exp_inf.mul_(beta2).unsqueeze(0),
                      grad.abs().add_(eps).unsqueeze_(0)
@@ -14172,7 +14216,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  bias_correction = 1 - beta1 ** state['step']
                  clr = group['lr'] / bias_correction
  
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamax.pyi pytorch-develop/torch/optim/adamax.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.pyi pytorch-develop/torch/optim/adamax.pyi
 --- pytorch-v1.5.0/torch/optim/adamax.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/adamax.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14181,7 +14225,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class Adamax(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adam.pyi pytorch-develop/torch/optim/adam.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adam.pyi pytorch-develop/torch/optim/adam.pyi
 --- pytorch-v1.5.0/torch/optim/adam.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/adam.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14190,7 +14234,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class Adam(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/adamw.pyi pytorch-develop/torch/optim/adamw.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamw.pyi pytorch-develop/torch/optim/adamw.pyi
 --- pytorch-v1.5.0/torch/optim/adamw.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/adamw.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14199,7 +14243,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class AdamW(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/asgd.pyi pytorch-develop/torch/optim/asgd.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/asgd.pyi pytorch-develop/torch/optim/asgd.pyi
 --- pytorch-v1.5.0/torch/optim/asgd.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/asgd.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14208,7 +14252,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class ASGD(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., lambd: float=..., alpha: float=..., t0: float=..., weight_decay: float=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/__init__.pyi pytorch-develop/torch/optim/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/__init__.pyi pytorch-develop/torch/optim/__init__.pyi
 --- pytorch-v1.5.0/torch/optim/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,13 +0,0 @@
@@ -14225,7 +14269,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -from .rprop import Rprop
 -from .sgd import SGD as SGD
 -from .sparse_adam import SparseAdam
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/lbfgs.pyi pytorch-develop/torch/optim/lbfgs.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/lbfgs.pyi pytorch-develop/torch/optim/lbfgs.pyi
 --- pytorch-v1.5.0/torch/optim/lbfgs.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/lbfgs.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14234,7 +14278,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class LBFGS(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., max_iter: int=..., max_eval: Optional[int]=..., tolerance_grad: float=..., tolerance_change: float=..., history_size: int=..., line_search_fn: Optional[str]=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/lr_scheduler.pyi pytorch-develop/torch/optim/lr_scheduler.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/lr_scheduler.pyi pytorch-develop/torch/optim/lr_scheduler.pyi
 --- pytorch-v1.5.0/torch/optim/lr_scheduler.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/lr_scheduler.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,39 +0,0 @@
@@ -14277,7 +14321,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -class CosineAnnealingWarmRestarts(_LRScheduler):
 -    def __init__(self, optimizer: Optimizer, T_0: int=..., T_mult: int=..., eta_min: int=..., last_epoch: int=...) -> None: ...
 -    def step(self, epoch: Optional[int] = ...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/optimizer.pyi pytorch-develop/torch/optim/optimizer.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/optimizer.pyi pytorch-develop/torch/optim/optimizer.pyi
 --- pytorch-v1.5.0/torch/optim/optimizer.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/optimizer.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,18 +0,0 @@
@@ -14299,7 +14343,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def zero_grad(self) -> None: ...
 -    def step(self, closure: Optional[Callable[[], float]]=...) -> Optional[float]: ...
 -    def add_param_group(self, param_group: dict) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/rmsprop.pyi pytorch-develop/torch/optim/rmsprop.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/rmsprop.pyi pytorch-develop/torch/optim/rmsprop.pyi
 --- pytorch-v1.5.0/torch/optim/rmsprop.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/rmsprop.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14308,7 +14352,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class RMSprop(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., alpha: float=..., eps: float=..., weight_decay: float=..., momentum: float=...,  centered: bool=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/rprop.pyi pytorch-develop/torch/optim/rprop.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/rprop.pyi pytorch-develop/torch/optim/rprop.pyi
 --- pytorch-v1.5.0/torch/optim/rprop.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/rprop.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,5 +0,0 @@
@@ -14317,7 +14361,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class Rprop(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., etas: Tuple[float, float]=..., step_sizes: Tuple[float, float]=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/sgd.pyi pytorch-develop/torch/optim/sgd.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/sgd.pyi pytorch-develop/torch/optim/sgd.pyi
 --- pytorch-v1.5.0/torch/optim/sgd.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/sgd.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,4 +0,0 @@
@@ -14325,7 +14369,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class SGD(Optimizer):
 -    def __init__(self, params: _params_t, lr: float, momentum: float=..., dampening: float=..., weight_decay:float=..., nesterov:bool=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/optim/sparse_adam.pyi pytorch-develop/torch/optim/sparse_adam.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/sparse_adam.pyi pytorch-develop/torch/optim/sparse_adam.pyi
 --- pytorch-v1.5.0/torch/optim/sparse_adam.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/optim/sparse_adam.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,6 +0,0 @@
@@ -14335,9 +14379,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 -class SparseAdam(Optimizer):
 -    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py
 --- pytorch-v1.5.0/torch/serialization.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/serialization.py	2021-06-25 16:37:36.962270093 +0800
++++ pytorch-develop/torch/serialization.py	2021-07-05 14:59:27.880347472 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14419,9 +14463,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  
  def location_tag(storage):
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py
 --- pytorch-v1.5.0/torch/storage.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/storage.py	2021-06-25 16:37:36.962270093 +0800
++++ pytorch-develop/torch/storage.py	2021-07-05 14:59:27.880347472 +0800
 @@ -7,6 +7,7 @@
  
  class _StorageBase(object):
@@ -14439,9 +14483,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          elif get_sharing_strategy() == 'file_system':
              self._share_filename_()
          else:
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py
 --- pytorch-v1.5.0/torch/tensor.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/tensor.py	2021-06-25 16:37:36.962270093 +0800
++++ pytorch-develop/torch/tensor.py	2021-07-05 14:59:27.880347472 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14501,9 +14545,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          return self
  
      def __reversed__(self):
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py
 --- pytorch-v1.5.0/torch/_tensor_str.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/_tensor_str.py	2021-06-25 16:37:36.898269605 +0800
++++ pytorch-develop/torch/_tensor_str.py	2021-07-05 14:59:27.820347015 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14555,9 +14599,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          suffixes.append('device=\'' + str(self.device) + '\'')
  
      has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool)
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py
 --- pytorch-v1.5.0/torch/utils/data/dataloader.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/data/dataloader.py	2021-06-25 16:37:36.966270124 +0800
++++ pytorch-develop/torch/utils/data/dataloader.py	2021-07-05 14:59:27.884347503 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14614,7 +14658,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                        self._pin_memory_thread_done_event))
              pin_memory_thread.daemon = True
              pin_memory_thread.start()
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataloader.pyi pytorch-develop/torch/utils/data/dataloader.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.pyi pytorch-develop/torch/utils/data/dataloader.pyi
 --- pytorch-v1.5.0/torch/utils/data/dataloader.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/data/dataloader.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,44 +0,0 @@
@@ -14662,7 +14706,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __len__(self) -> int: ...
 -    def __iter__(self) -> _BaseDataLoaderIter: ...
 -    def __next__(self) -> Any: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/dataset.pyi pytorch-develop/torch/utils/data/dataset.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataset.pyi pytorch-develop/torch/utils/data/dataset.pyi
 --- pytorch-v1.5.0/torch/utils/data/dataset.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/data/dataset.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,32 +0,0 @@
@@ -14698,7 +14742,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, dataset: Dataset[T_co], indices: Sequence[int]) -> None: ...
 -
 -def random_split(dataset: Dataset[T], lengths: Sequence[int]) -> List[Subset[T]]: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/distributed.pyi pytorch-develop/torch/utils/data/distributed.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/distributed.pyi pytorch-develop/torch/utils/data/distributed.pyi
 --- pytorch-v1.5.0/torch/utils/data/distributed.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/data/distributed.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,9 +0,0 @@
@@ -14711,7 +14755,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __iter__(self) -> Iterator[int]: ...
 -    def __len__(self) -> int: ...
 -    def set_epoch(self, epoch: int) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/__init__.pyi pytorch-develop/torch/utils/data/__init__.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/__init__.pyi pytorch-develop/torch/utils/data/__init__.pyi
 --- pytorch-v1.5.0/torch/utils/data/__init__.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/data/__init__.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,7 +0,0 @@
@@ -14722,7 +14766,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    Subset as Subset, random_split as random_split, IterableDataset as IterableDataset, \
 -    ChainDataset as ChainDataset
 -from .dataloader import DataLoader as DataLoader, get_worker_info as get_worker_info
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/sampler.pyi pytorch-develop/torch/utils/data/sampler.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/sampler.pyi pytorch-develop/torch/utils/data/sampler.pyi
 --- pytorch-v1.5.0/torch/utils/data/sampler.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/data/sampler.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,38 +0,0 @@
@@ -14764,9 +14808,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    drop_last: bool
 -
 -    def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ...
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py
 --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py	2021-06-25 16:37:36.966270124 +0800
++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py	2021-07-05 14:59:27.884347503 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14810,7 +14854,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
      # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on the
      # logic of this function.
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/hooks.pyi pytorch-develop/torch/utils/hooks.pyi
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/hooks.pyi pytorch-develop/torch/utils/hooks.pyi
 --- pytorch-v1.5.0/torch/utils/hooks.pyi	2021-04-10 18:39:32.000000000 +0800
 +++ pytorch-develop/torch/utils/hooks.pyi	1970-01-01 08:00:00.000000000 +0800
 @@ -1,11 +0,0 @@
@@ -14825,9 +14869,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __enter__(self): ...
 -    def __exit__(self, type: Any, value: Any, tb: Any) -> None: ...
 -
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py
 --- pytorch-v1.5.0/torch/utils/__init__.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/__init__.py	2021-06-25 16:37:36.966270124 +0800
++++ pytorch-develop/torch/utils/__init__.py	2021-07-05 14:59:27.884347503 +0800
 @@ -1,6 +1,7 @@
  from __future__ import absolute_import, division, print_function, unicode_literals
  
@@ -14836,9 +14880,9 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # Set the module for a given object for nicer printing
  def set_module(obj, mod):
-diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' -Nur '--exclude=README*' pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py
+diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py
 --- pytorch-v1.5.0/torch/_utils.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/_utils.py	2021-06-25 16:37:36.898269605 +0800
++++ pytorch-develop/torch/_utils.py	2021-07-05 14:59:27.820347015 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml
index 938496a8ec3b4cc849f5e6a96c48fe1e364c0d49..afdda6988a665d514a0694374e86b4b5c061430f 100644
--- a/src/aten/src/ATen/native/native_functions.yaml
+++ b/src/aten/src/ATen/native/native_functions.yaml
@@ -606,7 +606,7 @@
   dispatch:
     CPU: bernoulli_scalar_cpu_
     CUDA: bernoulli_scalar_cuda_
-  supports_named_tensor: True 
+  supports_named_tensor: True
   npu_dispatch:
     NPU: bernoulli_npu_
 
@@ -1040,7 +1040,7 @@
     CUDA: _cosh_out_cuda
   npu_dispatch:
     NPU: cosh_out_npu
-    
+
 - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
   use_c10_dispatcher: full
 
@@ -1271,7 +1271,7 @@
 - func: det(Tensor self) -> Tensor
   use_c10_dispatcher: full
   variants: function, method
-    
+
 - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
   use_c10_dispatcher: full
   variants: function, method
@@ -1381,7 +1381,7 @@
     CUDA: embedding_renorm_cuda_
   npu_dispatch:
     NPU: embedding_renorm_npu_
-  
+
 - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
   use_c10_dispatcher: full
 
@@ -1657,7 +1657,7 @@
     SparseCPU: floor_divide_sparse
     SparseCUDA: floor_divide_sparse
   supports_named_tensor: True
-  npu_dispatch: 
+  npu_dispatch:
     NPU: floor_divide_npu
 
 - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -1668,7 +1668,7 @@
     SparseCPU: floor_divide_sparse_
     SparseCUDA: floor_divide_sparse_
   supports_named_tensor: True
-  npu_dispatch: 
+  npu_dispatch:
     NPU: floor_divide_npu_
 
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -1678,13 +1678,13 @@
     SparseCPU: floor_divide_out_sparse_zerodim
     SparseCUDA: floor_divide_out_sparse_zerodim
   supports_named_tensor: True
-  npu_dispatch: 
+  npu_dispatch:
     NPU: floor_divide_out_npu
 
 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
   variants: function, method
   supports_named_tensor: True
-  npu_dispatch: 
+  npu_dispatch:
     NPU: floor_divide_npu
 
 - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -1793,15 +1793,15 @@
 - func: hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   npu_dispatch:
     NPU: hamming_window_npu
-     
+
 - func: hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   npu_dispatch:
     NPU: hamming_window_npu
-     
+
 - func: hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   npu_dispatch:
     NPU: hamming_window_npu
-    
+
 - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
   use_c10_dispatcher: full
 
@@ -1995,7 +1995,7 @@
   dispatch:
     CPU: kthvalue_out_cpu
     CUDA: kthvalue_out_cuda
-  npu_dispatch: 
+  npu_dispatch:
     NPU: kthvalue_out_npu
 
 - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -2006,7 +2006,7 @@
 
 - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
   supports_named_tensor: True
-  npu_dispatch: 
+  npu_dispatch:
     NPU: kthvalue_out_npu
 
 - func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
@@ -2090,16 +2090,22 @@
   use_c10_dispatcher: full
   supports_named_tensor: True
   variants: function, method
+  npu_dispatch:
+    NPU: log10_npu
 
 - func: log10_(Tensor(a!) self) -> Tensor(a!)
   supports_named_tensor: True
   variants: function, method
+  npu_dispatch:
+    NPU: log10_npu_
 
 - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   supports_named_tensor: True
   dispatch:
     CPU: log10_out
     CUDA: log10_out
+  npu_dispatch:
+    NPU: log10_out_npu
 
 - func: log1p(Tensor self) -> Tensor
   use_c10_dispatcher: full
@@ -2730,14 +2736,14 @@
   use_c10_dispatcher: full
   npu_dispatch:
     NPU: _pdist_forward_npu
-    
+
 - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
   use_c10_dispatcher: full
 
 - func: cosine_similarity(Tensor input, Tensor input2, int dim=1, float eps=1e-08) -> Tensor
   use_c10_dispatcher: full
   variants: function
-  
+
 - func: permute(Tensor(a) self, int[] dims) -> Tensor(a)
   variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too.
 
@@ -2753,7 +2759,7 @@
 
 - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
   use_c10_dispatcher: full
-    
+
 - func: is_pinned(Tensor self) -> bool
   use_c10_dispatcher: full
   variants: method
@@ -2770,7 +2776,7 @@
 - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
   use_c10_dispatcher: full
   variants: function
- 
+
 - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
 
 - func: rand.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -3011,7 +3017,7 @@
     CUDA: gelu_cuda
   npu_dispatch:
      NPU: gelu_npu
-     
+
 - func: gelu_backward(Tensor grad, Tensor self) -> Tensor
   use_c10_dispatcher: full
   python_module: nn
@@ -3075,8 +3081,12 @@
 
 - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
   use_c10_dispatcher: full
+  npu_dispatch:
+    NPU: celu_npu
 
 - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
+  npu_dispatch:
+    NPU: celu_npu_
 
 - func: sigmoid(Tensor self) -> Tensor
   use_c10_dispatcher: full
@@ -3423,14 +3433,14 @@
   npu_dispatch:
     NPU: prod_out_npu
     #NPU: prod_out_npu_ext
-    
+
 - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
   variants: function, method
   supports_named_tensor: True
   npu_dispatch:
     NPU: prod_npu
     #NPU: prod_npu_ext
-    
+
 - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   supports_named_tensor: True
   npu_dispatch:
@@ -3799,7 +3809,7 @@
 
 - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
   variants: function
-  
+
 # VariableType::_weight_norm does not want to be given a gap in the autograd graph,
 # so we don't define "dispatch" variants for it.
 - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
@@ -5005,12 +5015,12 @@
   variants: method, function
   npu_dispatch:
     NPU: bitwise_and_npu
-  
+
 - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   variants: method
   npu_dispatch:
     NPU: bitwise_and_npu_
-  
+
 - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   variants: method
   npu_dispatch:
@@ -5315,14 +5325,14 @@
     CPU: legacy::cpu::_th_addbmm_
     CUDA: legacy::cuda::_th_addbmm_
   npu_dispatch:
-    NPU: addbmm_npu_  
+    NPU: addbmm_npu_
 
 - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: legacy::cpu::_th_addbmm_out
     CUDA: legacy::cuda::_th_addbmm_out
   npu_dispatch:
-    NPU: addbmm_out_npu  
+    NPU: addbmm_out_npu
 
 - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   use_c10_dispatcher: full
@@ -5331,7 +5341,7 @@
     CPU: legacy::cpu::_th_addbmm
     CUDA: legacy::cuda::_th_addbmm
   npu_dispatch:
-    NPU: addbmm_npu  
+    NPU: addbmm_npu
 
 - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
   variants: method
@@ -5794,7 +5804,7 @@
 - func: gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
   npu_dispatch:
     NPU: gather_out_npu
-    
+
 - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
   variants: method, function
   npu_dispatch:
@@ -6752,8 +6762,6 @@
   dispatch:
     CPU: multilabel_margin_loss_backward_cpu_out
     CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out
-  npu_dispatch:
-    NPU: multilabel_margin_loss_backward_npu_out
 
 - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor
   use_c10_dispatcher: full
@@ -6761,8 +6769,6 @@
   dispatch:
     CPU: multilabel_margin_loss_backward_cpu
     CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward
-  npu_dispatch:
-    NPU: multilabel_margin_loss_backward_npu
 
 - func: nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -7757,7 +7763,7 @@
     CUDA: upsample_bicubic2d_out_cuda
   npu_dispatch:
     NPU: upsample_bicubic2d_out_npu
-  
+
 - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
   python_module: nn
   dispatch:
@@ -7765,7 +7771,7 @@
     CUDA: upsample_bicubic2d_cuda
   npu_dispatch:
     NPU: upsample_bicubic2d_npu
-  
+
 - func: upsample_bicubic2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
@@ -7787,24 +7793,32 @@
   dispatch:
     CPU: upsample_trilinear3d_out_cpu
     CUDA: upsample_trilinear3d_out_cuda
+  npu_dispatch:
+    NPU: upsample_trilinear3d_out_npu
 
 - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
   python_module: nn
   dispatch:
     CPU: upsample_trilinear3d_cpu
     CUDA: upsample_trilinear3d_cuda
+  npu_dispatch:
+    NPU: upsample_trilinear3d_npu
 
 - func: upsample_trilinear3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: upsample_trilinear3d_backward_out_cpu
     CUDA: upsample_trilinear3d_backward_out_cuda
+  npu_dispatch:
+    NPU: upsample_trilinear3d_backward_out_npu
 
 - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
   python_module: nn
   dispatch:
     CPU: upsample_trilinear3d_backward_cpu
     CUDA: upsample_trilinear3d_backward_cuda
+  npu_dispatch:
+    NPU: upsample_trilinear3d_backward_npu
 
 - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -7876,6 +7890,8 @@
   dispatch:
     CPU: upsample_nearest3d_out_cpu
     CUDA: upsample_nearest3d_out_cuda
+  npu_dispatch:
+    NPU: upsample_nearest3d_out_npu
 
 - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
   python_module: nn
@@ -7883,18 +7899,24 @@
     CPU: upsample_nearest3d_cpu
     CUDA: upsample_nearest3d_cuda
     QuantizedCPU: quantized_upsample_nearest3d_cpu
+  npu_dispatch:
+    NPU: upsample_nearest3d_npu
 
 - func: upsample_nearest3d_backward.grad_input(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: upsample_nearest3d_backward_out_cpu
     CUDA: upsample_nearest3d_backward_out_cuda
+  npu_dispatch:
+    NPU: upsample_nearest3d_backward_out_npu
 
 - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
   python_module: nn
   dispatch:
     CPU: upsample_nearest3d_backward_cpu
     CUDA: upsample_nearest3d_backward_cuda
+  npu_dispatch:
+    NPU: upsample_nearest3d_backward_npu
 
 - func: sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -8333,7 +8355,7 @@
   npu_dispatch_only:
     NPU: ptiou_npu
 
-- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor) 
+- func: npu_nms_with_mask(Tensor input, Scalar iou_threshold) -> (Tensor, Tensor, Tensor)
   variants: function
   npu_dispatch_only:
     NPU: nms_with_mask_npu
@@ -8406,7 +8428,7 @@
   variants: function, method
   npu_dispatch_only:
     NPU: indexing_npu
-  
+
 - func: npu_indexing.out(Tensor self, int[] begin, int[] end, int[] strides, *, Tensor(a!) out) -> Tensor(a!)
   npu_dispatch_only:
     NPU: indexing_out_npu
@@ -8435,7 +8457,7 @@
 - func: npu_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar beta1_power, Scalar beta2_power, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, bool? use_locking, bool? use_nesterov) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   npu_dispatch_only:
     NPU: apply_adam_npu
-    
+
 - func: npu_layer_norm_eval(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05) -> Tensor
   npu_dispatch_only:
     NPU: layer_norm_eval_npu
@@ -8464,7 +8486,7 @@
   npu_dispatch_only:
     NPU: confusion_transpose_backward_npu
 
-- func: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor
+- func: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
   variants: function, method
   npu_dispatch_only:
     NPU: bmm_v2_npu
@@ -8512,14 +8534,6 @@
   npu_dispatch_only:
     NPU: grid_assign_positive_npu
 
-- func: global_step_inc() -> ()
-  variants: function
-  use_c10_dispatcher: full
-
-- func: set_start_fuzz_compile_step(int step) -> ()
-  variants: function
-  use_c10_dispatcher: full
-
 - func: npu_mish_backward(Tensor grad, Tensor input) -> Tensor
   npu_dispatch_only:
     NPU: mish_backward_npu
@@ -8527,4 +8541,9 @@
 - func: npu_normalize_batch(Tensor self, Tensor seq_len, int normalize_type=0) -> Tensor
   variants: function, method
   npu_dispatch_only:
-    NPU: normalize_batch_npu
\ No newline at end of file
+    NPU: normalize_batch_npu
+
+- func: npu_masked_fill_range(Tensor self, Tensor start, Tensor end, Tensor value, int axis=-1) -> Tensor
+  variants: function, method
+  npu_dispatch_only:
+    NPU: masked_fill_range_npu
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp b/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp
index 2412d3ca3938353872e4529eae885b4e863af506..85c87f911eea4ff78dabb299ca184b27967a59f7 100644
--- a/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AcosKernelNpu.cpp
@@ -32,13 +32,7 @@ Tensor& acos_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor acos_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   acos_out_npu(result, self);
 
@@ -46,9 +40,7 @@ Tensor acos_npu(const Tensor& self) {
 }
 
 Tensor& acos_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp
index 6c3e7f6abde039ef94e995bb6b3a46159ec7b0ff..921605b4810315bc9af6a73e391c35e7e849d2e6 100644
--- a/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddbmmKernelNpu.cpp
@@ -12,10 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -45,8 +42,7 @@ Tensor addbmm_npu(
   // calculate the output size
   auto outputSize = addbmm_npu_output_size(self, batch1, batch2, beta, alpha);
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   // calculate the output result of the NPU
   addbmm_out_npu(result, self, batch1, batch2, beta, alpha);
   return result;
@@ -58,9 +54,7 @@ Tensor& addbmm_npu_(
     const Tensor& batch2,
     Scalar beta,
     Scalar alpha) {
-  SmallVector<Tensor, N> inputs = {self, batch1, batch2};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, batch1, batch2}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = addbmm_out_npu(contiguousSelf, contiguousSelf, batch1, batch2, beta, alpha);
diff --git a/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp
index dd2a78db2f3793ab41ba52a3a760c4b0e2142126..3f8eec9f4bc1fcb5110b3f900ad4f1fc42ce6b21 100644
--- a/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddcdivKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -46,9 +45,7 @@ Tensor addcdiv_npu(
     Scalar value) {
   auto divOutputSize = broadcast_ops_npu_output_size(tensor1, tensor2);
   auto outputSize = broadcast_ops_npu_output_size(self.sizes(), divOutputSize);
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   addcdiv_out_npu(result, self, tensor1, tensor2, value);
 
   return result;
@@ -59,9 +56,7 @@ Tensor& addcdiv_npu_(
     const Tensor& tensor1,
     const Tensor& tensor2,
     Scalar value) {
-  SmallVector<Tensor, N> inputs = {self, tensor1, tensor2};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, tensor1, tensor2}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = addcdiv_out_npu(contiguousSelf, contiguousSelf, tensor1, tensor2, value);
diff --git a/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp
index 802e3514a89827213635ffa433433b9349b057a9..14aacf1fa83747688a17467e50e5eaf4f022e0f4 100644
--- a/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddcmulKernelNpu.cpp
@@ -50,8 +50,7 @@ Tensor& addcmul_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
+      self,
       outputSize);
 
   OpPipeWithDefinedOut pipe;
@@ -81,10 +80,7 @@ Tensor& addcmul_npu_(
     const Tensor& tensor1,
     const Tensor& tensor2,
     Scalar value) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = addcmul_out_npu_nocheck(
diff --git a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
index f10b829d6d5e143ce65efc3d82510069d90b4b65..3e11e9de5ac5307c7a8fc7ceaec6b0dfebeb132a 100644
--- a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
 
 namespace at {
@@ -64,14 +63,8 @@ Tensor addmv_npu(
     Scalar alpha) {
     
   check_1d(vec, "vec", "addmv");
-  // calculate the output size
   auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   addmv_out_npu(result, self, mat, vec, beta, alpha);
 
   return result;
@@ -85,9 +78,7 @@ Tensor& addmv_npu_(
     Scalar alpha) {
     
   check_1d(vec, "vec", "addmv");
-  SmallVector<Tensor, N> inputs = {self, mat, vec};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, mat, vec}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result =
diff --git a/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp
index 7b1fc414c09cdbaba2cf5669c62ae41bbe671dd7..864462afffdc8881e3bfbb29d643d07a29c3fe4d 100644
--- a/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddrKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
 
 namespace at {
@@ -64,15 +63,8 @@ Tensor _addr_npu(
     const Tensor& vec2,
     Scalar beta,
     Scalar alpha) {
-
-  // calculate the output size
   auto outputSize = addr_npu_output_size(self, vec1, vec2, beta, alpha);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   _addr_out_npu(result, self, vec1, vec2, beta, alpha);
 
   return result;
@@ -95,9 +87,7 @@ Tensor& _addr_npu_(
     const Tensor& vec2,
     Scalar beta,
     Scalar alpha) {
-  SmallVector<Tensor, N> inputs = {self, vec1, vec2};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, vec1, vec2}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result =
diff --git a/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp
index 5ca9cc98ab5568eb8556a3b1dc12496eeff12b46..54f08f5a109dc6fa318c5269d2dfb6414be2aaec 100644
--- a/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AffineGridGeneratorBackwardKernelNpu.cpp
@@ -35,8 +35,7 @@ Tensor& affine_grid_generator_backward_nocheck(
     const Tensor& grad,   
     IntArrayRef size,
     bool align_corners) {
-  Tensor assist = at::empty_with_format(    
-      {size[0], size[2], size[3], 3}, grad.options(), CalcuOpUtil::get_tensor_npu_format(grad));
+  Tensor assist = OpPreparation::ApplyTensor(grad, {size[0], size[2], size[3], 3});
   assist.select(-1, 0).copy_(_linspace_from_neg_one(grad, size[3], align_corners));
   assist.select(-1, 1).copy_(_linspace_from_neg_one(grad, size[2], align_corners).unsqueeze_(-1));
   assist.select(-1, 2).fill_(1);
diff --git a/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp b/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp
index 0b7dcc190988d89ebce7b06d2c94a28a2d18b01e..0393104b09e8b5e64af70ee0e1a402516c856206 100644
--- a/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AffineGridGeneratorKernelNpu.cpp
@@ -17,7 +17,7 @@
 #include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -62,9 +62,7 @@ Tensor affine_grid_generator_npu(
   }
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, theta.options(), CalcuOpUtil::get_tensor_npu_format(theta));
-
+  Tensor result = OpPreparation::ApplyTensor(theta, outputSize);
   // calculate the output result of the NPU
   affine_grid_generator_npu_nocheck(
       result, 
diff --git a/src/aten/src/ATen/native/npu/AllKernelNpu.cpp b/src/aten/src/ATen/native/npu/AllKernelNpu.cpp
index 2d9629558f8f857c4cbc1cf95c5442a75645cc0f..6723ce3709c2c9c6e80d536a2097d1e20011eb56 100644
--- a/src/aten/src/ATen/native/npu/AllKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AllKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp b/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp
index 16a293a53a7790a441d18237900148ebffea0b64..e6dcf1164a9a984daa636435b7b78b9da8137c6f 100644
--- a/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AnyKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp
index df545f701c873af7eb1c59067ec46ea7b5260a90..d6e4d3525ba33276067632519d252d83dbd6703d 100644
--- a/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ArangeKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp
index a762c3b70dd88ffe89bedd97b90a1ed4edc11ec8..d62e8b7a5444a38e46b049135e8b26f032fcd68c 100644
--- a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp
@@ -32,10 +32,10 @@ Tensor argmin_npu(const Tensor& self, optional<int64_t> dim, bool keepdim) {
   auto outputSize = reduce_ops_npu_output_size(input, realDim, realKeepDim);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
+  Tensor result = OpPreparation::ApplyTensor(
       outputSize,
       self.options().dtype(at::kInt),
-      CalcuOpUtil::get_tensor_npu_format(self));
+      self);
   SmallVector<int64_t, N> DimVec = {realDim};
   // calculate the output result of the NPU
   OpCommand cmd;
diff --git a/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp
index c072a639a6b2b1992451fc1d76289b49c929ca5d..5f2478cbfd1203f99a2899f4810e74fa826b9e84 100644
--- a/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ArgsortKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include<ATen/NamedTensorUtils.h>
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp b/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp
index 2561ca91eb6bd52eae3ff953ccfe0f4189a5fccf..c580971eb8b7d7b35c99a999ce094b4b1eb4587f 100644
--- a/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AsinKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,26 +31,13 @@ Tensor& asin_out_npu(
 }
 
 Tensor asin_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      self.options(),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   asin_out_npu(result, self);
-
   return result;
 }
 
 Tensor& asin_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = asin_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp b/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp
index add7140b0f577ad4e7aa8ffcb8e891ef9e78066c..a683f5660c0a8d9d58d82dd4f84bf3105acc960c 100644
--- a/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Atan2KernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -44,8 +43,7 @@ Tensor& atan2_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
+      self,
       outputSize);
 
   atan2_out_npu_nocheck(result, self, other);
@@ -54,25 +52,14 @@ Tensor& atan2_out_npu(
 }
 
 Tensor atan2_npu(const Tensor& self, const Tensor& other) {
-  // calculate the output size
   auto outputSize = broadcast_ops_npu_output_size(self, other);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      self.options(),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   atan2_out_npu_nocheck(result, self, other);
-
   return result;
 }
 
 Tensor& atan2_npu_(Tensor& self, const Tensor& other) {
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, other}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
index fdd9e858949a59ea06e554dd100601b681fa8a44..a8e38f109b1a515823350f9b1e2d9e3c9124c466 100644
--- a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
@@ -31,23 +31,14 @@ Tensor& atan_out_npu(Tensor& result, const Tensor& self) {
 }
  
 Tensor atan_npu(const Tensor& self) { 
-  //calculate the output size 
-  auto outputSize = input_same_output_size(self);
- 
-  //construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
- 
+  Tensor result = OpPreparation::ApplyTensor(self);
   //calculate the output result of the NPU 
   atan_out_npu(result, self);  
   return result; 
 } 
  
 Tensor& atan_npu_(Tensor& self) { 
-  SmallVector<Tensor, N> inputs = {self}; 
-  SmallVector<Tensor, N> outputs = {self}; 
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
- 
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) { 
     Tensor contiguousSelf = NpuUtils::format_contiguous(self); 
     Tensor result = atan_out_npu(contiguousSelf, contiguousSelf); 
diff --git a/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp b/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp
index 33da1cf9d5f5d09fad5767d7d6bcd5a97db7492a..6128e55d7355b068547e69d271e9c56e28ae8f79 100644
--- a/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BatchNMSKernelNpu.cpp
@@ -28,25 +28,24 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_nms_npu(
     bool change_coordinate_frame,
     bool transpose_box) {
   // construct the output tensor of the NPU
-  Tensor nmsed_boxes = at::empty_with_format(
+  Tensor nmsed_boxes = OpPreparation::ApplyTensor(
       {self.size(0), max_total_size, 4},
       self.options().dtype(at::kHalf),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  Tensor nmsed_scores = at::empty_with_format(
+      self);
+  Tensor nmsed_scores = OpPreparation::ApplyTensor(
       {self.size(0), max_total_size},
       self.options().dtype(at::kHalf),
-      CalcuOpUtil::get_tensor_npu_format(self));
+      self);
 
-  Tensor nmsed_classes = at::empty_with_format(
+  Tensor nmsed_classes = OpPreparation::ApplyTensor(
       {self.size(0), max_total_size},
       self.options().dtype(at::kHalf),
-      CalcuOpUtil::get_tensor_npu_format(self));
+      self);
 
-  Tensor nmsed_num = at::empty_with_format(
+  Tensor nmsed_num = OpPreparation::ApplyTensor(
       {self.size(0)},
       self.options().dtype(at::kInt),
-      CalcuOpUtil::get_tensor_npu_format(self));
+      self);
 
   OpCommand cmd;
   cmd.Name("BatchMultiClassNonMaxSuppression")
diff --git a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
index d1110053a4d5bf8a49420e70a886a8444db5ad4c..a01096cd57982596ce6166413db67531d2fc6258 100644
--- a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -44,10 +43,7 @@ Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, const Tensor& p) {
 }
 
 Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
   ScalarType selfType = self.scalar_type();
   Tensor selfFp32 = self;
   if (self.scalar_type() == ScalarType::Half) {
@@ -70,10 +66,7 @@ Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) {
 }
 
 Tensor& bernoulli_npu_(Tensor& self, const Tensor& p, Generator* gen) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
   ScalarType selfType = self.scalar_type();
   Tensor selfFp32 = self;
   Tensor pFp32 = OpPreparation::CastBackToOriFormat(p);;
diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp
index 30da73577e8f7e86ba0b8398ac7df06324c2a58c..692f92cea603aeb931f6cc6b19c668d8404f1e89 100644
--- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -62,13 +61,7 @@ Tensor binary_cross_entropy_backward_npu(
     const Tensor& target,
     const Tensor& weight,
     int64_t reduction) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor gradInput = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor gradInput = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   binary_cross_entropy_backward_out_npu(
       gradInput, grad_output, self, target, weight, reduction);
diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp
index e43b43d80c12dd7b638c31cad4a22c6d7b94b086..722dc7e2e8c874ec311083ad2b945909c3787f37 100644
--- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -68,10 +67,7 @@ Tensor binary_cross_entropy_npu(
   }
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, 
-      self.options(), 
-      CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
 
   // calculate the output result of the NPU
   binary_cross_entropy_out_npu(result, self, target, weight, reduction);
diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp
index 1cd7fc39e695d259d16f5f3736132dd5e5ce6448..beb6f213426f61ff660f1480e5ac29f614313008 100644
--- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -28,12 +27,7 @@ Tensor binary_cross_entropy_with_logits_backward_npu(
     const Tensor& weight,
     const Tensor& pos_weight,
     int64_t reduction) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor gradInput = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor gradInput = OpPreparation::ApplyTensor(self);
 
   // calculate the output result of the NPU
   Tensor weightTensor;
diff --git a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp
index 1dac3030175156b71709b8e03b85145435ec8c52..64f74aa6cdd20f2b00a1dc1161956436e3f8f7e6 100644
--- a/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BinaryCrossEntropyWithLogitsKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp
index b9aabebd61eec8fa64079e5a183a3f451f5521a8..13d019ec4f1ca0ea29600aff1ecbf6f1e47436db 100644
--- a/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BitwiseAndKernelNpu.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp
index 2ae899ec6971cc6d291c7b2c96c611049dc6eead..e8195e07734cb1345f53eb275fdd4a299d0a6ffc 100644
--- a/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BitwiseNotKernelNpu.cpp
@@ -38,9 +38,7 @@ Tensor& bitwise_not_out_npu(Tensor& result, const Tensor& self) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
diff --git a/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp
index 01770ace14074f8889eb3db78f66e1bf96ce2223..5110d95cf648286c126961255a9d28cd8d491cec 100644
--- a/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BitwiseOrKernelNpu.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -44,9 +44,7 @@ Tensor& bitwise_or_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   bitwise_or_out_npu_nocheck(result, self, other);
 
@@ -120,11 +118,7 @@ Tensor bitwise_or_npu(const Tensor& self, const Tensor& other) {
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      outputTensor.options(),
-      CalcuOpUtil::get_tensor_npu_format(outputTensor));
-
+  Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
   // calculate the output result of the NPU
   bitwise_or_out_npu_nocheck(result, self, other);
 
@@ -132,12 +126,7 @@ Tensor bitwise_or_npu(const Tensor& self, const Tensor& other) {
 }
 
 Tensor bitwise_or_npu(const Tensor& self, Scalar other) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self);
 
   // calculate the output result of the NPU
   bitwise_or_out_npu_nocheck(result, self, other);
@@ -146,9 +135,7 @@ Tensor bitwise_or_npu(const Tensor& self, Scalar other) {
 }
 
 Tensor& bitwise_or_npu_(Tensor& self, const Tensor& other) {
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, other}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
index 915fd2edb31781e46c2b24d3a25981f241ec7fb5..818a660681db5b3a9053bb8a820d9977077342cb 100644
--- a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -44,9 +44,7 @@ Tensor& bitwise_xor_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   bitwise_xor_out_npu_nocheck(result, self, other);
 
@@ -122,11 +120,7 @@ Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) {
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      outputTensor.options(),
-      CalcuOpUtil::get_tensor_npu_format(outputTensor));
-
+  Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
   // calculate the output result of the NPU
   bitwise_xor_out_npu_nocheck(result, self, other);
 
@@ -134,15 +128,7 @@ Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) {
 }
 
 Tensor bitwise_xor_npu(const Tensor& self, Scalar other) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, 
-      self.options(), 
-      CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   bitwise_xor_out_npu_nocheck(result, self, other);
 
@@ -150,10 +136,7 @@ Tensor bitwise_xor_npu(const Tensor& self, Scalar other) {
 }
 
 Tensor& bitwise_xor_npu_(Tensor& self, const Tensor& other) {
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self, other}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
diff --git a/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp b/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp
index 4001114d1c2b0eb92951271056c086dbc11548ae..77934e914f229e605ae8d8ece1afa43621c867b1 100644
--- a/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BmmKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp
index 72a18106bb8794fbee53dde478a61725891ababf..fefd21a0666f050799a5f6ccf63552b9f7c70644 100644
--- a/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BmmV2KernelNpu.cpp
@@ -14,75 +14,281 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
 using namespace at::native::npu;
 
+bool is_transpose_last_two_dims_v2(const Tensor& tensor) {  // true only when every stride/size check below holds
+  if (tensor.dim() < 2) {  // fewer than two dims: there is no pair of axes to compare
+    return false;
+  }
+  auto storage_size = tensor.storage().get_npu_desc().storage_sizes_;
+  int64_t numel = at::prod_intlist(storage_size);  // element count implied by the NPU storage sizes
+
+  int64_t dim1 = tensor.dim() - 1;  // last axis
+  int64_t dim2 = tensor.dim() - 2;  // second-to-last axis
+
+  auto tensor_desc = tensor.storage().get_npu_desc();
+  if (tensor_desc.base_sizes_.size() == tensor.dim() &&  // view has the same rank as base_sizes_
+      tensor.stride(dim2) == 1 && tensor.stride(dim1) == tensor.size(dim2) &&  // second-to-last axis is the unit-stride one
+      tensor.size(dim1) == tensor_desc.base_sizes_[dim2] &&  // last two sizes are swapped ...
+      tensor.size(dim2) == tensor_desc.base_sizes_[dim1] &&  // ... relative to base_sizes_
+      tensor.storage().size() == numel) {  // storage size matches the product of storage_sizes_
+    return true;
+  } else {
+    return false;
+  }
+}
+
 SmallVector<int64_t, SIZE> bmm_v2_output_size(const Tensor& mat1, const Tensor& mat2) {
   auto dim_tensor1 = mat1.dim();
   auto dim_tensor2 = mat2.dim();
-  TORCH_CHECK(dim_tensor1 > 2, "mat1's dim must be greater than 2");
-  TORCH_CHECK(dim_tensor2 >= 2, "mat2's dim must be greater than or equal to 2");
-  if (dim_tensor2 == 2) {
-    auto output_size(array_to_small_vector(mat1.sizes().slice(0, dim_tensor1-1)));
-    output_size.emplace_back(mat2.size(-1));
-    return output_size;
-  } else {
-    TORCH_CHECK(dim_tensor1 == dim_tensor2, "if mat2's dim > 2, mat1's and mat2's batch size must be same");
-    IntArrayRef batch_tensor1(mat1.sizes().data(), std::max<int64_t>(dim_tensor1 - 2, 0));
-    SmallVector<int64_t, SIZE> output_size = array_to_small_vector(batch_tensor1);
-    output_size.emplace_back(mat1.size(-2));
-    output_size.emplace_back(mat2.size(-1));
-    return output_size;
+  // Result shape: the broadcast of both operands' batch dims, followed by [m, n].
+  int64_t m = dim_tensor1 == 1 ? 1 : mat1.size(-2);  // a 1-D mat1 contributes m = 1
+  int64_t n = dim_tensor2 == 1 ? 1 : mat2.size(-1);  // a 1-D mat2 contributes n = 1
+
+  auto batch_a = array_to_small_vector(IntArrayRef(mat1.sizes().data(), std::max<int64_t>(dim_tensor1 - 2, 0)));
+  auto batch_b = array_to_small_vector(IntArrayRef(mat2.sizes().data(), std::max<int64_t>(dim_tensor2 - 2, 0)));
+
+  batch_a.insert(batch_a.begin(), std::max<int64_t>(batch_a.size(), batch_b.size()) - batch_a.size(), 1);  // left-pad with 1s
+  batch_b.insert(batch_b.begin(), std::max<int64_t>(batch_a.size(), batch_b.size()) - batch_b.size(), 1);  // so both have equal rank
+
+  SmallVector<int64_t, SIZE> output_size;
+  for (size_t i = 0; i < batch_a.size(); ++i) {
+    if (batch_a[i] == 1) {
+      output_size.emplace_back(batch_b[i]);
+    } else if (batch_b[i] == 1) {
+      output_size.emplace_back(batch_a[i]);
+    } else if (batch_a[i] != batch_b[i]) {
+      AT_ERROR("mat1 and mat2 cannot broadcast, but they are mat1 ",
+          mat1.sizes(), " mat2 ", mat2.sizes());  // stream the shapes themselves, not the raw data() pointers
+    } else {
+      output_size.emplace_back(batch_a[i]);
+    }
   }
+  output_size.emplace_back(m);
+  output_size.emplace_back(n);
+
+  return output_size;
 }
 
+Tensor pure_bmm_v2_npu(const Tensor& self, const Tensor& mat2, const SmallVector<int64_t, SIZE>& output_size) {
+  auto tensor1 = self.dim() == 1 ? self.view({1, self.size(0)}) : self;
+  auto tensor2 = mat2.dim() == 1 ? mat2.view({mat2.size(0), 1}) : mat2;
 
-Tensor bmm_v2_npu(const Tensor& self, const Tensor& mat2) {
-	auto outputSize = bmm_v2_output_size(self, mat2);
-	Tensor result;
+  Tensor result;
 
-  if ((self.scalar_type() == ScalarType::Float || self.scalar_type() == ScalarType::Half)) {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_FRACTAL_NZ);
+  if ((tensor1.scalar_type() == ScalarType::Half)) {
+    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_FRACTAL_NZ);
   } else {
-    result = at::empty_with_format(outputSize, self.options(), ACL_FORMAT_ND);
+    result = at::empty_with_format(output_size, tensor1.options(), ACL_FORMAT_ND);
   }
 
-  Tensor contiguousSelf = self;
-  Tensor contiguousMat2 = mat2;
-  if(! CalcuOpUtil::is_transpose_last_two_dims(self)){
-    contiguousSelf = NpuUtils::format_contiguous(self);
+  Tensor contiguous_self = tensor1;
+  Tensor contiguous_mat2 = tensor2;
+  bool is_self_t = is_transpose_last_two_dims_v2(tensor1);
+  bool is_mat2_t = is_transpose_last_two_dims_v2(tensor2);
+
+  if(!is_self_t) {
+    contiguous_self = NpuUtils::format_contiguous(tensor1);
   }
-  if(! CalcuOpUtil::is_transpose_last_two_dims(mat2)){
-    contiguousMat2 = NpuUtils::format_contiguous(mat2);
+  if(!is_mat2_t) {
+    contiguous_mat2 = NpuUtils::format_contiguous(tensor2);
   }
 
-  auto func1 = [&contiguousSelf]() {
+  auto func1 = [&contiguous_self]() {
       bool pass = false;
-      return std::tie(pass, contiguousSelf);
+      return std::tie(pass, contiguous_self);
   };
-  auto func2 = [&contiguousMat2]() {
+  auto func2 = [&contiguous_mat2]() {
       bool pass = false;
-      return std::tie(pass, contiguousMat2);
+      return std::tie(pass, contiguous_mat2);
   };
 
-  bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self);
-  bool isMat2T = CalcuOpUtil::is_transpose_last_two_dims(mat2);
-
   // executing the NPU operator
   OpCommand cmd;
   cmd.Name("BatchMatMul")
       .InputWithFunc(func1)
       .InputWithFunc(func2)
       .Output(result)
-      .Attr("adj_x1", isSelfT)
-      .Attr("adj_x2", isMat2T)
+      .Attr("adj_x1", is_self_t)
+      .Attr("adj_x2", is_mat2_t)
       .Run();
 
   return result;
 }
+
+Tensor reshape_tensor_self(const Tensor& self, SmallVector<int64_t, SIZE>& expect_output_size) {
+  // Moves every batch axis whose expected output size is 1 to just before the last axis, then merges them into it.
+  // Example: self [5,6,7,17], expect_output_size [1,6,7,65]: permute -> [6,7,5,17], reshape -> [6,7,85].
+  SmallVector<int64_t, SIZE> self_permute_idx;
+  SmallVector<int64_t, SIZE> self_batch_idx;  // batch axes that will be folded into the last axis
+
+  for (int64_t i = 0; i < self.dim(); ++i) {
+    if (i < self.dim() - 2) {  // batch axes
+      if (expect_output_size[i] == 1) {
+        self_batch_idx.emplace_back(i);  // deferred: emitted right before the last axis
+        continue;
+      }
+    } else if (i == self.dim() - 1) {  // about to emit the last axis: place the deferred batch axes first
+      for (int64_t j = 0; j < self_batch_idx.size(); ++j) {
+        self_permute_idx.emplace_back(self_batch_idx[j]);
+      }
+    }
+    self_permute_idx.emplace_back(i);
+  }
+  Tensor tmp_self = self.permute(self_permute_idx);
+
+  int64_t m_idx = 0;
+  SmallVector<int64_t, SIZE> tmp_self_size;
+  SmallVector<int64_t, SIZE> tmp_self_size_low;
+
+  m_idx = self.dim() - self_batch_idx.size() - 1;  // number of leading axes kept unchanged
+  tmp_self_size = array_to_small_vector(tmp_self.sizes());
+  tmp_self_size_low.insert(tmp_self_size_low.end(), tmp_self_size.begin(), tmp_self_size.begin() + m_idx);
+  tmp_self_size_low.emplace_back(-1);  // -1 lets reshape infer the merged trailing size
+  tmp_self = tmp_self.reshape(tmp_self_size_low);
+  return tmp_self;
+}
+
+Tensor reshape_tensor_mat2(const Tensor& mat2, SmallVector<int64_t, SIZE>& expect_output_size) {
+  // Moves every batch axis whose expected output size is 1 to just before the second-to-last axis, then merges them into it.
+  // Example: mat2 [5,6,17,65], expect_output_size [1,6,7,65]: permute -> [6,5,17,65], reshape -> [6,85,65].
+  SmallVector<int64_t, SIZE> mat2_permute_idx;
+  SmallVector<int64_t, SIZE> mat2_batch_idx;  // batch axes that will be folded into the second-to-last axis
+
+  for (int64_t i = 0; i < mat2.dim(); ++i) {
+    if (i < mat2.dim() - 2) {  // batch axes
+      if (expect_output_size[i] == 1) {
+        mat2_batch_idx.emplace_back(i);  // deferred: emitted right before the second-to-last axis
+        continue;
+      }
+    } else if (i == mat2.dim() - 2) {  // about to emit the second-to-last axis: place the deferred batch axes first
+      for (int64_t j = 0; j < mat2_batch_idx.size(); ++j) {
+        mat2_permute_idx.emplace_back(mat2_batch_idx[j]);
+      }
+    }
+    mat2_permute_idx.emplace_back(i);
+  }
+  Tensor tmp_mat2 = mat2.permute(mat2_permute_idx);
+
+  int64_t k_idx = 0;
+  SmallVector<int64_t, SIZE> tmp_mat2_size;
+  SmallVector<int64_t, SIZE> tmp_mat2_size_low;
+
+  k_idx = mat2.dim() - mat2_batch_idx.size() - 2;  // number of leading axes kept unchanged
+  tmp_mat2_size = array_to_small_vector(tmp_mat2.sizes());
+  tmp_mat2_size_low.insert(tmp_mat2_size_low.end(), tmp_mat2_size.begin(), tmp_mat2_size.begin() + k_idx);
+  tmp_mat2_size_low.insert(tmp_mat2_size_low.end(), {-1, mat2.size(-1)});  // -1 infers the merged size; last axis is kept
+  tmp_mat2 = tmp_mat2.reshape(tmp_mat2_size_low);
+  return tmp_mat2;
+}
+
+SmallVector<int64_t, SIZE> align_small_vector(SmallVector<int64_t, SIZE> svec,
+                                              SmallVector<int64_t, SIZE> golden_svec) {
+  // Left-pads svec with 1s until it has as many entries as golden_svec.
+  // Example: svec [6,7,65], golden_svec [5,6,7,65] -> [1,6,7,65].
+  const int64_t missing_dims = golden_svec.size() - svec.size();
+  SmallVector<int64_t, SIZE> padded;
+  padded = svec;
+  if (missing_dims > 0) {
+    padded.insert(padded.begin(), missing_dims, 1);
+  }
+  return padded;
+}
+
+void expand_tensor(Tensor& self, Tensor& mat2, SmallVector<int64_t, SIZE>& expand_output_size) {  // rewrites self and mat2 in place
+  self = self.dim() == 1 ? self.view({1, self.size(0)}) : self;  // 1-D self is treated as [1, k]
+  mat2 = mat2.dim() == 1 ? mat2.view({mat2.size(0), 1}) : mat2;  // 1-D mat2 is treated as [k, 1]
+  int64_t m = self.size(-2);
+  int64_t k1 = self.size(-1);
+  int64_t k2 = mat2.size(-2);
+  int64_t n = mat2.size(-1);
+
+  std::vector<int64_t> expand_batch_portion(expand_output_size.begin(), expand_output_size.end() - 2);  // all but the last two dims
+  std::vector<int64_t> self_expand_size(expand_batch_portion);
+  std::vector<int64_t> mat2_expand_size(expand_batch_portion);
+
+  self_expand_size.insert(self_expand_size.end(), {m, k1});
+  mat2_expand_size.insert(mat2_expand_size.end(), {k2, n});
+
+  int64_t expand_batch_product = std::accumulate(expand_batch_portion.begin(), expand_batch_portion.end(),
+                                                 int64_t{1}, std::multiplies<int64_t>());  // int64_t seed: accumulate in 64 bits
+
+  std::vector<int64_t> self_bmm_view({expand_batch_product});  // batch dims collapsed into one leading dim
+  std::vector<int64_t> mat2_bmm_view({expand_batch_product});
+  self_bmm_view.insert(self_bmm_view.end(), {m, k1});
+  mat2_bmm_view.insert(mat2_bmm_view.end(), {k2, n});
+
+  self = self.expand(self_expand_size).reshape(self_bmm_view);  // broadcast to the batch shape, then flatten to 3-D
+  mat2 = mat2.expand(mat2_expand_size).reshape(mat2_bmm_view);
+}
+
+Tensor bmm_v2_npu(const Tensor& self, const Tensor& mat2, IntArrayRef output_sizes) {
+  auto expect_output_size = array_to_small_vector(output_sizes);  // shape requested by the caller; may be empty
+  auto infer_output_size = bmm_v2_output_size(self, mat2);  // shape inferred from the operands
+  Tensor tmp_self = self;
+  Tensor tmp_mat2 = mat2;
+
+  // forward propagation: no output_sizes given, so the inferred shape is the result shape
+  if (expect_output_size.empty()) {
+    // avoid some accuracy error caused by transdata
+    expand_tensor(tmp_self, tmp_mat2, infer_output_size);
+    expect_output_size = infer_output_size;
+    infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2);  // shape of the flattened (single batch dim) product
+
+    auto res = pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size);
+    infer_output_size = expect_output_size;
+
+    if (self.dim() == 1) {
+      // [k][b, k, n] -> [b, 1, n] -> [b, n]: drop the second-to-last dim
+      infer_output_size.erase(infer_output_size.end() - 2);
+      return res.view(infer_output_size);
+    } else if (mat2.dim() == 1) {
+      // [b, m, k][k] -> [b, m, 1] -> [b, m]: drop the last dim
+      infer_output_size.erase(infer_output_size.end() - 1);
+      return res.view(infer_output_size);
+    }
+    return res;
+  }
+
+  // backward propagation: the caller supplied the expected output shape
+  SmallVector<int64_t, SIZE> tmp_expect_output_size = expect_output_size;
+  SmallVector<int64_t, SIZE> axis_reduce;  // axes where the expected and inferred sizes differ
+  SmallVector<int64_t, SIZE> tmp_self_size;
+  SmallVector<int64_t, SIZE> tmp_mat2_size;
+
+  tmp_expect_output_size = align_small_vector(expect_output_size, infer_output_size);  // left-pad with 1s to the inferred rank
+  for (int i = 0; i < tmp_expect_output_size.size(); ++i) {
+    if (tmp_expect_output_size[i] != infer_output_size[i]) {
+      axis_reduce.emplace_back(i);
+    }
+  }
+
+  // no reduce_sum: expected and inferred shapes agree on every axis
+  if (axis_reduce.empty()) {
+    // avoid some accuracy error caused by transdata
+    expand_tensor(tmp_self, tmp_mat2, infer_output_size);
+    infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2);
+    return pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size);
+  }
+
+  // reduce sum without accuracy error: batch axes expected to be 1 are merged into the contracted axis of both operands
+  tmp_self_size = align_small_vector(array_to_small_vector(self.sizes()), infer_output_size);
+  tmp_mat2_size = align_small_vector(array_to_small_vector(mat2.sizes()), infer_output_size);
+  tmp_self = self.reshape(tmp_self_size);  // pad both operands to the inferred rank
+  tmp_mat2 = mat2.reshape(tmp_mat2_size);
+  tmp_self = reshape_tensor_self(tmp_self, tmp_expect_output_size);
+  tmp_mat2 = reshape_tensor_mat2(tmp_mat2, tmp_expect_output_size);
+  infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2);
+  // avoid some accuracy error caused by transdata
+  expand_tensor(tmp_self, tmp_mat2, infer_output_size);
+  infer_output_size = bmm_v2_output_size(tmp_self, tmp_mat2);
+  return pure_bmm_v2_npu(tmp_self, tmp_mat2, infer_output_size).view(expect_output_size);
+}
+
 } // namespace native
 } // namespace at
diff --git a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
index d75a2c65f49d8962a8793d823504dbb1e3cee104..fa2a86b0871c7d06d9199d4663ff3986736141b3 100644
--- a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
@@ -14,9 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <ATen/ATen.h>
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -59,9 +58,7 @@ Tensor bounding_box_decode_npu(
     double wh_ratio_clip) {
   SmallVector<int64_t, SIZE> outputSize = {rois.size(0), 4};
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, rois.options(), CalcuOpUtil::get_tensor_npu_format(rois));
-
+  Tensor result = OpPreparation::ApplyTensor(rois, outputSize);
   SmallVector<float, SIZE> means = {
       static_cast<float>(means0),
       static_cast<float>(means1),
diff --git a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
index aa497ce0582c4d05bdf59b90ae6ec0d6d9df3d2b..3e02aad811f780301ab953efbb7f330a3d6ecfae 100644
--- a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
@@ -51,11 +51,7 @@ Tensor bounding_box_encode_npu(
     double stds2,
     double stds3) {
   // construct the output tensor of the NPU
-  Tensor delats = at::empty_with_format(
-      {anchor_box.size(0), 4},
-      anchor_box.options(),
-      CalcuOpUtil::get_tensor_npu_format(anchor_box));
-
+  Tensor delats = OpPreparation::ApplyTensor(anchor_box, {anchor_box.size(0), 4});
   SmallVector<float, SIZE> means = {
       static_cast<float>(means0),
       static_cast<float>(means1),
diff --git a/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp b/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp
index 60e9435f100b3703c94ea74c57b70650d3fc6b10..4d280d91394a1df67fc94cb56c033a8a1b34bc78 100644
--- a/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BroadcastKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/CastKernelNpu.cpp b/src/aten/src/ATen/native/npu/CastKernelNpu.cpp
index 83eef18c41f46a8ab5c4312f3de55dc82fbe888a..09606c64483cf80891e5ba915fe7c414928d02a2 100644
--- a/src/aten/src/ATen/native/npu/CastKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CastKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
index 109f0d71d1d0cf49f6ad8d8374eda1d28dbe813f..8c3ac876475bf2928430246cb85f1ab59a121e4e 100644
--- a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
@@ -16,6 +16,7 @@
 
 #include "c10/npu/OptionsManager.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp
index ba36c3baf8410813af92657fd6a18a9e817eebcf..d67326ac651275e4d3514d4ff77fb5033af1e446 100644
--- a/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CdistBackwardKernelNpu.cpp
@@ -12,10 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -81,12 +78,7 @@ Tensor _cdist_backward_npu(
 
   //Executing the NPU operator.
   auto outputSize = input_same_output_size(x1);
-
-  Tensor result = at::empty_with_format(
-    outputSize,
-    tensor1_broadcast.options(),
-    CalcuOpUtil::get_tensor_npu_format(tensor1_broadcast));
-  
+  Tensor result = OpPreparation::ApplyTensor(tensor1_broadcast, outputSize);
   OpCommand cmd;
   cmd.Name("CdistGrad")
       .Input(grad_broadcast)
diff --git a/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp b/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp
index 302d49e6426178896dff91c2f82e647d08aa02f4..660ac93b12581a3dd4764b6b35f01d8ccd23354c 100644
--- a/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CeilKernelNpu.cpp
@@ -33,24 +33,13 @@ Tensor& ceil_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor ceil_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   ceil_out_npu(result, self);
-
   return result;
 }
 
 Tensor& ceil_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = ceil_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/TraceKernelNpu.cpp b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
similarity index 36%
rename from src/aten/src/ATen/native/npu/TraceKernelNpu.cpp
rename to src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
index d086e066aa8c511a5daeed1cd9020c94f100b7bc..fc4602ea8554f5cd851623c72283df02dddf158d 100644
--- a/src/aten/src/ATen/native/npu/TraceKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
@@ -1,4 +1,6 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
 //
 // Licensed under the BSD 3-Clause License  (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,39 +14,46 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
 using namespace at::native::npu;
-SmallVector<NPUTensorDesc, N> trace_npu_input(const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
 
-SmallVector<NPUTensorDesc, N> trace_npu_output(const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+Tensor celu_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar alpha) {
+  float alpha3 = 1.0;
+  OpCommand cmd;
+  cmd.Name("Celu")
+        .Input(self)
+        .Output(result)
+        .Attr("alpha1", alpha)
+        .Attr("alpha2", alpha)
+        .Attr("alpha3", alpha3)
+        .Run();
+  return result;
 }
 
-SmallVector<NPUAttrDesc, N> trace_npu_attr() {
-  SmallVector<NPUAttrDesc, N> attrs = {};
-  return attrs;
+Tensor celu_out_npu(Tensor& result, const Tensor& self, Scalar alpha) {
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self}, {result})
+   .Func([&self, &alpha](Tensor& result){celu_out_npu_nocheck(result, self, alpha);})
+   .Call(result);
 }
 
-Tensor& trace_out_npu(Tensor& result, const Tensor& self) {
-  auto inputs = trace_npu_input({self});
-  auto outputs = trace_npu_output({result});
-  auto attrs = trace_npu_attr();
-  CalcuOpUtil::execute_npu_operate("Trace", inputs, outputs, attrs);
+Tensor celu_npu(const Tensor& self, Scalar alpha) {
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
+
+  // calculate the output result of the NPU
+  celu_out_npu(result, self, alpha);
+
   return result;
 }
 
-Tensor trace_npu(const Tensor& self) {
-  auto outputSize = trace_npu_output_size(self);
-  Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  trace_out_npu(result, self);
-  return result.reshape({});
-}
+Tensor& celu_npu_(Tensor& self, Scalar alpha) {
+  celu_out_npu(self, self, alpha);
+  return self;
 }
-}
\ No newline at end of file
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp b/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp
index 0bda2d33b4c005d8823db078b3c5c88e02391ee9..4db76efd8096ea033de19d68b123604ff85a83d1 100644
--- a/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CholeskyKernelNpu.cpp
@@ -14,7 +14,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
 #include "ATen/native/npu/utils/OpTemplate.h"
diff --git a/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp b/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp
index d38894507aedf66fe45da041aa1955cc20dfc3c4..f34b4acab6e077c1a33e05520c3d9ec0984dd494 100644
--- a/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ClampKernelNpu.cpp
@@ -16,8 +16,6 @@
 
 #include <climits>
 #include <float.h>
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
@@ -40,14 +38,11 @@ Tensor& clamp_min_out_npu_nocheck(
     max = NPU_HALF_MAX;
   }
 
-  Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min, self.scalar_type());
-  Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max, self.scalar_type());
-  
   OpCommand cmd;
   cmd.Name("ClipByValue")
       .Input(self)
-      .Input(minTensor)
-      .Input(maxTensor)
+      .Input(min, self.scalar_type())
+      .Input(max, self.scalar_type())
       .Output(result)
       .Run();
   return result;
@@ -60,9 +55,7 @@ Tensor& clamp_min_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
@@ -84,15 +77,12 @@ Tensor& clamp_max_out_npu(Tensor& result, const Tensor& self, Scalar max) {
   } else {
     min = NPU_HALF_MIN;
   }
-  
-  Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min, self.scalar_type());
-  Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max, self.scalar_type());
-  
+
   OpCommand cmd;
   cmd.Name("ClipByValue")
       .Input(self)
-      .Input(minTensor)
-      .Input(maxTensor)
+      .Input(min, self.scalar_type())
+      .Input(max, self.scalar_type())
       .Output(result)
       .Run();
   return result;
@@ -111,15 +101,12 @@ Tensor& clamp_out_npu_nocheck(
     Scalar minScalar = min.value();
     clamp_min_out_npu(result, self, minScalar);
 
-  } else {   
-    Tensor minTensor = CalcuOpUtil::CopyScalarToDevice(min.value(), self.scalar_type());
-    Tensor maxTensor = CalcuOpUtil::CopyScalarToDevice(max.value(), self.scalar_type());
-    
+  } else {
     OpCommand cmd;
     cmd.Name("ClipByValue")
         .Input(self)
-        .Input(minTensor)
-        .Input(maxTensor)
+        .Input(min.value(), self.scalar_type())
+        .Input(max.value(), self.scalar_type())
         .Output(result)
         .Run();   
   }
@@ -135,9 +122,7 @@ Tensor& clamp_out_npu(
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
@@ -148,16 +133,8 @@ Tensor& clamp_out_npu(
 }
 
 Tensor clamp_min_npu(const Tensor& self, Scalar min) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   clamp_min_out_npu_nocheck(result, self, min);
-
   return result;
 }
 
@@ -168,24 +145,14 @@ Tensor& clamp_min_npu_(Tensor& self, Scalar min) {
 }
 
 Tensor clamp_max_npu(const Tensor& self, Scalar max) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   clamp_max_out_npu(result, self, max);
 
   return result;
 }
 
 Tensor& clamp_max_npu_(Tensor& self, Scalar max) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = clamp_max_out_npu(contiguousSelf, contiguousSelf, max);
@@ -201,22 +168,13 @@ Tensor clamp_npu(
     const Tensor& self,
     optional<Scalar> min,
     optional<Scalar> max) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   clamp_out_npu_nocheck(result, self, min, max);
-
   return result;
 }
 
 Tensor& clamp_npu_(Tensor& self, optional<Scalar> min, optional<Scalar> max) {
   clamp_out_npu(self, self, min, max);
-
   return self;
 }
 
diff --git a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
index 205b0c5343ed25505ceb265b3393761ca58d8758..12b36826dedf724a54e7c63b83900ad218cf1aba 100644
--- a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -36,8 +35,7 @@ Tensor confusion_transpose_npu(
   }
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(output_size, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, output_size);
   OpCommand cmd;
   cmd.Name("ConfusionTransposeD")
       .Input(self)
diff --git a/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp
index bbef7c1b5cdb4116d54ada0e9b212c2243cbf503..0f0ea1a70715c96d0d5f02cacac78a956b24d24a 100644
--- a/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ConvTbcKernelNpu.cpp
@@ -12,72 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
 using namespace at::native::npu;
 
-SmallVector<NPUTensorDesc, N> conv_tbc_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  SmallVector<Tensor, N> inputTensors;
-  for (int i = 0; i < inputTensor.size(); i++) {
-    if (inputTensor[i].defined()) {
-      inputTensors.emplace_back(inputTensor[i]);
-    }
-  }
-
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensors);
-}
-
-SmallVector<NPUTensorDesc, N> conv_tbc_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> conv_tbc_npu_attr(int64_t pad) {
-  SmallVector<int64_t, N> paddings = {0, 0, pad, pad};
-  SmallVector<int64_t, N> stridesSize = {1, 1, 1, 1};
-  SmallVector<int64_t, N> dilations = {1, 1, 1, 1};
-
-  string dataFormat = "NCHW";
-
-  NPUAttrDesc npuAttrPads = NPUAttrDesc("pads", paddings);
-  NPUAttrDesc npuAttrStrides = NPUAttrDesc("strides", stridesSize);
-  NPUAttrDesc npuAttrDilations = NPUAttrDesc("dilations", dilations);
-  NPUAttrDesc npuAttrDataFormat = NPUAttrDesc("data_format", dataFormat);
-
-  SmallVector<NPUAttrDesc, N> attrs = {
-      npuAttrPads, npuAttrStrides, npuAttrDilations, npuAttrDataFormat};
-
-  return attrs;
-}
-
-Tensor& conv_tbc_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& weight,
-    const Tensor& bias,
-    int64_t pad) {
-  // constructs the input and output NPUTensorDesc
-
-  auto inputs = conv_tbc_npu_input(
-      {self.transpose(0, 2).transpose(0, 1).unsqueeze(2),
-       weight.transpose(0, 2).unsqueeze(2),
-       bias});
-
-  auto outputs = conv_tbc_npu_output({result});
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = conv_tbc_npu_attr(pad);
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("Conv2D", inputs, outputs, attrs);
-
-  return result;
-}
-
 Tensor conv_tbc_npu(
     const Tensor& self,
     const Tensor& weight,
@@ -101,14 +41,32 @@ Tensor conv_tbc_npu(
       "the weight tensor (output channels).");
 
   // calculate the output size
-  auto outputSize = conv_tbc_npu_output_size(self, weight, bias, pad);
+  int64_t Co = weight.size(2);
+  int64_t Wo = (self.size(0) + 2 * pad - (weight.size(0) - 1) - 1) + 1;
+
+  SmallVector<int64_t, SIZE> outputSize = {self.size(1), Co, 1, Wo};
 
   // construct the output tensor of the NPU
-  Tensor result =
-      at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NCHW);
+  Tensor result = OpPreparation::ApplyTensorWithFormat(self, outputSize, ACL_FORMAT_NCHW);
+
+  SmallVector<int64_t, N> paddings = {0, 0, pad, pad};
+  SmallVector<int64_t, N> stridesSize = {1, 1, 1, 1};
+  SmallVector<int64_t, N> dilations = {1, 1, 1, 1};
 
-  // calculate the output result of the NPU
-  conv_tbc_out_npu(result, self, weight, bias, pad);
+  Tensor self_tensor = self.transpose(0, 2).transpose(0, 1).unsqueeze(2);
+  Tensor weight_tensor = weight.transpose(0, 2).unsqueeze(2);
+
+  OpCommand cmd;
+  cmd.Name("Conv2D")
+    .Input(self_tensor)
+    .Input(weight_tensor)
+    .Input(bias)
+    .Output(result)
+    .Attr("pads", paddings)
+    .Attr("strides", stridesSize)
+    .Attr("dilations", dilations)
+    .Attr("data_format", (string)"NCHW")
+    .Run();
 
   result = result.squeeze(2).transpose(0, 2).transpose(1, 2);
   return result;
diff --git a/src/aten/src/ATen/native/npu/CosKernelNpu.cpp b/src/aten/src/ATen/native/npu/CosKernelNpu.cpp
index 835289881dbff440b70b78fa40811db5f03a9554..6874bb77e10b7a69d2578fa05864031f0ddbae74 100644
--- a/src/aten/src/ATen/native/npu/CosKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CosKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,26 +31,13 @@ Tensor& cos_out_npu(
 }
 
 Tensor cos_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      self.options(),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   cos_out_npu(result, self);
-
   return result;
 }
 
 Tensor& cos_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = cos_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp b/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp
index 6167f72f268b13e34c0c8f973437a8d8a04ae260..1830cc2cae826aa042b5d2ea35a0def2ff4ca769 100644
--- a/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CoshKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,9 +31,7 @@ Tensor& cosh_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor& cosh_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
@@ -48,13 +45,7 @@ Tensor& cosh_npu_(Tensor& self) {
 }
 
 Tensor cosh_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   cosh_out_npu(result, self);
   return result;
diff --git a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
index c9d99226ff804feebe43763bf4ccc9b3c8751087..64ddb33c2eef56fe67d91c0e66759d34b34349c0 100644
--- a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp
index 77a3b575f9f7fc56809a7ac8a90f1b041ca0b4c0..46be0dcb91449658ce223fc9cb6d268db17db7f1 100644
--- a/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CtcLossBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -50,30 +49,29 @@ Tensor ctc_loss_backward_npu(
   if (logAlpha.scalar_type() == ScalarType::Half) {
     logAlphaNeed = logAlphaNeed.to(ScalarType::Float);
   }
+  
+  Tensor targetsCast = targets;
+  if(targets.scalar_type() == ScalarType::Long){
+    targetsCast = targetsCast.to(ScalarType::Int);
+  }
+  
+  auto inputLengthsTensor = at::tensor(inputLengths, targetsCast.options().dtype(at::kInt));
+  auto targetLengthsTensor = at::tensor(targetLengths, targetsCast.options().dtype(at::kInt));
 
-  // IntArrayRef to Tensor
-  auto inputLengthsTensor = at::tensor(inputLengths, targets.options().dtype(at::kLong));
-  auto targetLengthsTensor = at::tensor(targetLengths, targets.options().dtype(at::kLong));
-
-  // calculate the output size
-  auto outputSize = input_same_output_size(logProbs);
+  auto outputSize = {logProbs.size(1), logProbs.size(0), logProbs.size(2)};
 
   // construct the output tensor of the NPU
-  Tensor grad = at::empty_with_format(
-      outputSize,
-      logProbsNeed.options(),
-      CalcuOpUtil::get_tensor_npu_format(logProbsNeed));
-
+  Tensor grad = OpPreparation::ApplyTensor(logProbsNeed, outputSize);
   // calculate the output result of the NPU
   OpCommand cmd;
   cmd.Name("CTCLossV2Grad")
       .Input(gradOutNeed)
       .Input(logProbsNeed)
-      .Input(targets)
-      .Input(negLogLikelihoodNeed)
-      .Input(logAlphaNeed)
+      .Input(targetsCast)
       .Input(inputLengthsTensor)
-      .Input(targetLengthsTensor)
+      .Input(targetLengthsTensor)      
+      .Input(negLogLikelihoodNeed)
+      .Input(logAlphaNeed)      
       .Output(grad)
       .Attr("blank", blank)
       .Attr("zero_infinity", zeroInfinity)
@@ -82,8 +80,9 @@ Tensor ctc_loss_backward_npu(
   if (gradOut.scalar_type() == ScalarType::Half) {
     grad = grad.to(ScalarType::Half);
   }
-
-  return grad;
+  
+  // grad was allocated with logProbs' dims 0 and 1 swapped; permute it back to logProbs' dim order.
+  return grad.permute({1,0,2});
 }
 } // namespace native
 } // namespace at
diff --git a/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp
index e048cfa2388c212aed0bcb09e22a5368e56f1aef..860a00a71cb380a1872c496f38d704afd6ffb44b 100644
--- a/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CtcLossKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -33,12 +33,18 @@ std::tuple<Tensor, Tensor> ctc_loss_npu(
     logProbsNeed = logProbsNeed.to(ScalarType::Float);
   }
   
+  // AI Core supports only the Int type here, so Long targets are cast to Int.
+  Tensor targetsCast = targets;
+  if(targets.scalar_type() == ScalarType::Long){
+    targetsCast = targetsCast.to(ScalarType::Int);
+  }
+  
   // IntArrayRef to Tensor
-  auto inputLengthsTensor = at::tensor(inputLengths, targets.options().dtype(at::kLong));
-  auto targetLengthsTensor = at::tensor(targetLengths, targets.options().dtype(at::kLong));
+  auto inputLengthsTensor = at::tensor(inputLengths, targetsCast.options());
+  auto targetLengthsTensor = at::tensor(targetLengths, targetsCast.options());
   
   // calculate the output size
-  auto outputSizes = ctc_loss_npu_output_size(logProbs, targets, targetLengths);
+  auto outputSizes = ctc_loss_npu_output_size(logProbs, targetsCast, targetLengths);
 
   // construct the output tensor of the NPU
   Tensor negLogLikelihood = at::empty_with_format(
@@ -55,7 +61,7 @@ std::tuple<Tensor, Tensor> ctc_loss_npu(
   OpCommand cmd;
   cmd.Name("CTCLossV2")
       .Input(logProbsNeed)
-      .Input(targets)
+      .Input(targetsCast)
       .Input(inputLengthsTensor)
       .Input(targetLengthsTensor)
       .Output(negLogLikelihood)
diff --git a/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp b/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp
index e7977bd3fbe0ce1a8253d24682ae62a2d78b087f..adf6401b083312dc386baa95bd611f83882f95f5 100644
--- a/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CumprodKernelNpu.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include<ATen/NamedTensorUtils.h>
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp b/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp
index a50c9487e52bdeccf3837e8a80a3731fa1ed3138..62118b04432af314622917fa5f785e503a02649d 100644
--- a/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DiagKernelNpu.cpp
@@ -73,8 +73,7 @@ Tensor& diag_out_npu(Tensor& result, const Tensor& self, int64_t diagonal) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
+      self,
       outputSize);
 
   OpPipeWithDefinedOut pipe;
diff --git a/src/aten/src/ATen/native/npu/DivKernelNpu.cpp b/src/aten/src/ATen/native/npu/DivKernelNpu.cpp
index 1c468502484b15042fba417cdb2154d1c8ba303a..d187d19736f19e19b2463e3ac5dd49bafce085ec 100644
--- a/src/aten/src/ATen/native/npu/DivKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DivKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/DotKernelNpu.cpp b/src/aten/src/ATen/native/npu/DotKernelNpu.cpp
index 73fb05c97bfb3efab3d27676ffd2ab1f968ef44b..9f29b4b6b6edb8948b4df5ddeee473065fc0a3dc 100644
--- a/src/aten/src/ATen/native/npu/DotKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DotKernelNpu.cpp
@@ -12,10 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h" 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -34,9 +31,8 @@ Tensor& dot_out_npu(Tensor& result, const Tensor& self, const Tensor& tensor) {
   return result;
 }
 Tensor dot_npu(const Tensor& self, const Tensor& tensor) {
-  // calculate the output size
   SmallVector<int64_t, SIZE> outputSize = dot_npu_output_size(self, tensor);
-  Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   dot_out_npu(result, self, tensor);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp
index 26ececdb639c920f71b82e8c59ee4d69ebb96a44..d7be9f5e3563b818b0a24a9595fc91bcc0cac33c 100644
--- a/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DropoutBackwardKernelNpu.cpp
@@ -13,9 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "ATen/native/npu/utils/OpAdapter.h"
 #include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
 
 namespace at {
 namespace native {
@@ -31,12 +30,8 @@ Tensor dropout_backward_npu(
   TORCH_CHECK(
       mask.scalar_type() == at::ScalarType::Byte,
       "mask should be torch.uint8 dtype");
-  auto outputSize = input_same_output_size(grad_output);
   double retain =  1. - scale;
-  Tensor result = at::empty_with_format(
-      outputSize,
-      grad_output.options(),
-      CalcuOpUtil::get_tensor_npu_format(grad_output));
+  Tensor result = OpPreparation::ApplyTensor(grad_output);
   Tensor prob =
       CalcuOpUtil::CopyScalarToDevice(retain, grad_output.scalar_type());
 
diff --git a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
index b82e7728d7eb67e2c91f5c08179f6e378e665cd1..34c41be07f188d2ae1ed3400ebb97b6e1aa6d926 100644
--- a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
@@ -13,8 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -53,9 +52,7 @@ Tensor dropout_v2_backward_npu(const Tensor& grad_output, const Tensor& mask, do
   if (maskCopy.scalar_type() == ScalarType::Byte){
     maskCopy = maskCopy.to(ScalarType::Half);
   }
-  auto outputSize = input_same_output_size(grad_output);
-  auto result = at::empty_with_format(
-      outputSize, grad_output.options(), CalcuOpUtil::get_tensor_npu_format(grad_output));
+  auto result = OpPreparation::ApplyTensor(grad_output);
   dropout_v2_backward_out_npu(result, grad_output, maskCopy, p);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
index 18b7883787277ce8e2ef968be386f41e5f0d44b0..e4a435e77a4c8f9d479ae5a57b6ff43f2fa01eea 100644
--- a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -46,11 +45,8 @@ tuple <Tensor, Tensor, Tensor> dropout_v2_npu(const Tensor& self, Tensor& seed,
   Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
   Tensor formatCastOfSeed = OpPreparation::CastBackToOriFormat(seed);
   
-  Tensor result = at::empty_with_format(
-      formatCastOfSelf.sizes(), formatCastOfSelf.options(), CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf));
-  Tensor mask = at::empty_with_format(
-      formatCastOfSelf.sizes(), formatCastOfSeed.options(), CalcuOpUtil::get_tensor_npu_format(formatCastOfSelf));
-
+  Tensor result = OpPreparation::ApplyTensor(formatCastOfSelf);
+  Tensor mask = OpPreparation::ApplyTensor(formatCastOfSelf, formatCastOfSeed.options());
   dropout_v2_out_npu(result, mask, formatCastOfSeed, formatCastOfSelf, formatCastOfSeed, p);
   NpuUtils::format_fresh_view(seed, formatCastOfSeed);
   return std::tuple<Tensor, Tensor, Tensor>(result, mask, seed);
diff --git a/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp b/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp
index 3b0e3886b990e627ab2c5a8935f44d731809982f..b25e4f43b8e6d763dd55c76126e6b6ff86317f36 100644
--- a/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/EmbeddingKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp
index 89c567438c2b9cc94816a6aa61b28c229e05edee..becfb6f46da8c924b65a3c8c334ec950d1b37c58 100644
--- a/src/aten/src/ATen/native/npu/EqKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/EqKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp b/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp
index ac1bbf17e73a7ed8730f525e47ff144fa89c6867..a773eee31289a6b306027804c9d3be8a8b432920 100644
--- a/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ExpKernelNpu.cpp
@@ -34,9 +34,7 @@ Tensor& exp_out_npu(Tensor& result, const Tensor& self) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
@@ -51,11 +49,7 @@ Tensor& exp_npu_(Tensor& self) {
 }
 
 Tensor exp_npu(const Tensor& self) {
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   exp_out_npu_nocheck(result, self);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/FillKernelNpu.cpp b/src/aten/src/ATen/native/npu/FillKernelNpu.cpp
index f22bd1028c6071080e67fbefd618608026704a7d..0393a62c8447154335e80f3f9605a74f820882b3 100644
--- a/src/aten/src/ATen/native/npu/FillKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FillKernelNpu.cpp
@@ -44,12 +44,11 @@ Tensor& fill_out_npu(Tensor& result, Tensor& self, const Tensor& other) {
 Tensor& fills_out_npu(Tensor& result, Tensor& self, Scalar value) {
   AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "fills_out_npu", [&]() {
     auto value_converted = value.to<scalar_t>();}); 
-  float scalar = CalcuOpUtil::get_scalar_float_value(value);
   OpCommand cmd;
   cmd.Name("Fills")
       .Input(self)
       .Output(result)
-      .Attr("value", scalar)
+      .Attr("value", value)
       .Run();
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp b/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp
index 38635a9480ce3bdaf3539b237a32f32714e7c904..ab6b6a2c4cb21571603977d5651ea2d42355e4a3 100644
--- a/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FlipKernelNpu.cpp
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "c10/npu/OptionsManager.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
 
@@ -21,12 +20,7 @@ namespace native {
 using namespace at::native::npu;
 
 Tensor flip_npu(const Tensor& self, IntArrayRef dims){
-    // calculate the output size
-    auto outputSize = input_same_output_size(self);
-    
-    // construct the output tensor of the NPU
-    Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+    Tensor result = OpPreparation::ApplyTensor(self);
     SmallVector<int64_t,N> dimVec = array_to_small_vector(dims);
     if (!c10::npu::OptionsManager::CheckDynamicEnable()) {  
       OpCommand cmd;
diff --git a/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp b/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp
index c0cfee496dfb5b1fb3fb4b56bfd7eb95e7ed3bf2..868f3c08f7ca4b99e40f6bfe510e263dce3c55fe 100644
--- a/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FloorDivideKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp b/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp
index 07fc34e6975b5d9a81e0ed2799e4ca693f994b28..bfc5ac5f3cf1b0bdcb0d4fcf372f7e0780812411 100644
--- a/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FloorKernelNpu.cpp
@@ -34,9 +34,7 @@ Tensor& floor_out_npu(Tensor& result, const Tensor& self) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
@@ -51,11 +49,7 @@ Tensor& floor_npu_(Tensor& self) {
 }
 
 Tensor floor_npu(const Tensor& self) {
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   floor_out_npu_nocheck(result, self);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp b/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp
index 1ee4b82bd8c2e11aa06c0cff262c0831413765cf..ed29f29842bae495592daf5b596ccd2dee4d7138 100644
--- a/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FmodKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -49,9 +48,8 @@ Tensor& fmod_out_npu(Tensor& result, const Tensor& self, const Tensor& other) {
   auto outputSize = broadcast_ops_npu_output_size(self, other);
   OpPreparation::CheckOut(
     {self, other}, 
-    result, 
-    CalcuOpUtil::get_tensor_npu_format(self), 
-    self.scalar_type(), 
+    result,
+    self, 
     outputSize);
   
   fmod_out_npu_nocheck(result, self, other);
@@ -65,10 +63,7 @@ Tensor& fmod_out_npu(Tensor& result, const Tensor& self, Scalar other) {
 }
 
 Tensor& fmod_npu_(Tensor& self, Scalar other) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = fmod_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
@@ -81,10 +76,7 @@ Tensor& fmod_npu_(Tensor& self, Scalar other) {
 }
 
 Tensor& fmod_npu_(Tensor& self, const Tensor& other) {
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self, other}, {self}); 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = fmod_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
@@ -97,26 +89,14 @@ Tensor& fmod_npu_(Tensor& self, const Tensor& other) {
 }
 
 Tensor fmod_npu(const Tensor& self, Scalar other) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   fmod_out_npu_nocheck(result, self, other);
   return result;
 }
 
 Tensor fmod_npu(const Tensor& self, const Tensor& other) {
-  // calculate the output size
   auto outputSize = broadcast_ops_npu_output_size(self, other);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   fmod_out_npu_nocheck(result, self, other);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp b/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp
index 3d6d2dc560ec858f14cfddeb079a327ccbe0a445..6034cb27a981c5d6b592fe3ac179cb61b57f5807 100644
--- a/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GatherKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp
index c0f5189e30ef381ec3401d17eb1361766ccc229e..6169b9de059eb470ed71172330301d88ea77d319 100644
--- a/src/aten/src/ATen/native/npu/GeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GeKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp
index 77ad4b6e29a9f1b39c0c8030ae99dd03cf83906f..b736e79d6cd0bad59fe3f1d475972249ebc40630 100644
--- a/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GeluKernelNpu.cpp
@@ -11,18 +11,16 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+
+
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
 using namespace at::native::npu;
 
 Tensor gelu_npu(const Tensor& self) {
-  // calculate the output size
-    auto outputSize = input_same_output_size(self);
-  // construct the output tensor of the NPU
-    Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+    Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
     OpCommand cmd;
     cmd.Name("Gelu")
diff --git a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
index 7403be29cd0f5eb8c3b7a76dc88561ad050f4223..8651b3e70493d9ddc86a869f2deb45154dde1f05 100644
--- a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
@@ -54,8 +54,7 @@ Tensor& ger_out_npu(Tensor& result, const Tensor& self , const Tensor& vec2) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
+      self,
       outputSize);
 
   OpPipeWithDefinedOut pipe;
diff --git a/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp
index 80f7c17f6359ce1ce728da6a30e23d055e4d38a1..4d71349db4e0cb69646d14b2abaef55275ad1b1b 100644
--- a/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GridSampler3dBackwardKernelNpu.cpp
@@ -15,7 +15,6 @@
 // limitations under the License.
 
 #include <c10/npu/OptionsManager.h>
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp
index 10e53e0a3c57d8aa3e0acbb7099c64a954019261..75461018566a1c01580f3acc6804fe65e8921ce8 100644
--- a/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GridSampler3dKernelNpu.cpp
@@ -15,7 +15,6 @@
 // limitations under the License.
 
 #include <c10/npu/OptionsManager.h>
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp
index aa9c7394bb5540ea513008dd866954b03c6f90db..b9d0bfe9247169441b616fd38a1cb451cf6aa41b 100644
--- a/src/aten/src/ATen/native/npu/GtKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GtKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp
index 465928a0dd3bb4268310749e7abd19eb5ecce275..070af9fe52fd674968172abb510b8fd5676b7932 100644
--- a/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/HardsigmoidKernelNpu.cpp
@@ -39,10 +39,7 @@ Tensor hardsigmoid_npu(const Tensor& self) {
 }
 
 Tensor& hardsigmoid_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = hardsigmoid_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp
index 20cd8fdb1bf9389350b410f12307d8b451f4bd7a..786e53c2dc5edc29e6b6c09bce3cad0852b62dc2 100644
--- a/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/HardtanhBackwardKernelNpu.cpp
@@ -27,15 +27,13 @@ Tensor& hardtanh_backward_out_npu(
     const Tensor& self,
     Scalar min_val,
     Scalar max_val) {
-  float max_value = CalcuOpUtil::get_scalar_float_value(max_val);
-  float min_value = CalcuOpUtil::get_scalar_float_value(min_val);
   OpCommand cmd;
   cmd.Name("HardtanhGrad")
       .Input(self)
       .Input(grad_output)
       .Output(grad_input)
-      .Attr("max_val", max_value)
-      .Attr("min_val", min_value)
+      .Attr("max_val", max_val)
+      .Attr("min_val", min_val)
       .Run();
 
   return grad_input;
@@ -46,13 +44,7 @@ Tensor hardtanh_backward_npu(
     const Tensor& self,
     Scalar min_val,
     Scalar max_val) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor grad_input = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   hardtanh_backward_out_npu(grad_input, grad_output, self, min_val, max_val);
 
diff --git a/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp
index 90103032b7d18400320eef85d0a6220e58b3cc27..6686c026c6a54380b459be8b2d5a24650fbae327 100644
--- a/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/HardtanhKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -38,24 +37,13 @@ Tensor& hardtanh_out_npu(
 }
 
 Tensor hardtanh_npu(const Tensor& self, Scalar min, Scalar max) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   hardtanh_out_npu(result, self, min, max);
-      
   return result;
 }
 
 Tensor& hardtanh_npu_(Tensor& self, Scalar min, Scalar max) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = hardtanh_out_npu(contiguousSelf, contiguousSelf, min, max);
diff --git a/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp b/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp
index 5434ec7cddf335301d4f54298306d81a3b9206cf..5b3825428b15a9244d97f9005a1f1c62f4ef5911 100644
--- a/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IfmrKernelNpu.cpp
@@ -14,9 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
index bfa2f335330d583bdba8246e14f1d6ad7ebaf02b..c0a96311251e930e14be19ea74610e6a8f22671a 100644
--- a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
@@ -71,8 +71,7 @@ Tensor& im2col_backward_out_npu(
   OpPreparation::CheckOut(
       {grad_output},
       grad_input,
-      CalcuOpUtil::get_tensor_npu_format(grad_output),
-      grad_output.scalar_type(),
+      grad_output,
       outputSize);
 
   OpPipeWithDefinedOut pipe;
diff --git a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp
index eaa3d74bf562e8644fac716b996e9f7cb48e7da0..d658eec1deb854ba4fced8595f3c8e9648b1bcef 100644
--- a/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Im2colKernelNpu.cpp
@@ -124,8 +124,7 @@ Tensor& im2col_out_npu(Tensor& result, const Tensor &self, IntArrayRef kernel_si
   OpPreparation::CheckOut(
     {self},
     result,
-    CalcuOpUtil::get_tensor_npu_format(self),
-    self.scalar_type(),
+    self,
     image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding));
 
   OpPipeWithDefinedOut pipe;
@@ -139,10 +138,7 @@ Tensor im2col_npu(const Tensor &self, IntArrayRef kernel_size, IntArrayRef dilat
   // calculate the output size
   auto outputSize =
       image_to_col_npu_output_size(self, kernel_size, stride, dilation, padding);
-
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   im2col_out_npu(result, self, kernel_size, dilation, padding, stride);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp
index 93bb88780acda59938c2b9ba211e153d2f81404b..803b344e2e71911234c88ab2b28758c058ef559f 100644
--- a/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexAddKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include<ATen/NamedTensorUtils.h>
 
 namespace at {
 namespace native {
@@ -51,10 +52,7 @@ Tensor& index_add_npu_(
     int64_t dim,
     const Tensor& index,
     const Tensor& source) {
-  SmallVector<Tensor, N> inputs = {self, index, source};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self, index, source}, {self});
   if (!NpuUtils::check_match(&self)) {
       Tensor contiguousSelf = NpuUtils::format_contiguous(self);
       Tensor result = index_add_out_npu(contiguousSelf, contiguousSelf, dim, index, source);
diff --git a/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp
index a5ef53dec31dd29bb3469517eef25b5b9682efe9..0a9db345f5160c6e677bea6a41a6c4a69cd44ced 100644
--- a/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexFillDKernelNpu.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include <vector>
 
 namespace at{
diff --git a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
index 04ea6edcb722b8b0ef06f5727d4a084f4ca126cc..9cbbf8f8416d6117e5b701cab3bb973e1c8b3fc8 100644
--- a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -37,10 +38,9 @@ Tensor& index_put_nocheck(
     }
   }
 
-  Tensor masksTensor = CalcuOpUtil::copy_tensor_host_to_device(
+  auto masksTensor = CalcuOpUtil::copy_tensor_host_to_device(
       from_blob(masks.data(), {masks.size()}, dtype(ScalarType::Long)));
 
-
   OpCommand cmd;
   cmd.Name("IndexPut")
       .Input(self)
@@ -78,10 +78,7 @@ Tensor& _index_put_impl_npu_(
     const Tensor& value,
     const bool accumulate,
     const bool unsafe) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   OpPreparation::CastBackToOriFormat(self);
 
   Tensor valueCopy = value;
diff --git a/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp
index 9cd54424d690c3c5132cf4bb89ea1c2d5348fa42..277ed1c8a2f2ca00b8ba7409ed30990a4e4cb4a2 100644
--- a/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexSelectKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp
index da7b15df158c77b72dc70ee5d9f90e72f627e3a5..864a096f73936ea35bd14e979ff85f69e917b12b 100644
--- a/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexingKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -56,9 +55,7 @@ Tensor indexing_npu(
     outputSize.emplace_back((end[i] + strides[i] - 1 - begin[i]) / strides[i]);
   }
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   indexing_out_npu(result, self, begin, end, strides);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp b/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp
deleted file mode 100644
index f61cc8982c6733b1837a9673deda3eb9f9741c6c..0000000000000000000000000000000000000000
--- a/src/aten/src/ATen/native/npu/InstanceNormKernelNpu.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <c10/npu/NPUCachingAllocator.h>
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> instance_norm_npu_input(const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> instance_norm_npu_output(const Tensor& result) {
-  return CalcuOpUtil::create_npu_output_tensor_desc({result});
-}
-
-SmallVector<NPUAttrDesc, N> instance_norm_npu_attr(bool use_input_stats, double momentum, double eps) {
-  NPUAttrDesc npuAttrStats = NPUAttrDesc("use_input_stats", use_input_stats);
-  NPUAttrDesc npuAttrMomentum = NPUAttrDesc("momentum", static_cast<float>(momentum));
-  NPUAttrDesc npuAttrEpsilon = NPUAttrDesc("eps", static_cast<float>(eps));
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrStats, npuAttrMomentum, npuAttrEpsilon};
-  return attrs;
-}
-
-Tensor& instance_norm_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& weight,
-    const Tensor& bias,
-    const Tensor& running_mean,
-    const Tensor& running_var,
-    bool use_input_stats,
-    double momentum,
-    double eps) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = instance_norm_npu_input(
-      {self, weight, bias, running_mean, running_var});
-  auto outputs = instance_norm_npu_output(result);
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = instance_norm_npu_attr(use_input_stats, momentum, eps);
-
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("InstanceNorm", inputs, outputs, attrs);
-
-  return result;
-}
-
-Tensor instance_norm_npu(
-    const Tensor& self,
-    const Tensor& weight,
-    const Tensor& bias,
-    const Tensor& running_mean,
-    const Tensor& running_var,
-    bool use_input_stats,
-    double momentum,
-    double eps,
-    bool cudnn_enabled) {
-  TORCH_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), 
-              "Expected running_mean and running_var to be defined when use_input_stats is false");
-  Tensor result = at::empty_with_format(self.sizes(), self.options(), 
-                                        CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
-  instance_norm_out_npu(
-      result,
-      self,
-      weight,
-      bias,
-      running_mean,
-      running_var,
-      use_input_stats,
-      momentum,
-      eps);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
diff --git a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
index b00ab6091b2e4c9267a9446d93d6644b0dfec23b..c25c31599b68bcf1ea88fc5227561448cf10f19f 100644
--- a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -36,8 +35,7 @@ Tensor& inverse_out_npu(
 }
 
 Tensor inverse_npu(const Tensor& self) {
-  Tensor result = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self);
 
   inverse_out_npu(result, self);
 
diff --git a/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp
index 41e5f286e7983bb681e62d57a0eeb3c11e0ed8a5..41ed93cd7185c012e50ecef2c67a54cc0c2ccd71 100644
--- a/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/L1LossBackwardKernelNpu.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp b/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp
index f017aa6d23cc0162f5d7d761625ca5c45c0a6487..0e4ca4176dc8163513eb7a99f2b99fda1d35c823 100644
--- a/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/L1lossKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp
index a4e925d112cbe025675502e570db3bc804dff9a2..88ad478bd8b658c5f503e481e51b6c5823a430d3 100644
--- a/src/aten/src/ATen/native/npu/LeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LeKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp
index 6625add2c27f85f968a79f645028e4e6b9a1c334..7ae583aea73d60518099dd32c479071341835a7e 100644
--- a/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LeakyReluBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -27,13 +26,12 @@ Tensor leaky_relu_backward_out_npu(
     const Tensor& self,
     Scalar negval,
     bool is_result) {
-  float negvalValue = CalcuOpUtil::get_scalar_float_value(negval);
   OpCommand cmd;
   cmd.Name("LeakyReluGrad")
       .Input(grad_output)
       .Input(self)
       .Output(result)
-      .Attr("negative_slope", negvalValue)
+      .Attr("negative_slope", negval)
       .Run();
   return result;
 }
@@ -43,14 +41,7 @@ Tensor leaky_relu_backward_npu(
     const Tensor& self,
     Scalar negval,
     bool is_result) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   leaky_relu_backward_out_npu(result, grad_output, self, negval, is_result);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp b/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp
index 4e04514bcc626259cb4079ada721f64ddf5a191d..ff1f6ed40fec4864d87982c3fbfbe4f70045e543 100644
--- a/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LeakyReluKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -33,23 +32,14 @@ Tensor& leaky_relu_out_npu(Tensor& result, const Tensor& self, Scalar negval) {
 }
 
 Tensor leaky_relu_npu(const Tensor& self, Scalar negval) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   leaky_relu_out_npu(result, self, negval);
   return result;
 }
 
 Tensor& leaky_relu_npu_(Tensor& self, Scalar neg_val) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = leaky_relu_out_npu(contiguousSelf, contiguousSelf, neg_val);
diff --git a/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp
index a9fb4097c339451e775e960535ffedecbd4f15f1..13ca2ae5ac76c4b465c5c88533b2b48e24bb28c7 100644
--- a/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LinspaceKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp b/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6fa73df052424dc54f6d16b0fe709f3309dae751
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/Log10KernelNpu.cpp
@@ -0,0 +1,65 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& log10_out_npu_nocheck(Tensor& result, const Tensor& self) { // Runs the "Log" operator on self with base 10, writing into result; no validation of result is done here.
+  OpCommand cmd;
+  cmd.Name("Log")
+      .Input(self)
+      .Output(result)
+      .Attr("base", (float)10.0) // base 10 is what turns the generic "Log" operator into log10
+      .Attr("scale", (float)1.0) // NOTE(review): presumably a multiplier applied to the input before the log -- confirm against the operator spec
+      .Attr("shift", (float)0.0) // NOTE(review): presumably an offset added to the input before the log -- confirm against the operator spec
+      .Run();
+
+  return result; // same reference that was passed in, so calls can be chained
+}
+
+Tensor& log10_out_npu(Tensor& result, const Tensor& self) { // Out-variant of log10: prepares result via CheckOut, then runs the kernel through OpPipeWithDefinedOut.
+  OpPreparation::CheckOut( // NOTE(review): presumably validates/adjusts result against self (third argument) -- confirm in OpPreparation
+      {self},
+      result,
+      self);
+
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self}, {result}) // inputs {self} and outputs {result} are handed to the pipe's memory check
+   .Func([&self](Tensor& result){log10_out_npu_nocheck(result, self);}) // lambda's result parameter shadows the outer one; self is captured by reference
+   .Call(result); // returns whatever the pipe's Call(result) yields as Tensor&
+}
+
+Tensor log10_npu(const Tensor& self) {
+  // Out-of-place log10: the output comes from OpPreparation::ApplyTensor(self) (presumably allocated to match self -- TODO confirm).
+  Tensor result = OpPreparation::ApplyTensor(self);
+
+  // Run the kernel directly through the nocheck path; CheckOut and the memory check of log10_out_npu are not invoked here.
+  log10_out_npu_nocheck(result, self);
+
+  return result;
+}
+
+Tensor& log10_npu_(Tensor& self) { // In-place log10: self is used as both the input and the output tensor.
+  log10_out_npu(self, self); // reuses the out-variant, so its CheckOut and CheckMemory steps also run for the in-place call
+
+  return self; // the caller's tensor, after log10_out_npu has been run on it
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp b/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp
index 8c44547f7b53a7d76b73371fb6ee87f553d5cd78..57e9a2b45ff24774034142037d13d56df4602389 100644
--- a/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Log1pKernelNpu.cpp
@@ -29,13 +29,7 @@ Tensor& log1p_out_npu(Tensor& result, const Tensor& self){
 }
 
 Tensor log1p_npu(const Tensor& self) {
-  //calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  //construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(outputSize,
-    self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   //calculate the output result of the NPU
   log1p_out_npu(result, self);
 
@@ -43,9 +37,7 @@ Tensor log1p_npu(const Tensor& self) {
 }
 
 Tensor& log1p_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp b/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp
index 6eff7033e140e362b1cef373807fad41a6911656..11e83eb11734a933038369ddd9f398d4ee59ae5a 100644
--- a/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Log2KernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -35,24 +34,13 @@ Tensor& log2_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor log2_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result =  OpPreparation::ApplyTensor(self);
   log2_out_npu(result, self);
-
   return result;
 }
 
 Tensor& log2_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = log2_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp
index 1547cc6314cb696b920573176f56c4405c4566e2..89409d80f542257f85c859cf35e24fea1806a7e9 100644
--- a/src/aten/src/ATen/native/npu/LogKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogKernelNpu.cpp
@@ -21,16 +21,13 @@ namespace native {
 using namespace at::native::npu;
 
 Tensor& log_out_npu_nocheck(Tensor& result, const Tensor& self) {
-  float baseValue = CalcuOpUtil::get_scalar_float_value(-1);
-  float scaleValue = CalcuOpUtil::get_scalar_float_value(1);
-  float shiftValue = CalcuOpUtil::get_scalar_float_value(0);
   OpCommand cmd;
   cmd.Name("Log")
       .Input(self)
       .Output(result)
-      .Attr("base", baseValue)
-      .Attr("scale", scaleValue)
-      .Attr("shift", shiftValue)
+      .Attr("base", -1.0f)
+      .Attr("scale", 1.0f)
+      .Attr("shift", 0.0f)
       .Run();
 
   return result;
@@ -40,9 +37,7 @@ Tensor& log_out_npu(Tensor& result, const Tensor& self) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
diff --git a/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp
index 391feef2dc201f06fc6fc48f149057d431c6a837..71373bffaf7a09bac16c63561677d8e025d56e73 100644
--- a/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogSigmoidKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -33,11 +32,8 @@ tuple<Tensor&, Tensor&> log_sigmoid_forward_out_npu(
 }
 
 tuple<Tensor, Tensor> log_sigmoid_forward_npu(const Tensor& self) {
-  // construct the output tensor of the NPU
-  Tensor output = at::empty_with_format(
-        self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor output = OpPreparation::ApplyTensor(self);
   Tensor buffer = at::empty({0}, self.options());
-
   // calculate the output result of the NPU
   log_sigmoid_forward_out_npu(output, buffer, self);
   return tuple<Tensor, Tensor>(output, buffer);
diff --git a/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp
index e52d27c904414c876be3f91fc60d2bd52ac3570e..95b65df6c9d9f425906e766a79c4387775e3eb54 100644
--- a/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogSoftmaxBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -27,16 +26,8 @@ Tensor _log_softmax_backward_npu(
     int64_t dim,
     const Tensor& self) {
   SmallVector<int64_t, N> dimList = {dim};
-  // calculate the output size
-  auto outputSize = input_same_output_size(grad_output);
+  Tensor grad_input = OpPreparation::ApplyTensor(grad_output);
 
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-      outputSize,
-      grad_output.options(),
-      CalcuOpUtil::get_tensor_npu_format(grad_output));
-
-  // calculate the output result of the NPU
   OpCommand cmd;
   cmd.Name("LogSoftmaxGrad")
       .Input(grad_output)
diff --git a/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp
index 2a41f52ee5524b4a05eeddc892e0bb387699551f..94c653e3c0e7b40b8dc7210a6c9970199c520894 100644
--- a/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogSoftmaxKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include<ATen/NamedTensorUtils.h>
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp
index fecc1154825906dff0777edbcc07dac7c06215ef..24dc1c437cb2af7220c3d7ef73fa39787c119b3f 100644
--- a/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogSpaceKernelNpu.cpp
@@ -40,8 +40,6 @@ Tensor& logspace_out_npu(
     inputs = at::arange(0, steps, at::device(at::kNPU).dtype(at::kFloat));
   }
 
-  float startAttr = CalcuOpUtil::get_scalar_float_value(start);
-  float endAttr = CalcuOpUtil::get_scalar_float_value(end);
   int64_t dtype = 0;
   if (result.scalar_type() == at::ScalarType::Half) {
     dtype = 0;
@@ -55,8 +53,8 @@ Tensor& logspace_out_npu(
   cmd.Name("LogSpaceD")
       .Input(inputs)
       .Output(result)
-      .Attr("start", startAttr)
-      .Attr("end", endAttr)
+      .Attr("start", start)
+      .Attr("end", end)
       .Attr("steps", steps)
       .Attr("base", static_cast<float>(base))
       .Attr("dtype", dtype)
diff --git a/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp
index 65d807c74478e6a4273d8fbef70add67a0feb489..c4ad2a61518550b21135a3a91a39559886364d9e 100644
--- a/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogicalNotKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -49,15 +48,8 @@ Tensor logical_not_npu(const Tensor& self) {
 }
 
 Tensor& logical_not_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
-  Tensor result = at::empty_with_format(
-      self.sizes(),
-      self.options().dtype(ScalarType::Byte),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
+  OpPreparation::CheckMemory({self}, {self});
+  Tensor result = OpPreparation::ApplyTensor(self, self.options().dtype(ScalarType::Byte));
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     logical_not_out_npu(result, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp
index 26e8490ac8c54ae5df0e945951fddc120b861ab6..c158f0309e2672a911e3144e7e302fbce01bc82b 100644
--- a/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LstmKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp
index fae3afb2fc5a53a52687c9ba37203800ac56170a..8e2ab7095eea9d592b3982d2c34c2d9f945af094 100644
--- a/src/aten/src/ATen/native/npu/LtKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LtKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp
index 36f73d8c40834e9a7596f03ed18f12a6ec58cefa..e27c68e940dfee6af9651b1062b488133074a61c 100644
--- a/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MaskedFillKernelNpu.cpp
@@ -14,7 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -80,10 +80,7 @@ Tensor& masked_fill_out_npu(Tensor& result, const Tensor& self, const Tensor& ma
 }
 
 Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, const Tensor& value) {
-  SmallVector<Tensor, N> inputs = {self, mask, value};
-  SmallVector<Tensor, N> outputs = {self};
-
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, mask, value}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = masked_fill_out_npu(contiguousSelf, contiguousSelf, mask, value);
@@ -95,10 +92,7 @@ Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, const Tensor& value)
 }
 
 Tensor& masked_fill_npu_(Tensor& self, const Tensor& mask, Scalar value) {
-  SmallVector<Tensor, N> inputs = {self, mask};
-  SmallVector<Tensor, N> outputs = {self};
-
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, mask}, {self});
 
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..269c77c9f904e1d8200ab13b6a8873e8cc14082a
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/MaskedFillRangeKernelNpu.cpp
@@ -0,0 +1,84 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+// Validates the arguments of masked_fill_range_npu. Raises through TORCH_CHECK
+// unless all of the following hold:
+//   - axis lies in [-self.dim(), self.dim() - 1];
+//   - start is 2-D and start.sizes() == end.sizes();
+//   - start.size(0) == value.size(0);
+//   - value and self have the same scalar type.
+void mask_fill_range_check(
+    const Tensor& self,
+    const Tensor& start,
+    const Tensor& end,
+    const Tensor& value,
+    int64_t axis){
+  int64_t x_dim = self.dim();
+  int64_t min = -x_dim;
+  int64_t max = x_dim - 1;
+  TORCH_CHECK(
+      !(axis < min || axis > max),
+      "axis overflows the range, expected in range [",
+      min,
+      ", ",
+      max,
+      "]");
+  TORCH_CHECK(
+      start.ndimension() == 2 && start.sizes() == end.sizes(),
+      "Expected a 2D start tensor whose sizes() equal end's sizes()");
+  TORCH_CHECK(
+      start.size(0) == value.size(0),
+      "Expected value.size(0) to equal start.size(0)");
+  TORCH_CHECK(
+      self.scalar_type() == value.scalar_type(),
+      "value dtype should be equal to self dtype, but value dtype is ",
+      value.scalar_type(),
+      " and self dtype is ",
+      self.scalar_type());
+}
+
+// Checks the arguments with mask_fill_range_check, then runs the
+// "MaskedFillRange" NPU operator with inputs (self, start, end, value) and
+// attribute "axis". The output is written to a new tensor obtained from
+// OpPreparation::ApplyTensor(self), which is returned.
+Tensor masked_fill_range_npu(
+    const Tensor& self,
+    const Tensor& start,
+    const Tensor& end,
+    const Tensor& value,
+    int64_t axis){
+  mask_fill_range_check(self, start, end, value, axis);
+  Tensor result = OpPreparation::ApplyTensor(self);
+  OpCommand cmd;
+  cmd.Name("MaskedFillRange")
+      .Input(self)
+      .Input(start)
+      .Input(end)
+      .Input(value)
+      .Output(result)
+      .Attr("axis", axis)
+      .Run();
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp
index 4bf48e9da064ba44dca448b7410ea627670f3d1b..1f3c8cd185110e7a3dce8ce8bb9f9c33315ef584 100644
--- a/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MaxKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
index b887f5efca7ba98cc8265def17043ea765c76989..d05fc30726260ed549e35d9274ff5913593be062 100644
--- a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp b/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp
index 0bd31892938c51d493a11d2af18d5292c78cb6c3..7c9da5ccce796b29a9fd7bd216a16b3745f27985 100644
--- a/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MeanKernelNpu.cpp
@@ -14,9 +14,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "c10/npu/OptionsManager.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp b/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp
index 7f77c92962d8b03407f2c17053060a2d885c7d08..3131a6148ed77f2b9b3016d40b58689726fb79fa 100644
--- a/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MedianKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
index 9ec7d29a0aa323b4b94b945b0cb414cba94005e7..821424393afeddddd346d448c3656a6d6b1c671c 100644
--- a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
@@ -15,8 +15,8 @@
 // limitations under the License.
 
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp
index f821d7a4f2d595b8657be2ba23ad116a34e29723..6b7e9358300bbd04336199d685b56de7a358a0f9 100644
--- a/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MseLossBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -55,17 +54,12 @@ Tensor mse_loss_backward_npu(
   const Tensor& self,
   const Tensor& target,
   int64_t reduction) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
   auto grad_out = grad_output.contiguous();
   if (grad_out.dim() == 0) {
     grad_out.view(1);
   }
 
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor grad_input = OpPreparation::ApplyTensor(self);
   
   mse_loss_backward_out_npu(
     grad_input,
diff --git a/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp
index d73339bab795a4643fc57db19650326c4f709f78..8669010346c89e2a7521ab0f0fce8b53629e207f 100644
--- a/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MseLossKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -66,9 +65,8 @@ Tensor& mse_loss_out_npu(
   }  
   OpPreparation::CheckOut(
     {self, target}, 
-    result, 
-    CalcuOpUtil::get_tensor_npu_format(self), 
-    self.scalar_type(), 
+    result,
+    self, 
     outputSize);
   mse_loss_out_npu_nocheck(result, self, target, reduction);
   return result;
@@ -85,9 +83,7 @@ Tensor mse_loss_npu(
   }
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self,outputSize);
   // calculate the output result of the NPU
   mse_loss_out_npu_nocheck(result, self, target, reduction);
 
diff --git a/src/aten/src/ATen/native/npu/MulKernelNpu.cpp b/src/aten/src/ATen/native/npu/MulKernelNpu.cpp
index 8f074001652509301be20560e1fb8ee96fd6791b..c009de29f81b8bec1961ca4c6a5fbd0183820de3 100644
--- a/src/aten/src/ATen/native/npu/MulKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MulKernelNpu.cpp
@@ -15,8 +15,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include <c10/npu/OptionsManager.h>
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
index 3ee01f50cc04525ca90bd6f94eb0e1063a9393e2..3a09b43910915830761955d4c3074d15b1965120 100644
--- a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/common/InnerNpuNativeFunction.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
index b42a3fa5acb85a80c8795a64802ec932e1cbfada..4e4c377b7a1ca69228f7e8fa50b401477e7d87a2 100644
--- a/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NeKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -125,9 +126,7 @@ Tensor ne_npu(const Tensor& self, Scalar other) {
 Tensor& ne_npu_(Tensor& self, const Tensor& other) {
   OpPreparation::CastBackToOriFormat(self);
   OpPreparation::CastBackToOriFormat(other);
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, other}, {self});
 
   Tensor result = at::empty_with_format(
     self.sizes(),
@@ -149,10 +148,7 @@ Tensor& ne_npu_(Tensor& self, const Tensor& other) {
 
 Tensor& ne_npu_(Tensor& self, Scalar other) {
   OpPreparation::CastBackToOriFormat(self);
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   Tensor result = at::empty_with_format(
     self.sizes(),
     self.options().dtype(ScalarType::Byte),
diff --git a/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp b/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp
index 3144646899386b8f2f48baa0c85c6af1b0f9a627..393970fc1d5329307541283cd48aee52599b98fa 100644
--- a/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NonzeroKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
index 005e993eb867b7d06a238ffc78ed856553aa757b..8308e4763aa4923b358d9ac4f23594b85cc3a159 100644
--- a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "climits"
    
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp
index b0722df7324e22ad2c84470949c27622d74be221..158b7c74a1de4e0ecc9cac144b37ec951bf5a9a7 100644
--- a/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NormalKernelNpu.cpp
@@ -131,14 +131,7 @@ Tensor normal_npu(
     const Tensor& mean, 
     double std, 
     Generator* generator) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(mean);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, mean.options(), CalcuOpUtil::get_tensor_npu_format(mean));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(mean);
   normal_out_npu(result, mean, std, generator);
 
   return result;
@@ -148,14 +141,7 @@ Tensor normal_npu(
     double mean, 
     const Tensor& std, 
     Generator* generator) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(std);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, std.options(), CalcuOpUtil::get_tensor_npu_format(std));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(std);
   normal_out_npu(result, mean, std, generator);
 
   return result;
@@ -165,14 +151,7 @@ Tensor normal_npu(
     const Tensor& mean, 
     const Tensor& std, 
     Generator* generator) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(mean);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, mean.options(), CalcuOpUtil::get_tensor_npu_format(mean));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(mean);
   normal_out_npu(result, mean, std, generator);
 
   return result;
@@ -199,10 +178,7 @@ Tensor& normal_npu_(
     double mean,
     double std,
     Generator* generator) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = normal_out_npu(contiguousSelf, mean, std, contiguousSelf.sizes(), generator);
diff --git a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
index 35c98020833dc3abc6e5bb9203c7c9643d234b64..d92440dec002580786506999774f36fdfb2464aa 100644
--- a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
@@ -56,10 +56,10 @@ Tensor one_hot_npu1(const Tensor& self, int64_t num_classes) {
   auto outputSize = array_to_small_vector(self.sizes());
   outputSize.emplace_back(depth);
 
-  Tensor result = at::empty_with_format(
+  Tensor result = OpPreparation::ApplyTensor(
       outputSize,
       self.options().dtype(ScalarType::Int),
-      CalcuOpUtil::get_tensor_npu_format(self));
+      self);
 
   SmallVector<int64_t, N> depthList = {depth};
   
diff --git a/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp b/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp
index f971c598365f6312220337ba3c23cb8971257bff..7f56a61f72d863987c00f653829890d9ceca3d31 100644
--- a/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/OnesLikeKernelNpu.cpp
@@ -29,13 +29,8 @@ Tensor ones_like_npu(
     auto result = at::empty_like(self, options, optional_memory_format);
     return result.fill_(1.);
   }
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, options, CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, options);
   // calculate the output result of the NPUc
   return result.one_();
 }
diff --git a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
index 43dacfa066681c6126d7966f2c256449ce253567..d2d338f12527d66c3a3dd2d6eb413621981dbee3 100644
--- a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
@@ -33,19 +33,13 @@ Tensor& pad_out_npu(
       .Input(paddingsVector)
       .Output(output)
       .Run();
-
   return output;
 }
 
 Tensor pad_npu(const Tensor& input, IntArrayRef paddings) {
-  // calculate the output size
   auto outputSize = pad_npu_output_size(input, paddings);
-  
-  // construct the output tensor of the NPU
-  Tensor output = at::empty_with_format(outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input));
-
+  Tensor output = OpPreparation::ApplyTensor(input, outputSize);
   pad_out_npu(output, input, paddings);
-
   return output;
 }
 
diff --git a/src/aten/src/ATen/native/npu/PowKernelNpu.cpp b/src/aten/src/ATen/native/npu/PowKernelNpu.cpp
index af4d14f31140df71ed31cb4cc97c6bdab8c3503d..01b31ebbc24a7be53a98c65a0280b607853c3c78 100644
--- a/src/aten/src/ATen/native/npu/PowKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PowKernelNpu.cpp
@@ -57,28 +57,13 @@ Tensor& pow_out_npu(Tensor& result, Scalar self, const Tensor& exp) {
 Tensor pow_npu(const Tensor& self, const Tensor& exp) {
   // calculate the output size
   auto outputSize = broadcast_ops_npu_output_size(self, exp);
-  
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-         outputSize,    
-         self.options(), 
-         CalcuOpUtil::get_tensor_npu_format(self));
-  
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   pow_out_npu(result, self, exp);
   return result;
 }
 
 Tensor pow_npu(const Tensor& self, Scalar exp) {
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-         outputSize, 
-         self.options(), 
-         CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   pow_out_npu(result, self, exp);
   return result;
 }
@@ -95,10 +80,7 @@ Tensor pow_npu(Scalar self, const Tensor& exp) {
 }
 
 Tensor& pow_npu_(Tensor& self, const Tensor& exp) {
-  SmallVector<Tensor, N> inputs = {self, exp};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self, exp}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     pow_out_npu(contiguousSelf, contiguousSelf, exp);
@@ -111,10 +93,7 @@ Tensor& pow_npu_(Tensor& self, const Tensor& exp) {
 }
 
 Tensor& pow_npu_(Tensor& self, Scalar exp) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     pow_out_npu(contiguousSelf, contiguousSelf, exp);
diff --git a/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp
index c3093b73b298e8d337e2f78f8960b071ec2f81da..14341ed281974b40ec6bc89fc90a5e06c251b1b5 100644
--- a/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PreluBackwardKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -41,14 +40,9 @@ tuple<Tensor, Tensor> prelu_backward_npu(
     const Tensor& grad_output, 
     const Tensor& self, 
     const Tensor& weight) {
-  // calculate the output size
-  auto outputSizes1 = input_same_output_size(self);
-  auto outputSizes2 = input_same_output_size(weight);  
   // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-    outputSizes1, self.options(), CalcuOpUtil::get_tensor_npu_format(self));  
-  Tensor grad_weight = at::empty_with_format(
-    outputSizes2, weight.options(), CalcuOpUtil::get_tensor_npu_format(weight));
+  Tensor grad_input = OpPreparation::ApplyTensor(self);
+  Tensor grad_weight = OpPreparation::ApplyTensor(weight);
   // calculate the output result of the NPU
   prelu_backward_out_npu(grad_input, grad_weight, grad_output, self, weight);
   return std::tie<Tensor, Tensor>(grad_input, grad_weight);
diff --git a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
index ff9afacad160cb5dace14df3a1ae813cd57b0fb8..52b248ae36364df35ba901f8a936b037b7d6376f 100644
--- a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -25,8 +24,7 @@ Tensor prelu_npu(const Tensor& self, const Tensor& weight_) {
 
   // calculate the output size
   auto outputSize = input_same_output_size(self);
-  Tensor result = at::empty_with_format(
-  outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input));
+  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
   
   OpCommand cmd;
   cmd.Name("PRelu")
diff --git a/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp b/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp
index fd45dc14b60d3fdfff6661bea6c26880b9b85d03..5d256bcc7d52a9cdece25eaccdc343a323e75638 100644
--- a/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ProdKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -73,14 +74,18 @@ Tensor& prod_out_npu(
   // fp16 transform：fp32 for precise
   if (self.scalar_type() == ScalarType::Half) {
     Tensor result_tmp  = prod_npu(self, dim, keepdim, dtype);
-    OpPreparation::CheckOut({result_tmp}, result, result_tmp);
+    // check result against result_tmp's dtype and sizes, using ND format
+    OpPreparation::CheckOut(
+        {result_tmp},
+        result,
+        ACL_FORMAT_ND,
+        result_tmp.scalar_type(), result_tmp.sizes());
     result.copy_(result_tmp);
     return result;
   } else {
     auto outputSize = prod_npu_output_size(self, dim, keepdim);
     ScalarType dstType = dtype.has_value() ? dtype.value() : self.scalar_type();
-    int64_t npu_format = calculate_prod_output_format(self, outputSize);
-    OpPreparation::CheckOut({self}, result, npu_format, dstType, outputSize);
+    OpPreparation::CheckOut({self}, result, ACL_FORMAT_ND, dstType, outputSize);
 
     prod_out_npu_nocheck(result, self, {dim}, keepdim, dtype);
     return result;
diff --git a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
index 4621b5e05fdd4c7966e74b45cbf7de4b01311e5b..2875adc7aff695009560413b3f9e5a38e7cfb3ea 100644
--- a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -25,12 +24,8 @@ Tensor ptiou_npu(
     const Tensor& bboxes,
     const Tensor& gtboxes,
     int64_t mode) {
-  // calculate the output size
   auto outputSize = {gtboxes.size(0), bboxes.size(0)};
-
-  // construct the output tensor of the NPU
-  Tensor overlap = at::empty_with_format(outputSize, bboxes.options(), CalcuOpUtil::get_tensor_npu_format(bboxes));
-
+  Tensor overlap = OpPreparation::ApplyTensor(bboxes, outputSize);
   string modeStr = "iou";
   if (mode == 1) {
     modeStr = "iof";
diff --git a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
index 141a8faf4dabdf387fe2127c58371839b20ebc6e..a1212802cff3b79915fd17c36ea4e7eac94ebfca 100644
--- a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
@@ -76,14 +76,12 @@ std::tuple<Tensor&, Tensor&> qr_out_npu(
  OpPreparation::CheckOut(
      {self},
      Q,
-     CalcuOpUtil::get_tensor_npu_format(self),
-     self.scalar_type(),
+     self,
      std::get<0>(sizes));
   OpPreparation::CheckOut(
      {self},
      R,
-     CalcuOpUtil::get_tensor_npu_format(self),
-     self.scalar_type(),
+     self,
      std::get<1>(sizes));
   return qr_out_npu_nocheck(Q, R, self, some);
 }
diff --git a/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp b/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp
index ec17afdd76204a2f031647ffdd656e8cc85af450..4e05a90658c384cc29c4a63c2cb2358b30bb4957 100644
--- a/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RandomChoiceWithMaskKernelNpu.cpp
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -33,13 +33,8 @@ std::tuple<Tensor, Tensor> random_choice_with_mask_npu(
       self.dim());
   TORCH_CHECK(count > 0, "The count must greater than 0, but get", count);
 
-  Tensor result = at::empty_with_format(
-      {count, self.dim()},
-      self.options().dtype(kInt),
-      CalcuOpUtil::get_tensor_npu_format(self));
-  Tensor mask = at::empty_with_format(
-      {count}, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor({count, self.dim()}, self.options().dtype(kInt), self);
+  Tensor mask = OpPreparation::ApplyTensor(self, {count});
   OpCommand cmd;
   cmd.Name("RandomChoiceWithMask")
       .Input(self)
diff --git a/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp b/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp
index ae77cd8eaf9b2de87f1615aa7a06c7233aa48e97..3445ba27e8e0a795c0197796d457ca8ff08836c0 100644
--- a/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RandomKernelNpu.cpp
@@ -15,8 +15,7 @@
 // limitations under the License.
 
 #include <limits.h>
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -42,9 +41,7 @@ Tensor& random_npu_(Tensor& self, int64_t from, int64_t to, Generator* gen_) {
     selfCopy = self.npu_dtype_cast(ScalarType::Float);
   }
 
-  SmallVector<Tensor, N> inputs = {selfCopy};
-  SmallVector<Tensor, N> outputs = {selfCopy};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({selfCopy}, {selfCopy});
 
   if (!NpuUtils::check_match(&selfCopy)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(selfCopy);
diff --git a/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp b/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp
index bb063be6586a0288bcdf160fce4c477227b90916..4b638919a88b0c48db9cc5a535c3d9f344bb28a3 100644
--- a/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RangeKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp
index fff9bcb7010a35ac9ec26198eab9a2fd37b7b4a7..ed546a6685669bd6f3bf59b7285d52ba9fea24db 100644
--- a/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ReciprocalKernelNpu.cpp
@@ -34,9 +34,7 @@ Tensor& reciprocal_out_npu(Tensor& result, const Tensor& self) {
   OpPreparation::CheckOut(
       {self},
       result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      self.sizes());
+      self);
 
   OpPipeWithDefinedOut pipe;
   return pipe.CheckMemory({self}, {result})
@@ -46,9 +44,7 @@ Tensor& reciprocal_out_npu(Tensor& result, const Tensor& self) {
 
 Tensor reciprocal_npu(const Tensor& self) {
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   reciprocal_out_npu_nocheck(result, self);
 
diff --git a/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp b/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp
index 54bd128ef8b417e68a4282d71dc392dc609dd348..62f91fd927a542df6366aa4059b4af75af37e72c 100644
--- a/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RemainderKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp b/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp
index eba096f9c3427b8aa97e2800026fcb24111f59eb..4c41f0dc912229c7ba7ac4dabbcd77161b4766c5 100644
--- a/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RepeatKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "c10/npu/OptionsManager.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/RollKernelNpu.cpp b/src/aten/src/ATen/native/npu/RollKernelNpu.cpp
index b392679e4a858021c49cdbaaf9f916fbfa573d13..4def38d46343f8f514a9cad5aeb0182e4ea92757 100644
--- a/src/aten/src/ATen/native/npu/RollKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RollKernelNpu.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at { 
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp b/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp
index e708ef67b6c216fc2093fe883956902ae518c0b8..996968cc691819032297f22ebd0ab5afc33899c3 100644
--- a/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RoundKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -39,16 +38,8 @@ Tensor& round_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor round_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   round_out_npu_nocheck(result, self);
-
   return result;
 }
 
diff --git a/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp b/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp
index e0c477abb0585c6109536b9e68bd4e70573c1f78..462e35ef87a714e5387aca961bef04de7c0d02f7 100644
--- a/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RsqrtKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -39,14 +38,7 @@ Tensor& rsqrt_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor rsqrt_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   rsqrt_out_npu_nocheck(result, self);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp b/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp
index e5978640872c15cbebe175ccf6e49181eef5a422..cb2fa4c663e7aadb9f0199744fabffbba433246f 100644
--- a/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/RsubKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp
index c7a0e166c0d7f42ec8fede554c131eb216f35644..86706651710f7fc91b6af77058b7380a675c8d0a 100644
--- a/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ScatterAddKernelNpu.cpp
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include <ATen/NamedTensorUtils.h>
 
 namespace at {
 namespace native {
@@ -51,11 +51,7 @@ Tensor& scatter_add_npu_(
     int64_t dim,
     const Tensor& index,
     const Tensor& src) {
-
-  SmallVector<Tensor, N> inputs = {self, index, src};
-  SmallVector<Tensor, N> outputs = {self};
-
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self, index, src}, {self});
 
   ScalarType selfType = self.scalar_type();
   Tensor selfFp32 = self;
diff --git a/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp
index 30ddb28d6ef2cab91e1c19e0238dd930330d3a8b..ad434470db778052830eef597dc33bd5a6756fff 100644
--- a/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ScatterKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -73,15 +72,12 @@ Tensor& scatter_npu_(
     index = index.npu_dtype_cast(ScalarType::Float);
   }
 
-  // get float from scalar
-  float src_value = CalcuOpUtil::get_scalar_float_value(src);
-
   OpCommand cmd;
   cmd.Name("ScatterScalar")
      .Input(index)
      .Output(self)
      .Attr("dim", dim)
-     .Attr("value", src_value)
+     .Attr("value", src)
      .Run();
 
   if(self.scalar_type() != selfType){
diff --git a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
index af0e46a84af3366bc87e25a56193fcbfac06d778..3485b0608a1a9d1b591dcba3fc1466e5ed388410 100644
--- a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -40,9 +39,7 @@ Tensor& scatter_out_npu(
 }
 
 Tensor scatter_npu(const Tensor& self, const Tensor& indices, const Tensor& updates, int64_t dim) {
-  Tensor outputs = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor outputs = OpPreparation::ApplyTensor(self);
   scatter_out_npu(outputs, self, indices, updates, dim);
 
   return outputs;
diff --git a/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp
index ad20aa78246042201366654058ba4de6a5cc9fe7..f6dc97b52a65eb9c9bac79edd3949dcfcf69921a 100644
--- a/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SeluKernelNpu.cpp
@@ -40,10 +40,7 @@ Tensor selu_npu(const Tensor& self) {
 }
 
 Tensor& selu_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = selu_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp
index d56e4c0351280b52b625cfac2b2e46b9cf577c99..a538a6182267697cb7bec019908edd0e4dfb0217 100644
--- a/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SigmoidBackwardKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp b/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp
index 785bda4421de7abce8a35cea907721c41f2bfe19..132c14639cb55c85ca783c55f8385e603cc87d27 100644
--- a/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SigmoidKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SignKernelNpu.cpp b/src/aten/src/ATen/native/npu/SignKernelNpu.cpp
index 1c2aef9b287c5261494517ca55e04f101b65979c..5f49336fc80c972d4a36319db8df22ac07c07335 100644
--- a/src/aten/src/ATen/native/npu/SignKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SignKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,13 +31,8 @@ Tensor& sign_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor sign_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   sign_out_npu(result, self);
 
@@ -46,10 +40,7 @@ Tensor sign_npu(const Tensor& self) {
 }
 
 Tensor& sign_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = sign_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp b/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp
index 6bbc4464c6c5c959669e97b38078eaeab7f3636a..a01380fb382a57746bbc31bbaf7f4faf2c72e476 100644
--- a/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SinhKernelNpu.cpp
@@ -40,10 +40,7 @@ Tensor sinh_npu(const Tensor& self) {
 }
 
 Tensor& sinh_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = sinh_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
index 92b223417ad1bed35b80f592864c2e4724d39880..4779f9418561b7339feeba7e8100bb9d41652e4e 100644
--- a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp b/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp
index 696bf6f83b8de69c71e711ce6688c248c5fb711f..4a269947501b1b1d5cb7f478736934b0e2110948 100644
--- a/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SmoothL1LossKernelNpu.cpp
@@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -69,8 +69,7 @@ Tensor smooth_l1_loss_npu(
   auto outputSize = smooth_l1_loss_npu_output_size(self, target, reduction);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
 
   // calculate the output result of the NPU
   smooth_l1_loss_out_npu_nocheck(result, self, target, reduction);
diff --git a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
index b27d36380fe6ad5762b79c059708d12779f79fcb..60a6b4cbf43406d9562de8f3059ff6e1e1859308 100644
--- a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -52,12 +51,7 @@ Tensor soft_margin_loss_backward_npu(
     const Tensor& input,
     const Tensor& target,
     int64_t reduction) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(input);
-
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-      outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input));
+  Tensor grad_input = OpPreparation::ApplyTensor(input);
   soft_margin_loss_backward_out_npu(
       grad_input, grad_output, input, target, reduction);
   return grad_input;
diff --git a/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp
index c4ca2324de37789f84937a4d542232314137daa0..6db238b5b45041db5ce3dab6539ef3a726a20d47 100644
--- a/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SoftmaxBackwardKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp
index 42d2b41ebf9db46c58066e717f6f1d6230b8faf5..cf76b1c9a0e010721c444bc160c02ee372ae0240 100644
--- a/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SoftmaxKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SortKernelNpu.cpp b/src/aten/src/ATen/native/npu/SortKernelNpu.cpp
index 92648200c0319d5291117720acbc282e8585bab1..cae5fe981dfa683cc0358c3c4a350b3b483963b0 100644
--- a/src/aten/src/ATen/native/npu/SortKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SortKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp b/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp
index ce51176358b0d02db526af0511599f1d4edf60b8..924d3e511e8169eaf583a6b86b2355fc292cbe8d 100644
--- a/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SortWithoutIndicesKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -43,8 +42,6 @@ Tensor sort_without_indices_npu(
     bool descending) {
-  auto outputSize = input_same_output_size(self);
-
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  // result is allocated from self; the separate outputSize local is no longer needed
+  Tensor result = OpPreparation::ApplyTensor(self);
   
   sort_without_indices_out_npu(result, self, dim, descending);
   
diff --git a/src/aten/src/ATen/native/npu/StackKernelNpu.cpp b/src/aten/src/ATen/native/npu/StackKernelNpu.cpp
index 4a774c0aad05fdbfee6dedaf9de9e42af285aa08..16973b844e89100799877bfd9af662935669bbda 100644
--- a/src/aten/src/ATen/native/npu/StackKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/StackKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
index 7d6490ffcc7ea6354b782d082ce9be6a2c4d823d..c919f9cd027da080114d99707dc58187f159e3a9 100644
--- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
@@ -15,36 +15,26 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at { 
 namespace native {
 using namespace at::native::npu;
 
-Tensor& std_out_npu(
-    Tensor& result, 
-    const Tensor& self, 
-    DimnameList dim, 
-    bool unbiased, 
-    bool keepdim) {
-  return std_out_npu(result, self, dimnames_to_positions(self, dim), unbiased, keepdim);
-}
-
-Tensor& std_out_npu(
-    Tensor& result, 
+tuple<Tensor&, Tensor&> std_mean_out_npu_nocheck(
+    Tensor& resultStd, 
+    Tensor& resultMean, 
     const Tensor& self, 
     IntArrayRef dim, 
     bool unbiased, 
     bool keepdim) {
-  auto outputSize = std_npu_output_size(self, dim, keepdim);
-  Tensor meanResult = OpPreparation::ApplyTensor(self, std::get<1>(outputSize));
-
-  // executing the NPU operator
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
+  // executing the NPU operator 
+  if (!c10::npu::OptionsManager::CheckDynamicEnable()) { 
     OpCommand cmd;
     cmd.Name("ReduceStd")
         .Input(self)
-        .Output(result)
-        .Output(meanResult)
+        .Output(resultStd)
+        .Output(resultMean)
         .Attr("dim", dim)
         .Attr("unbiased", unbiased)
         .Attr("keepdim", keepdim)
@@ -53,25 +43,60 @@ Tensor& std_out_npu(
     OpCommand cmd1;
     cmd1.Name("ReduceMeanD")
         .Input(self)
-        .Output(meanResult)
+        .Output(resultMean)
         .Attr("axes", dim)
         .Attr("keep_dims", keepdim)
         .Run();
-    if (meanResult.dim() != 0 && keepdim == false) {
-      meanResult = meanResult.unsqueeze(dim[0]);
+    Tensor resultMeanCopy = resultMean;
+    if (resultMean.dim() != 0 && keepdim == false) {
+      // Wrap negative dims first so the sorted order is the true axis order.
+      auto dimVector = array_to_small_vector(dim);
+      for (auto& d : dimVector) { d = maybe_wrap_dim(d, self.dim()); }
+      std::sort(dimVector.begin(), dimVector.end());
+      for (const auto d : dimVector) { resultMeanCopy = resultMeanCopy.unsqueeze(d); }
     }
-    Tensor meanResult2 = meanResult.expand(self.sizes());
+    resultMeanCopy = resultMeanCopy.expand(self.sizes());
     OpCommand cmd2;
     cmd2.Name("ReduceStdWithMean")
         .Input(self)
-        .Input(meanResult2)
-        .Output(result)
+        .Input(resultMeanCopy)
+        .Output(resultStd)
         .Attr("dim", dim)
         .Attr("unbiased", unbiased)
         .Attr("keepdim", keepdim)
         .Run();
   }
 
+  return std::tie(resultStd, resultMean);
+}
+
+// Named-dimension overload: resolves `dim` to positional indices with
+// dimnames_to_positions and forwards to the IntArrayRef overload.
+Tensor& std_out_npu(
+    Tensor& result, const Tensor& self, DimnameList dim,
+    bool unbiased, bool keepdim) {
+  auto positionalDims = dimnames_to_positions(self, dim);
+  return std_out_npu(result, self, positionalDims, unbiased, keepdim);
+}
+
+// Out-variant of std over positional dims. `result` is passed through
+// OpPreparation::CheckOut against `self` and the reduced output size, then
+// std_mean_out_npu_nocheck fills it. That helper also writes a mean output,
+// which lands in a temporary tensor that is not returned.
+Tensor& std_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    IntArrayRef dim,
+    bool unbiased,
+    bool keepdim) {
+  const auto reducedSize = reduce_ops_npu_output_size(self, dim, keepdim);
+  // temporary that receives the mean output
+  Tensor scratchMean = OpPreparation::ApplyTensor(self, reducedSize);
+
+  OpPreparation::CheckOut({self}, result, self, reducedSize);
+
+  std_mean_out_npu_nocheck(result, scratchMean, self, dim, unbiased, keepdim);
+
+  return result;
+}
 
@@ -82,40 +107,21 @@ tuple<Tensor&, Tensor&> std_mean_out_npu(
     IntArrayRef dim, 
     bool unbiased, 
     bool keepdim) {
-  // executing the NPU operator 
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) { 
-    OpCommand cmd;
-    cmd.Name("ReduceStd")
-        .Input(self)
-        .Output(result1)
-        .Output(result2)
-        .Attr("dim", dim)
-        .Attr("unbiased", unbiased)
-        .Attr("keepdim", keepdim)
-        .Run();
-  } else {
-    OpCommand cmd1;
-    cmd1.Name("ReduceMeanD")
-        .Input(self)
-        .Output(result2)
-        .Attr("axes", dim)
-        .Attr("keep_dims", keepdim)
-        .Run();
-    Tensor result2_copy = result2;
-    if (result2.dim() != 0 && keepdim == false) {
-      result2_copy = result2.unsqueeze(dim[0]);
-    }
-    result2_copy = result2_copy.expand(self.sizes());
-    OpCommand cmd2;
-    cmd2.Name("ReduceStdWithMean")
-        .Input(self)
-        .Input(result2_copy)
-        .Output(result1)
-        .Attr("dim", dim)
-        .Attr("unbiased", unbiased)
-        .Attr("keepdim", keepdim)
-        .Run();
-  }
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+
+  OpPreparation::CheckOut(
+      {self}, 
+      result1, 
+      self,
+      outputSize);
+  OpPreparation::CheckOut(
+      {self}, 
+      result2, 
+      self,
+      outputSize);
+      
+  // executing the NPU operator
+  std_mean_out_npu_nocheck(result1, result2, self, dim, unbiased, keepdim);
 
   return std::tie(result1, result2);
 }
@@ -126,11 +132,11 @@ Tensor std_dim_npu(
     bool unbiased, 
     bool keepdim) {
   // calculate the output size
-  auto outputSize = std_npu_output_size(self, dim, keepdim);
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
 
   // construct the output tensor of the NPU
-  Tensor result1 = OpPreparation::ApplyTensor(self, std::get<0>(outputSize));
-  Tensor result2 = OpPreparation::ApplyTensor(self, std::get<1>(outputSize));
+  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
+  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
 
   // calculate the output result of the NPU
   std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
@@ -157,11 +163,11 @@ tuple <Tensor, Tensor> std_mean_dim_npu(
     bool unbiased, 
     bool keepdim) {
   // calculate the output size
-  auto outputSize = std_npu_output_size(self, dim, keepdim);
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
 
   // construct the output tensor of the NPU
-  Tensor result1 = OpPreparation::ApplyTensor(self, std::get<0>(outputSize));
-  Tensor result2 = OpPreparation::ApplyTensor(self, std::get<1>(outputSize));
+  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
+  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
 
   // calculate the output result of the NPU
   std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
diff --git a/src/aten/src/ATen/native/npu/SubKernelNpu.cpp b/src/aten/src/ATen/native/npu/SubKernelNpu.cpp
index 70872bda6d027933ee902325e357cb93bb8d4f4d..de4fc4676fa056e02cccd602e8f844cd2d1231db 100644
--- a/src/aten/src/ATen/native/npu/SubKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SubKernelNpu.cpp
@@ -15,8 +15,8 @@
 // limitations under the License.
 
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp b/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp
index 3af8d7df4bb761879fc5f84c971af843d2a45356..de393e901ee9f05d233edbc37cd471e23dcf9fc6 100644
--- a/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SubSampleKernelNpu.cpp
@@ -20,9 +20,7 @@ using namespace at::native::npu;
 
 Tensor sub_sample_npu(const Tensor &self, int64_t per_images,
                       double positive_fraction) {
-  Tensor result = at::empty_with_format(
-      self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   OpCommand cmd;
   cmd.Name("SubSample")
       .Input(self)
@@ -30,7 +28,6 @@ Tensor sub_sample_npu(const Tensor &self, int64_t per_images,
       .Attr("batch_size_per_images", per_images)
       .Attr("positive_fraction", (float)positive_fraction)
       .Run();
-
   return result;
 }
 
diff --git a/src/aten/src/ATen/native/npu/SumKernelNpu.cpp b/src/aten/src/ATen/native/npu/SumKernelNpu.cpp
index 739d5992bb6d949d66ea6697304d6e4b54126b35..9a8f72225f8801a2682ba7dc161358ad39633407 100644
--- a/src/aten/src/ATen/native/npu/SumKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SumKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -116,7 +117,7 @@ Tensor& sum_out_npu(
   OpPreparation::CheckOut(
       {self}, 
       result, 
-      CalcuOpUtil::get_tensor_npu_format(self),
+      ACL_FORMAT_ND,
       dstType,
       outputSize);
 
diff --git a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
index 06194c74a806b0d71bf3c1bb2363b66b27b13c24..316263404daf412bb2abb82afc7ea7b66763ad30 100644
--- a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
@@ -32,23 +32,13 @@ Tensor& tan_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor tan_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   tan_out_npu(result, self);
   return result;
 }
 
 Tensor& tan_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = tan_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp
index 5ce4d71e14f19be5aa3211fc1b2bb7a04108dc4d..2b2328ee63461383851a48b7bc6188e346f0cec7 100644
--- a/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TanhBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -45,9 +44,7 @@ Tensor& tanh_backward_out_npu(
 }
 
 Tensor tanh_backward_npu(const Tensor& grad_output, const Tensor& self) {
-  Tensor result = at::empty_with_format(
-    self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   tanh_backward_out_npu_nocheck(result, grad_output, self);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp
index 07dd8c7a13d08922ce5c58dc6b7ea87019c72d68..98bbd5a5a5f47131c5c5a83c2165022b2edf3731 100644
--- a/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TanhKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,13 +31,7 @@ Tensor& tanh_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor tanh_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self);
   // calculate the output result of the NPU
   tanh_out_npu(result, self);
 
@@ -46,10 +39,7 @@ Tensor tanh_npu(const Tensor& self) {
 }
 
 Tensor& tanh_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = tanh_out_npu(contiguousSelf, contiguousSelf);
diff --git a/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp
index 6df88d299822108da597c78ab0d0e1f198ef8bb8..ca01f2dd61ecd41e8111031f470374a1f1c479fe 100644
--- a/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ThresholdBackwardKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp b/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp
index a90f3f4ae5c362650b772e82081ad187d2334769..103b4fa043623d0bc1bebf0e9069a40306ce2896 100644
--- a/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ThresholdKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -38,14 +37,7 @@ Tensor& threshold_out_npu(
 }
 
 Tensor threshold_npu(const Tensor& self, Scalar threshold, Scalar value) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   threshold_out_npu(result, self, threshold, value);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp b/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp
index 5b6ac1f3e4bcac131f87a988fb50914d4faa2bdd..4ef0a3fc43be85036fe7bb67071c499d06a6e7da 100644
--- a/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TopKKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp b/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp
index e9e0730b7971e9be91e5fbb52012bab0f104bf86..3429850d8eaec7fb747507bd8d3706777778cea3 100644
--- a/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TransposeKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include <torch/csrc/autograd/record_function.h>
 
 namespace at {
@@ -49,14 +50,8 @@ Tensor& transpose_out_npu(
 }
 
 Tensor transpose_npu(const Tensor& self, IntArrayRef perm) {
-  // calculate the output size
   auto outputSize = transpose_npu_output_size(self, perm);
- 
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
   transpose_out_npu(result, self, perm);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
index f8faa79eb1cd8522eae395b8fa42ea4c0ca3f223..678854c4a391045571ecfa69175acda9d8ecdde1 100644
--- a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/OpTemplate.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -42,19 +41,13 @@ Tensor tril_npu(const Tensor& self, int64_t diagonal){
   };
   
   TORCH_CHECK(is_last_two_dims(), "tril require tensor should be last two dims");
-  
-  auto outputSize = input_same_output_size(selfCopy);
-  Tensor result = at::empty_with_format(outputSize,selfCopy.options(),
-                      CalcuOpUtil::get_tensor_npu_format(selfCopy));
+  Tensor result = OpPreparation::ApplyTensor(selfCopy);
   tril_out_npu(result, selfCopy, diagonal);
   return result;
 }
 
 Tensor& tril_npu_(Tensor& self, int64_t diagonal){
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};  
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-  
+  OpPreparation::CheckMemory({self}, {self});  
   self.npu_format_cast_(ACL_FORMAT_NCHW);
   if(!NpuUtils::check_match(&self)){
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
diff --git a/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp b/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp
index a9957ab8137cabeaae9c9a8ca8cf59ef77187997..6de531b25e89c6a1c01760636f9d2fbc54dde175 100644
--- a/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TruncKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -32,10 +31,7 @@ Tensor& trunc_out_npu(Tensor& result, const Tensor& self) {
 }
 
 Tensor& trunc_npu_(Tensor& self) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
+  OpPreparation::CheckMemory({self}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = trunc_out_npu(contiguousSelf, contiguousSelf);
@@ -48,16 +44,7 @@ Tensor& trunc_npu_(Tensor& self) {
 }
 
 Tensor trunc_npu(const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, 
-      self.options(), 
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   trunc_out_npu(result, self);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp b/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp
index 35814aab9614e1dd793b1dc25afb5969197adb91..ef248af128771c7bbe1e42b20ef5d3465f1d5e7b 100644
--- a/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UniformKernelNpu.cpp
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -37,9 +37,7 @@ Tensor& uniform_out_npu(
 }
 
 Tensor& uniform_npu_(Tensor& self, double from, double to, Generator* gen_) {
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+  OpPreparation::CheckMemory({self}, {self});
 
   // TODO: The operator needs to use fp32 for calculation.
   Tensor selfCopy = self;
diff --git a/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e90e7f0df5a677eb1504c294b732a0f9b3a36676
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/UpSampleNearest3dKernelNpu.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& upsample_nearest3d_out_npu(
+    Tensor& result,
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input.size(0);
+  int64_t channels = input.size(1);
+  int64_t input_depth = input.size(2);
+  int64_t input_height = input.size(3);
+  int64_t input_width = input.size(4);
+
+  result.resize_({nbatch, channels, output_depth, output_height, output_width});
+
+  OpCommand cmd;
+  cmd.Name("UpsampleNearest3d")
+    .Input(input)
+    .Output(result)
+    .Attr("output_size", output_size)
+    .Run();
+  
+  return result;
+}
+
+Tensor upsample_nearest3d_npu(
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+
+  Tensor result = OpPreparation::ApplyTensor(input, {1});
+
+  upsample_nearest3d_out_npu(result, input, output_size, scales_d, scales_h, scales_w);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp
index c688e5496b12770d397ebd99aef5d449c90b8884..de10ef61d6d712b82406dad08c52ca834aa63f1d 100644
--- a/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleBilinear2dKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp
index a4b93e533016bf91660c7981710dcdeccd718ad5..102716632995ba88837c3c7e8e41f392183e7ded 100644
--- a/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest1dBackwardKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -51,8 +50,7 @@ Tensor upsample_nearest1d_backward_npu(
     grads = grad_output.to(at::kFloat);
   }
 
-  Tensor grad_input = at::empty_with_format(
-      input_size, grads.options(), CalcuOpUtil::get_tensor_npu_format(grad_output));
+  Tensor grad_input = OpPreparation::ApplyTensor(input_size, grads.options(), grad_output);
 
   upsample_nearest1d_backward_out_npu(
       grad_input, grads, output_size, input_size, scales);
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
index bd1596502bee4ae59fdedfc966dfc93e0bfc1e75..944f92fd8295f24439392940b7e7bd8e8fcbaa7f 100644
--- a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -67,8 +66,7 @@ Tensor upsample_nearest1d_npu(
   SmallVector<int64_t, SIZE> outputSize = upsample_nearest1d_npu_output_size(self, output_size, scales);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
 
   // calculate the output result of the NPU
   upsample_nearest1d_out_npu(result, self, output_size, scales);
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp
index 9c5adc9af1d706bce4fe665cf9effd8bed3703b7..09ec3e2e28416ff7e72f9cb5289903c987cc58f5 100644
--- a/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest2dBackwardKernelNpu.cpp
@@ -51,8 +51,8 @@ Tensor upsample_nearest2d_backward_npu(
     grads = grad_output.to(at::kFloat);
   }
 
-  Tensor grad_input = at::empty_with_format(
-      input_size, grads.options(), CalcuOpUtil::get_tensor_npu_format(grad_output));
+  Tensor grad_input = OpPreparation::ApplyTensor(
+      input_size, grads.options(), grad_output);
 
   upsample_nearest2d_backward_out_npu(
       grad_input, grads, output_size, input_size, scales_h, scales_w);
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp
index d759bd0211bde5bb63f38a17cda711768bf34ad1..98626777078f901881f4465dd280d6e1feeeee02 100644
--- a/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest2dKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a9d724da90aba4db7629bd5487ef20e7c812eb6b
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest3dBackwardKernelNpu.cpp
@@ -0,0 +1,82 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& upsample_nearest3d_backward_out_npu(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 5,
+      "It is expected input_size equals to 5, but got size ",
+      input_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_depth = input_size[2];
+  int64_t input_height = input_size[3];
+  int64_t input_width = input_size[4];
+
+  grad_input.resize_(
+      {nbatch, channels, input_depth, input_height, input_width});
+
+  OpCommand cmd;
+  cmd.Name("UpsampleNearest3dGrad")
+    .Input(grad_output)
+    .Output(grad_input)
+    .Attr("input_size", input_size)
+    .Attr("output_size", output_size)
+    .Run();
+
+  return grad_input;
+}
+
+Tensor upsample_nearest3d_backward_npu(
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+
+  Tensor grad_input = OpPreparation::ApplyTensor(grad_output, input_size);
+
+  upsample_nearest3d_backward_out_npu(grad_input, grad_output, output_size, input_size, scales_d, scales_h, scales_w);
+
+  return grad_input;
+}
+
+} // namespace native
+} // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1a847ef2bd3007d20dfabc8f6da3e27654772e86
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dBackwardKernelNpu.cpp
@@ -0,0 +1,85 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& upsample_trilinear3d_backward_out_npu(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    bool align_corners,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 5,
+      "It is expected input_size equals to 5, but got size ",
+      input_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_depth = input_size[2];
+  int64_t input_height = input_size[3];
+  int64_t input_width = input_size[4];
+
+  grad_input.resize_(
+      {nbatch, channels, input_depth, input_height, input_width});
+
+  OpCommand cmd;
+  cmd.Name("UpsampleTrilinear3dGrad")
+    .Input(grad_output)
+    .Output(grad_input)
+    .Attr("input_size", input_size)
+    .Attr("output_size", output_size)
+    .Attr("align_corners", align_corners)
+    .Run();
+
+  return grad_input;
+}
+
+Tensor upsample_trilinear3d_backward_npu(
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    bool align_corners,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+
+  Tensor grad_input = OpPreparation::ApplyTensor(grad_output, input_size);
+
+  upsample_trilinear3d_backward_out_npu(grad_input, grad_output, output_size, input_size, align_corners, scales_d, scales_h, scales_w);
+
+  return grad_input;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..919c3fcdf768fa517e7e4a5e9fd29dfd71c898a1
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/UpsampleTrilinear3dKernelNpu.cpp
@@ -0,0 +1,76 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& upsample_trilinear3d_out_npu(
+    Tensor& result,
+    const Tensor& input,
+    IntArrayRef output_size,
+    bool align_corners,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input.size(0);
+  int64_t channels = input.size(1);
+  int64_t input_depth = input.size(2);
+  int64_t input_height = input.size(3);
+  int64_t input_width = input.size(4);
+
+  result.resize_({nbatch, channels, output_depth, output_height, output_width});
+    
+  OpCommand cmd;
+  cmd.Name("UpsampleTrilinear3d")
+    .Input(input)
+    .Output(result)
+    .Attr("output_size", output_size)
+    .Attr("align_corners", align_corners)
+    .Run();
+  
+  return result;
+}
+
+Tensor upsample_trilinear3d_npu(
+    const Tensor& input,
+    IntArrayRef output_size,
+    bool align_corners,
+    c10::optional<double> scales_d,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+
+  Tensor result = OpPreparation::ApplyTensor(input, {1});
+
+  upsample_trilinear3d_out_npu(result, input, output_size, align_corners, scales_d, scales_h, scales_w);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp b/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp
index fcc771e143b9842874dfd898e8020c7739a644b8..9c025b9650ebc3d8d810080d207aa26045a76b11 100644
--- a/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/WhereKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -49,13 +48,7 @@ Tensor _s_where_npu(
     const Tensor& condition,
     const Tensor& self,
     const Tensor& other) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  
+  Tensor result = OpPreparation::ApplyTensor(self);
   // maskrcnn need dynamicshape function of op "SelectV2"
   string opName = c10::npu::OptionsManager::CheckDynamicEnable() ? "SelectV2" : "Select";
   OpCommand cmd;
diff --git a/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp
index 0fe3fc20d89f600246810f46089793ed12dffaf4..c5c29cbb9640c2f03824493519f6a84a42a36fb9 100644
--- a/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ZerosLikeKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -39,13 +38,7 @@ Tensor zeros_like_npu(
     auto result = at::empty_like(self, options, optional_memory_format);
     return result.fill_(0);
   }
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, options, CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, options);
   // calculate the output result of the NPU
   return result.zero_();
 }
diff --git a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
index df196cfe45fde948b8db96b44ff9e08d5cd20d8d..757bc2e258b143db8fa7b2928c10c4a8357c2feb 100644
--- a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -49,9 +48,9 @@ tuple<Tensor, Tensor, Tensor> _unique2_npu(
     bool return_inverse,
     bool return_counts) {
   if(self.numel() == 0){
-    Tensor result= at::empty_with_format({0}, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-    Tensor yInverse = at::empty_with_format({0}, self.options().dtype(kLong), CalcuOpUtil::get_tensor_npu_format(self));
-    Tensor yCounts = at::empty_with_format({0}, self.options().dtype(kLong), CalcuOpUtil::get_tensor_npu_format(self));
+    Tensor result= OpPreparation::ApplyTensor(self, {0});
+    Tensor yInverse = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
+    Tensor yCounts = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
     return std::tie(result, yInverse, yCounts);
   }
   
@@ -64,7 +63,7 @@ tuple<Tensor, Tensor, Tensor> _unique2_npu(
     selfCopy = self.to(ScalarType::Float);
   }
  
-  Tensor y = at::empty_with_format(std::get<0>(outputSizes), selfCopy.options(), CalcuOpUtil::get_tensor_npu_format(selfCopy));
+  Tensor y = OpPreparation::ApplyTensor(selfCopy, std::get<0>(outputSizes));
   Tensor yOutputSize = at::empty_with_format(std::get<1>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
   Tensor yInverse = at::empty_with_format(std::get<2>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
   Tensor yCounts = at::empty_with_format(std::get<0>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
diff --git a/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp
index 567f0080fc3dfbd27d9ceb79ef2453453f8ef627..5eab51b18541686083fdbbee3b288eaadeccbdb1 100644
--- a/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/__And__KernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -71,25 +71,14 @@ Tensor __and___npu(const Tensor& self, const Tensor& other) {
   auto outputSize = broadcast_ops_npu_output_size(self, other);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize,
-      outputTensor.options(),
-      CalcuOpUtil::get_tensor_npu_format(outputTensor));
-
+  Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
   // calculate the output result of the NPU
   __and___out_npu(result, self, other);
   return result;
 }
 
 Tensor __and___npu(const Tensor& self, Scalar other) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   __and___out_npu(result, self, other);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp
index c54448b8c8f6b9045ac494a8d799513b72171c31..e517cdaf856578e4557e7fb0b85f7c591332f8e4 100644
--- a/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/__Ior__KernelNpu.cpp
@@ -41,10 +41,7 @@ Tensor& __ior___out_npu(Tensor& result, const Tensor& self, Scalar other) {
 }
 
 Tensor& __ior___npu(Tensor& self, const Tensor& other) { 
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-    
+  OpPreparation::CheckMemory({self, other}, {self});
   if (!NpuUtils::check_match(&self)) {
     Tensor contiguousSelf = NpuUtils::format_contiguous(self);
     Tensor result = __ior___out_npu(contiguousSelf, contiguousSelf, other);
diff --git a/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp b/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp
index 808024748fd4c749f9da0bcd9dd3e102b485b8e1..89e628e1dc3fcc33113d7ca7e53dffedbe9f37a0 100644
--- a/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/__Or__KernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
diff --git a/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp
index 79e9f5af3c55f40a2ff43046c94147a67534ed53..15e9827a281c7c595335e4354a5a9a302cdcb1ed 100644
--- a/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/common/MatmulByBmmV2KernelNpu.cpp
@@ -12,63 +12,28 @@ using namespace at::native::npu;
   if (dim_tensor1 == 1 && dim_tensor2 == 1) {
     return tensor1.dot(tensor2);
   } else if (dim_tensor1 == 2 && dim_tensor2 == 1) {
-    return tensor1.mv(tensor2);
+    return tensor1.mm(tensor2.unsqueeze(-1)).squeeze_(-1);
   } else if (dim_tensor1 == 1 && dim_tensor2 == 2) {
     return tensor1.unsqueeze(0).mm(tensor2).squeeze_(0);
   } else if (dim_tensor1 == 2 && dim_tensor2 == 2) {
     return tensor1.mm(tensor2);
   } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) {
     Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2;
-    Tensor output = dim_tensor2 == 1 ? at::npu_bmmV2(tensor1, t2).squeeze(-1) : at::npu_bmmV2(tensor1, t2);
+    auto size1 = tensor1.sizes();
+    auto size2 = t2.sizes();
+    std::vector<int64_t> output_size;
+    output_size.insert(output_size.end(), size1.begin(), size1.end() - 1);
+    if (dim_tensor2 > 1) {
+      output_size.push_back(size2[dim_tensor2 - 1]);
+    }
+    // fold the batch into the first dimension
+    Tensor t1 = tensor1.reshape({-1, tensor1.size(-1)});
+    Tensor output = at::_unsafe_view(t1.mm(t2), output_size);
     return output;
-    // 需要支持out接口
   } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) {
-    // optimization: transpose the inner dimensions of the arguments, call
-    // matmul on the swapped arguments, then transpose the inner dimensions
-    // of the result.
-    const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1;
-    const int64_t m = tensor1.size(-1);
-    const int64_t p = tensor2.size(-1);
-
-    const Tensor t2_T = tensor2.transpose(-1, -2);
-    const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t();
-    const Tensor res_T = matmul_npu(t2_T, t1_T);
-
-    if (dim_tensor1 == 2) {
-      Tensor res = res_T.transpose(-1, -2).contiguous();
-      return res;
-    }
-    else {
-      std::vector<int64_t> shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec();
-      shape.push_back(p);
-
-      Tensor res = res_T.reshape(shape).contiguous();
-      return res;
-    }
+    return at::npu_bmmV2(tensor1, tensor2, {});
   } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) {
-      int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1;
-    int64_t m1 = tensor1.size(-1);
-    IntArrayRef batch_tensor1(tensor1.sizes().data(), std::max<int64_t>(dim_tensor1 - 2, 0));
-    int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1;
-    int64_t p = tensor2.size(-1);
-    IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max<int64_t>(dim_tensor2 - 2, 0));
-
-    // expand the batch portion (i.e. cut off matrix dimensions and expand rest)
-    std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
-
-    std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
-    tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1});
-
-    std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
-    tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p});
-
-
-    // flatten expanded batches
-    Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size);
-    Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size);
-
-    Tensor output = at::npu_bmmV2(tensor1_expanded, tensor2_expanded);
-    return output;
+    return at::npu_bmmV2(tensor1, tensor2, {});
   }
   AT_ERROR("both arguments to matmul need to be at least 1D, but they are ",
           dim_tensor1, "D and ", dim_tensor2, "D");
diff --git a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
index 4374fc1c7ed5dee107f6c0308e502ddfbc06608a..f1cd79906cbef0e93bd36a73524de2bba9a988a6 100644
--- a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
@@ -90,16 +90,9 @@ Tensor &conv3d_out_npu(Tensor &result, const Tensor &input,
 Tensor conv3d_npu(const Tensor &input, const Tensor &weight, const Tensor &bias,
                   IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
                   int64_t groups) {
-  // calculate the output size
-
   auto outputSize = conv3d_npu_output_size(
       input, weight, bias, stride, padding, dilation, groups);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, input.options(), CalcuOpUtil::get_tensor_npu_format(input));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
   conv3d_out_npu(result, input, weight, bias, stride, padding, dilation, groups);
 
   return result;
diff --git a/src/aten/src/ATen/native/npu/frame/OpCommandBase.h b/src/aten/src/ATen/native/npu/frame/OpCommandBase.h
index 140170f641de335337b89d9defc7aee5c1c48bab..602b3d7ca733f750f3121e0ce039fbd7ed61c3df 100644
--- a/src/aten/src/ATen/native/npu/frame/OpCommandBase.h
+++ b/src/aten/src/ATen/native/npu/frame/OpCommandBase.h
@@ -20,7 +20,6 @@
 #include "ATen/native/npu/frame/OpCmdHelper.h"
 #include "ATen/native/npu/frame/OpParamMaker.h"
 #include "ATen/native/npu/utils/DynamicShapeUtil.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
 #include "THNPU/THNPUCachingHostAllocator.h"
 namespace at {
@@ -121,7 +120,6 @@ class OpCommandBase {
   }
 
   void Run(){
-    NpuUtils::SetCompileOptOnce();
     if (c10::npu::OptionsManager::CheckQueueEnable()) {
       ExecuteParas params;
       aclCmd->ExportParams(params);
@@ -211,11 +209,19 @@ class OpCommandBase {
     return storage.back();
   }
   Tensor CopyHostToDevice(const Scalar& scalar, ScalarType type) {
-    storage.emplace_back(CalcuOpUtil::CopyScalarToDevice(scalar, type));
-    return storage.back();
+    auto tensor = scalar_to_tensor(scalar).to(type);
+    return CopyHostToDevice(tensor);
   }
   Tensor CopyHostToDevice(const Tensor& cpuTensor) {
-    storage.emplace_back(CalcuOpUtil::copy_tensor_host_to_device(cpuTensor));
+    Tensor cpuPinMemTensor = cpuTensor.pin_memory();
+    int deviceIndex = 0;
+    AT_NPU_CHECK(aclrtGetDevice(&deviceIndex));
+    auto tensor = cpuPinMemTensor.to(
+      c10::Device(DeviceType::NPU, deviceIndex),
+      cpuPinMemTensor.scalar_type(),
+      true,
+      true);
+    storage.emplace_back(tensor);
     return storage.back();
   }
   Tensor CreateHostTensor(void* data, IntArrayRef sizes,
diff --git a/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h b/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h
index 7cc10880015104739a74e9f2de44e8e5e153f5e6..0fd4d6695499d73d53ac82a2c809dd461f7cf8df 100644
--- a/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h
+++ b/src/aten/src/ATen/native/npu/frame/OpDynamicParamMaker.h
@@ -19,8 +19,6 @@
 #include <third_party/acl/inc/acl/acl_base.h>
 #include <third_party/acl/inc/acl/acl_op_compiler.h>
 #include "ATen/native/npu/frame/OpParamMaker.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
 #include "c10/npu/NPUStream.h"
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp
index d285f2b6711ffa85698164f3f2fd46685c2729b3..73dcdd8ec5cccb9633c455dee0a320dd05ea70cc 100644
--- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp
+++ b/src/aten/src/ATen/native/npu/frame/OpParamMaker.cpp
@@ -17,15 +17,131 @@
 #include <c10/npu/OptionsManager.h>
 #include "c10/npu/NPUQueue.h"
 #include <torch/csrc/autograd/record_function.h>
-#include "ATen/native/npu/utils/DynamicShapeUtil.h"
 #include "ATen/native/npu/aoe/AutoTune.h"
+#include "ATen/native/npu/utils/DynamicShapeUtil.h"
 #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h"
-#include "ATen/native/GlobalStep.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/interface/EnvVariables.h"
 
 namespace at {
 namespace native {
 namespace npu {
 
+void OpAttrMaker::Set(aclopAttr* attr, string name, bool value) {
+  aclopSetAttrBool(attr, name.c_str(), value);
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, int64_t value) {
+  aclopSetAttrInt(attr, name.c_str(), value);
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, float value) {
+  aclopSetAttrFloat(attr, name.c_str(), value);
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, string value) {
+  aclopSetAttrString(attr, name.c_str(), value.c_str());
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, IntArrayRef value) {
+  auto vec = value.vec();
+  aclopSetAttrListInt(attr, name.c_str(), vec.size(), vec.data());
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, at::ArrayRef<float> value) {
+  auto vec = value.vec();
+  aclopSetAttrListFloat(attr, name.c_str(), vec.size(), vec.data());
+}
+
+void OpAttrMaker::Set(aclopAttr* attr, string name, Scalar value) {
+  float val = CalcuOpUtil::get_scalar_float_value(value);
+  aclopSetAttrFloat(attr, name.c_str(), val);
+}
+
+
+// Sets a list-of-int-lists attribute `name` on `attr` from `value`.
+void OpAttrMaker::Set(
+      aclopAttr* attr,
+      string name,
+      at::ArrayRef<IntArrayRef> value) {
+  // Pointer to values of each listInt.
+  SmallVector<int64_t*, N> attrValue;
+  // Pointer to number of each listInt.
+  SmallVector<int, N> eachListIntNum;
+  // Value of each listInt.
+  SmallVector<SmallVector<int64_t, N>, N> eachListIntVal;
+  // Reserve up front: growing eachListIntVal would relocate the inner
+  // vectors and invalidate the data() pointers saved in attrValue.
+  eachListIntVal.reserve(value.size());
+  for (size_t i = 0; i < value.size(); i++) {
+    eachListIntVal.emplace_back(value[i].begin(), value[i].end());
+    attrValue.emplace_back(eachListIntVal.back().data());
+    eachListIntNum.emplace_back(static_cast<int>(value[i].size()));
+  }
+
+  aclopSetAttrListListInt(
+        attr,
+        name.c_str(),
+        attrValue.size(),
+        eachListIntNum.data(),
+        attrValue.data());
+}
+
+
+void AttrInfoMaker::Add(bool value, string& attrInfo) {
+  attrInfo += to_string(value) + "-";
+}
+
+void AttrInfoMaker::Add(int64_t value, string& attrInfo) {
+  attrInfo += to_string(value) + "-";
+}
+
+void AttrInfoMaker::Add(float value, string& attrInfo) {
+  attrInfo += to_string(value) + "-";
+}
+
+void AttrInfoMaker::Add(string value, string& attrInfo) {
+  attrInfo += value + "-";
+}
+
+void AttrInfoMaker::Add(IntArrayRef value, string& attrInfo) {
+  auto vec = value.vec();
+  for (unsigned i = 0; i < vec.size(); i++)
+    attrInfo += to_string(vec.at(i)) + ",";
+  attrInfo += "-";
+}
+
+void AttrInfoMaker::Add(
+      at::ArrayRef<float> value,
+      string& attrInfo) {
+  auto vec = value.vec();
+  for (unsigned i = 0; i < vec.size(); i++)
+    attrInfo += to_string(vec.at(i)) + ",";
+  attrInfo += "-";
+}
+
+void AttrInfoMaker::Add(Scalar value, string& attrInfo) {
+  float val = CalcuOpUtil::get_scalar_float_value(value);
+  attrInfo += to_string(val) + "-";
+}
+
+// Appends the length of every inner list in `value` to `attrInfo`, each
+// followed by ",", then terminates the entry with "-".
+// NOTE(review): only the lengths are recorded, not the elements, so two
+// values with equal inner lengths produce the same text -- confirm that
+// this is intended by whoever consumes attrInfo.
+void AttrInfoMaker::Add(
+    at::ArrayRef<IntArrayRef> value,
+    string& attrInfo) {
+  for (size_t i = 0; i < value.size(); i++) {
+    // Keep the int64_t type so to_string formats exactly as before.
+    int64_t valueSize = value[i].size();
+    attrInfo += to_string(valueSize) + ",";
+  }
+  attrInfo += "-";
+}
+
+
 void OpCommandImpl::Run() {
   InitAttr();
   NPU_LOGD("Op %s Run.", opName.c_str());
@@ -40,7 +156,7 @@ aclError OpCommandImpl::InnerRun(string name, AclExecParam& params) {
   auto inputSize = params.inBuffer.size();
   auto outputSize = params.outBuffer.size();
   bool reset_flag = false;
-  if (check_fuzz_enable() &&
+  if (env::CheckFuzzyEnable() &&
       FuzzyCompileBlacklist::GetInstance().IsInBlacklist(name)) {
     aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT);
     reset_flag = true;
@@ -73,7 +189,7 @@ int ExecFunc(void* in, aclrtStream stream) {
     ret = DynamicRun(*cur_paras, stream);
   } else {
     bool reset_flag = false;
-    if (check_fuzz_enable() &&
+    if (env::CheckFuzzyEnable() &&
         FuzzyCompileBlacklist::GetInstance().IsInBlacklist(cur_paras->opType)) {
       aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT);
       reset_flag = true;
diff --git a/src/aten/src/ATen/native/npu/frame/OpParamMaker.h b/src/aten/src/ATen/native/npu/frame/OpParamMaker.h
index adf2c00f7bdd04ca7547af671069f22f58c5eb67..bf0f28830e26bb9c444fce5e492dbbb10fb5cab2 100644
--- a/src/aten/src/ATen/native/npu/frame/OpParamMaker.h
+++ b/src/aten/src/ATen/native/npu/frame/OpParamMaker.h
@@ -18,8 +18,7 @@
 
 #include <third_party/acl/inc/acl/acl_base.h>
 #include <third_party/acl/inc/acl/acl_op_compiler.h>
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
+#include "ATen/native/npu/frame/NPUDefine.h"
 #include "ATen/native/npu/interface/Graph.h"
 #include "c10/npu/NPUStream.h"
 
@@ -31,120 +30,29 @@ namespace npu {
 //
 class OpAttrMaker {
  public:
-  static void Set(aclopAttr* attr, string name, bool value) {
-    aclopSetAttrBool(attr, name.c_str(), value);
-  }
-
-  static void Set(aclopAttr* attr, string name, int64_t value) {
-    aclopSetAttrInt(attr, name.c_str(), value);
-  }
-
-  static void Set(aclopAttr* attr, string name, float value) {
-    aclopSetAttrFloat(attr, name.c_str(), value);
-  }
-
-  static void Set(aclopAttr* attr, string name, string value) {
-    aclopSetAttrString(attr, name.c_str(), value.c_str());
-  }
-
-  static void Set(aclopAttr* attr, string name, IntArrayRef value) {
-    auto vec = value.vec();
-    aclopSetAttrListInt(attr, name.c_str(), vec.size(), vec.data());
-  }
-
-  static void Set(aclopAttr* attr, string name, at::ArrayRef<float> value) {
-    auto vec = value.vec();
-    aclopSetAttrListFloat(attr, name.c_str(), vec.size(), vec.data());
-  }
-
-  static void Set(aclopAttr* attr, string name, Scalar value) {
-    float val = CalcuOpUtil::get_scalar_float_value(value);
-    aclopSetAttrFloat(attr, name.c_str(), val);
-  }
-
+  static void Set(aclopAttr* attr, string name, bool value);
+  static void Set(aclopAttr* attr, string name, int64_t value);
+  static void Set(aclopAttr* attr, string name, float value);
+  static void Set(aclopAttr* attr, string name, string value);
+  static void Set(aclopAttr* attr, string name, IntArrayRef value);
+  static void Set(aclopAttr* attr, string name, at::ArrayRef<float> value);
+  static void Set(aclopAttr* attr, string name, Scalar value);
   static void Set(
       aclopAttr* attr,
       string name,
-      at::ArrayRef<IntArrayRef> value) {
-    // Pointer to values of each listInt.
-    SmallVector<int64_t*, N> attrValue;
-    // Pointer to number of each listInt.
-    SmallVector<int, N> eachListIntNum;
-    // Value of each listInt.
-    SmallVector<SmallVector<int64_t, N>, N> eachListIntVal;
-    for (int i = 0; i < value.size(); i++) {
-      SmallVector<int64_t, N> listInt;
-      int64_t valueSize = value[i].size();
-      listInt.resize(valueSize);
-      std::copy(value[i].begin(), value[i].end(), listInt.begin());
-      eachListIntVal.emplace_back(listInt);
-      attrValue.emplace_back(eachListIntVal.back().data());
-      eachListIntNum.emplace_back(valueSize);
-    }
-
-    aclopSetAttrListListInt(
-        attr,
-        name.c_str(),
-        attrValue.size(),
-        eachListIntNum.data(),
-        attrValue.data());
-  }
+      at::ArrayRef<IntArrayRef> value);
 }; // class OpAttrMaker
 
 class AttrInfoMaker {
  public:
-  static void Add(bool value, string& attrInfo) {
-    attrInfo += to_string(value) + "-";
-  }
-
-  static void Add(int64_t value, string& attrInfo) {
-    attrInfo += to_string(value) + "-";
-  }
-
-  static void Add(float value, string& attrInfo) {
-    attrInfo += to_string(value) + "-";
-  }
-
-  static void Add(string value, string& attrInfo) {
-    attrInfo += value + "-";
-  }
-
-  static void Add(IntArrayRef value, string& attrInfo) {
-    auto vec = value.vec();
-    for (unsigned i = 0; i < vec.size(); i++)
-      attrInfo += to_string(vec.at(i)) + ",";
-    attrInfo += "-";
-  }
-
-  static void Add(
-      at::ArrayRef<float> value,
-      string& attrInfo) {
-    auto vec = value.vec();
-    for (unsigned i = 0; i < vec.size(); i++)
-      attrInfo += to_string(vec.at(i)) + ",";
-    attrInfo += "-";
-  }
-
-  static void Add(Scalar value, string& attrInfo) {
-    float val = CalcuOpUtil::get_scalar_float_value(value);
-    attrInfo += to_string(val) + "-";
-  }
-
-  static void Add(
-      at::ArrayRef<IntArrayRef> value,
-      string& attrInfo) {
-    // Pointer to values of each listInt.
-    SmallVector<int64_t*, N> attrValue;
-    // Pointer to number of each listInt.
-    SmallVector<int, N> eachListIntNum;
-    // Value of each listInt.
-    SmallVector<SmallVector<int64_t, N>, N> eachListIntVal;
-    for (int i = 0; i < value.size(); i++) {
-      int64_t valueSize = value[i].size();
-      attrInfo += to_string(valueSize) + ",";
-    }
-    attrInfo += "-";
-  }
+  static void Add(bool value, string& attrInfo);
+  static void Add(int64_t value, string& attrInfo);
+  static void Add(float value, string& attrInfo);
+  static void Add(string value, string& attrInfo);
+  static void Add(IntArrayRef value, string& attrInfo);
+  static void Add(at::ArrayRef<float> value,string& attrInfo);
+  static void Add(Scalar value, string& attrInfo);
+  static void Add(at::ArrayRef<IntArrayRef> value, string& attrInfo);
 };
 
 //
diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
index 8332005e50a493869c99778703935bb8b25b818f..1985cbffbbc309661fae435fda629967cf31f0ca 100644
--- a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
+++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
@@ -2,8 +2,10 @@
 
 #include "c10/npu/register/OptionRegister.h"
 #include "c10/util/Exception.h"
+#include "ATen/native/npu/utils/NpuFuzzyBlacklist.h"
+#include "ATen/native/npu/utils/NpuProfilingDispatch.h"
 #include <third_party/acl/inc/acl/acl_mdl.h>
-
+#include <third_party/acl/inc/acl/acl_op_compiler.h>
 namespace at {
 namespace native {
 namespace npu {
@@ -15,18 +17,51 @@ REGISTER_OPTION_BOOL_FUNCTION(AutoTuneEnabled, autotune, "disable", "enable")
 REGISTER_OPTION_INIT_BY_ENV(bmmv2_enable)
 REGISTER_OPTION_BOOL_FUNCTION(CheckBmmV2Enable, bmmv2_enable, "0", "1")
 
-REGISTER_OPTION(ACL_OP_DEBUG_LEVEL)
-REGISTER_OPTION(ACL_DEBUG_DIR)
-REGISTER_OPTION(ACL_OP_COMPILER_CACHE_MODE)
-REGISTER_OPTION(ACL_OP_COMPILER_CACHE_DIR)
-REGISTER_OPTION(NPU_FUZZY_COMPILE_BLACKLIST)
-
 REGISTER_OPTION_HOOK(mdldumpswitch, [](const std::string& val) { 
-  if (val == "init") { aclmdlInitDump(); }
-  else if (val == "finalize") { aclmdlFinalizeDump(); }
-  else { TORCH_CHECK(0, "set initdump value only support init or finalize, but got ", val); }
+  if (val == "enable") { aclmdlInitDump(); }
+  else { aclmdlFinalizeDump(); }
   })
 REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string& val) { aclmdlSetDump(val.c_str()); })
+
+REGISTER_OPTION_HOOK(fuzzycompileswitch, [](const std::string& val) {
+  if (val == "enable") { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ); }
+  else { aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT); }
+ })
+REGISTER_OPTION_BOOL_FUNCTION(CheckFuzzyEnable, fuzzycompileswitch, "disable", "enable")
+
+REGISTER_OPTION_HOOK(ACL_OP_DEBUG_LEVEL, [](const std::string& val) { 
+  aclSetCompileopt(aclCompileOpt::ACL_OP_DEBUG_LEVEL, val.c_str());
+ })
+REGISTER_OPTION_HOOK(ACL_DEBUG_DIR, [](const std::string& val) { 
+  aclSetCompileopt(aclCompileOpt::ACL_DEBUG_DIR, val.c_str());
+ })
+REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_MODE, [](const std::string& val) { 
+  aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_MODE, val.c_str());
+ })
+REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_DIR, [](const std::string& val) { 
+  aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_DIR, val.c_str());
+ })
+REGISTER_OPTION_HOOK(NPU_FUZZY_COMPILE_BLACKLIST, [](const std::string& val) { 
+  if (CheckFuzzyEnable()) {
+    FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val);
+  }
+ })
+
+ REGISTER_OPTION_INIT_BY_ENV(PROFILING_MODE)
+ REGISTER_OPTION_BOOL_FUNCTION(CheckProfilingEnable, PROFILING_MODE, "false", "true");
+
+ // Starts ("enable") or stops (any other value) NpuProfilingDispatch.
+ REGISTER_OPTION_HOOK(deliverswitch, [](const std::string& val) {
+   TORCH_CHECK(
+       CheckProfilingEnable(),
+       "before you prepare to deliver op, you should ensure profiling mode is on correctly!");
+   if (val == "enable"){
+     at::native::npu::NpuProfilingDispatch::Instance().start();
+   } else {
+     at::native::npu::NpuProfilingDispatch::Instance().stop();
+   }
+ })
+
 } // namespace env
 } // namespace npu
 } // namespace native
diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.h b/src/aten/src/ATen/native/npu/interface/EnvVariables.h
index d17617d41601bdd6a6229341a1af96c79193134f..bfb3c057e73c465e4e5b62ffc78fc740bc59de96 100644
--- a/src/aten/src/ATen/native/npu/interface/EnvVariables.h
+++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.h
@@ -26,6 +26,8 @@ namespace env {
   */
 bool AutoTuneEnabled();
 bool CheckBmmV2Enable();
+bool CheckFuzzyEnable();
+bool CheckProfilingEnable();
 
 } // namespace env
 } // namespace npu
diff --git a/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp b/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp
similarity index 40%
rename from src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp
rename to src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp
index 3ba477702b576db92f76c8434fd97f0e8c17b2d3..be7343ee187bc74d87861ba17b529cc55a502154 100644
--- a/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossNpu.cpp
+++ b/src/aten/src/ATen/native/npu/loss/MultilabelMarginLossKernelNpu.cpp
@@ -12,50 +12,30 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "ATen/native/npu/utils/OpAdapter.h"
 #include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
 
 namespace at {
 namespace native {
 using namespace at::native::npu;
 
-SmallVector<NPUTensorDesc, N> multilabel_margin_loss_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> multilabel_margin_loss_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> multilabel_margin_loss_npu_attr(int64_t reduction) {
-  string reductionStr;
-  if (reduction == Reduction::None) {
-    reductionStr = "none";
-  } else if (reduction == Reduction::Mean) {
-    reductionStr = "mean";
-  } else if (reduction == Reduction::Sum) {
-    reductionStr = "sum";
-  }
-
-  NPUAttrDesc npuAttrReduction = NPUAttrDesc("reduction", reductionStr);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrReduction};
-
-  return attrs;
-}
-
 std::tuple<Tensor&, Tensor&> multilabel_margin_loss_forward_out_npu(
     Tensor& output,
     Tensor& is_target,
     const Tensor& self,
     const Tensor& target,
     int64_t reduction) {
-  auto inputs = multilabel_margin_loss_npu_input({self, target});
-  auto outputs = multilabel_margin_loss_npu_output({output, is_target});
-  auto attrs = multilabel_margin_loss_npu_attr(reduction);
-  CalcuOpUtil::execute_npu_operate("MultilabelMarginLoss", inputs, outputs, attrs);
+
+  string reductionStr = CalcuOpUtil::get_reduction_str(reduction);
+  OpCommand cmd;
+  cmd.Name("MultilabelMarginLoss")
+    .Input(self)
+    .Input(target)
+    .Output(output)
+    .Output(is_target)
+    .Attr("reduction", reductionStr)
+    .Run();
+
   return std::tuple<Tensor&, Tensor&>(output, is_target);
 }
 
@@ -79,39 +59,13 @@ std::tuple<Tensor, Tensor> multilabel_margin_loss_forward_npu(
     outputSize = {nframe};
   }
 
-  auto output = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  auto is_target = at::empty_with_format(target.sizes(), target.options(), CalcuOpUtil::get_tensor_npu_format(target));  
+  auto output = OpPreparation::ApplyTensor(self, outputSize);
+  auto is_target = OpPreparation::ApplyTensor(target);
 
   multilabel_margin_loss_forward_out_npu(
       output, is_target, self, target, reduction);
   return std::make_tuple(output, is_target);
 }
 
-Tensor& multilabel_margin_loss_backward_npu_out(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Tensor& target,
-    int64_t reduction,
-    const Tensor& is_target) {
-  auto inputs = multilabel_margin_loss_npu_input({self, grad_output, target, is_target});
-  auto outputs = multilabel_margin_loss_npu_output({grad_input});
-  auto attrs = multilabel_margin_loss_npu_attr(reduction);
-  CalcuOpUtil::execute_npu_operate("MultilabelMarginLossGrad", inputs, outputs, attrs);
-  return grad_input;
-}
-
-Tensor multilabel_margin_loss_backward_npu(
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Tensor& target,
-    int64_t reduction,
-    const Tensor& is_target) {
-  auto grad_input = at::empty_with_format(self.sizes(), self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  multilabel_margin_loss_backward_npu_out(
-      grad_input, grad_output, self, target, reduction, is_target);
-  return grad_input;
-}
-
 } // namespace native
 } // namespace at
diff --git a/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp
index fa7fb25a6ca3bde8f366294bfeacce0e3c10c8f3..9fd7906afa36fb80ea6784eac37706e4e97900af 100644
--- a/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/loss/NLLLoss2dKernelNpu.cpp
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -73,6 +74,7 @@ tuple<Tensor&, Tensor&> nll_loss2d_forward_out_npu(
       .Input(target)
       .Input(weight_tensor)
       .Attr("reduction", reductionStr)
+      .Attr("ignore_index", ignore_index)
       .Output(result)
       .Output(total_weight)
       .Run();
diff --git a/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp
index 7bddf995a685651b777df505a073375d8930a1d4..a1da5294f1173a07ad75a5b27c7d64e7f17d247e 100644
--- a/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/loss/NLLLossBackwardKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -52,8 +52,15 @@ Tensor& nll_loss_backward_out_npu(
   string reductionStr = CalcuOpUtil::get_reduction_str(reduction);
 
   Tensor targetCast = target;
-  if (target.scalar_type() == at::kLong || target.scalar_type() == at::kFloat) {
+  auto scalar_type = target.scalar_type();
+  if (scalar_type == at::kLong) {
     targetCast = target.to(at::kInt);
+  }  else if (scalar_type == at::kInt) {
+    ;
+  } 
+  else {
+    AT_ERROR("Expected object of scalar type ", at::kLong, " or ", at::kInt, " but got scalar type ", scalar_type,
+          " for argument 'target'  in call to nll_loss_backward");
   }
   
   OpCommand cmd;
diff --git a/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp
index 5268ef88356d433b8ed01ea3c208910a0b0a0b1c..10d4093cc378a8ed848ccb8dd277e71e06b5b3a9 100644
--- a/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/loss/NLLLossKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -51,9 +51,16 @@ tuple<Tensor&, Tensor&> nll_loss_forward_out_npu(
   string reductionStr = CalcuOpUtil::get_reduction_str(reduction);
 
   Tensor targetCast = target;
-  if (target.scalar_type() == at::kLong || target.scalar_type() == at::kFloat) {
+  auto scalar_type = target.scalar_type();
+  if (scalar_type == at::kLong) {
     targetCast = target.to(at::kInt);
-  }  
+  }  else if (scalar_type == at::kInt) {
+    ;
+  } 
+  else {
+    AT_ERROR("Expected object of scalar type ", at::kLong, " or ", at::kInt, " but got scalar type ", scalar_type,
+          " for argument 'target'  in call to nll_loss_forward");
+  }
 
   OpCommand cmd;
   cmd.Name("NLLLoss")
diff --git a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp
index 9682af4fce8a38e7ad5d090be8ab0ff7fb6bb6ee..4bf6a9838141bb14fd8c631f626cf1437a9b3602 100644
--- a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool2dBackwardKernelNpu.cpp
@@ -14,8 +14,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -58,16 +58,8 @@ Tensor& adaptive_avg_pool2d_backward_out_npu(
 Tensor adaptive_avg_pool2d_backward_npu(
     const Tensor& grad_output,
     const Tensor& self) {
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  // calculate the output result of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
   adaptive_avg_pool2d_backward_out_npu(result, grad_output, self);
-
   return result;
 }
 
diff --git a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp
index f37a1f2744533e668811a3128e427846b172379b..077f0ece2be6661a193a1789fa0851b48cb27469 100644
--- a/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/AdaptiveAvgPool3dBackwardKernelNpu.cpp
@@ -15,8 +15,8 @@
 // limitations under the License.
 
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 
 namespace at {
 namespace native {
@@ -51,12 +51,7 @@ Tensor& adaptive_avg_pool3d_backward_out_npu(
 }
 
 Tensor adaptive_avg_pool3d_backward_npu(const Tensor& grad_output, const Tensor& self){
-  // calcul the output size
-  auto outputsize = input_same_output_size(self);
-  
-  Tensor result = at::empty_with_format(
-      outputsize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  
+  Tensor result = OpPreparation::ApplyTensor(self);
   adaptive_avg_pool3d_backward_out_npu(result, grad_output, self);
   return result;
 }
diff --git a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp
index 9bf50adb1a311f8a642668477f80c053418ca368..aae01114039912bbbeb7f35270fc8362492c9264 100644
--- a/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/AvgPool2dKernelNpu.cpp
@@ -14,8 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
 
 namespace at {
 namespace native {
@@ -110,9 +109,7 @@ Tensor avg_pool2d_npu(
       divisor_override);
 
   // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSizes, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
+  Tensor result = OpPreparation::ApplyTensor(self, outputSizes);
   // calculate the output result of the NPU
   avg_pool2d_out_npu(
       result,
diff --git a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp
index a1374c3bf15a4c86f5742b4aeeb0f11a8ab06f18..5ffa6dc9e4327418977757fe22b474c563bb0df7 100644
--- a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesBackwardKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include <ATen/native/Pool.h>
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp
index 47e2bb0e091ef1e2157c07f5849293fb3533542f..a76bf972b96c1e2e258194fe48b9e6f86beb41b3 100644
--- a/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/MaxPool3dWithIndicesKernelNpu.cpp
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
 #include <ATen/native/Pool.h>
 
 namespace at {
diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
index 33abf12e71ace29e2c19d3b5ac99df1b8002d19d..c5e2eba6ecc3bfdcad270781a578eb2eb01461e2 100644
--- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
+++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
@@ -26,7 +26,7 @@
 #include "c10/npu/NPUCachingAllocator.h"
 #include "c10/npu/OptionsManager.h"
 #include "ATen/native/npu/utils/NpuFuzzyBlacklist.h"
-#include "ATen/native/GlobalStep.h"
+#include "ATen/native/npu/interface/EnvVariables.h"
 
 namespace at {
 namespace native {
@@ -602,7 +602,6 @@ void CalcuOpUtil::execute_npu_operate(
     SmallVector<NPUTensorDesc, N>& inputs,
     SmallVector<NPUTensorDesc, N>& outputs,
     const SmallVector<NPUAttrDesc, N>& attrs) {
-  NpuUtils::SetCompileOptOnce();
   if (c10::npu::OptionsManager::CheckQueueEnable() ||
       c10::npu::OptionsManager::CheckDynamicEnable()) {
     ExecuteParas cur_paras;
@@ -631,7 +630,7 @@ void CalcuOpUtil::execute_npu_operate(
   auto stream = c10::npu::getCurrentNPUStream();
   RECORD_FUNCTION(opName, std::vector<c10::IValue>({}));
   bool reset_flag = false;
-  if (check_fuzz_enable() &&
+  if (env::CheckFuzzyEnable() &&
       FuzzyCompileBlacklist::GetInstance().IsInBlacklist(opName)) {
     aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_DEFAULT);
     reset_flag = true;
diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
index 598c44b8715f34c3986fc2d8d9baf81931bf6fa9..150711feecf70d648b02167e36b80eb5f29d18a1 100644
--- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
+++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
@@ -200,23 +200,6 @@ SmallVector<int64_t, SIZE> cdist_npu_output_size(
   return output_shape;
 }
 
-SmallVector<int64_t, SIZE> conv_tbc_npu_output_size(
-    const Tensor& self,
-    const Tensor& weight,
-    const Tensor& bias,
-    int64_t pad) {
-  int64_t N = self.size(1);
-  int64_t H = 1;
-  int64_t W = self.size(0);
-  int64_t Co = weight.size(2);
-  int64_t Ho = 1;
-  int64_t Wo = (W + 2 * pad - (weight.size(0) - 1) - 1) + 1;
-
-  SmallVector<int64_t, SIZE> outputSize = {N, Co, Ho, Wo};
-
-  return outputSize;
-}
-
 tuple<IntArrayRef, IntArrayRef, SmallVector<int64_t, SIZE>>
 conv2d_backward_npu_output_size(
     const Tensor& input,
@@ -267,7 +250,7 @@ SmallVector<int64_t, SIZE> convolution_transpose_npu_output_size(
   int64_t N = input.size(0);
   int64_t H = input.size(2);
   int64_t W = input.size(3);
-  int64_t Co = weight.size(1);
+  int64_t Co = weight.size(1) * groups;
   auto kernel_size = weight.sizes().slice(2);
 
   int64_t Ho = (H - 1) * stride[0] - 2 * padding[0] +
@@ -327,12 +310,7 @@ ctc_loss_npu_output_size(
 
   SmallVector<int64_t, SIZE> negLogLikelihoodSize = {batchSize};
   
-  // tSize = 2*max(target_lengths)+1
-  int64_t maxLength = 0;
-  for(int i = 0; i < targetLengths.size(); i++) {
-    maxLength = targetLengths[i] > maxLength? targetLengths[i]: maxLength;
-  }
-  
+  int64_t maxLength = targets.size(1);
   int64_t tSize = 2 * maxLength + 1;  
   SmallVector<int64_t, SIZE> logAlphaSize = {batchSize, maxInputLength, tSize};
 
@@ -910,15 +888,6 @@ softmax_cross_entropy_with_logits_impl_npu_output_size(const Tensor& self) {
       resultSize, backpropSize);
 }
 
-tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>> std_npu_output_size(const Tensor & self, IntArrayRef dim, bool keepdim)
-{
-    SmallVector<int64_t, SIZE> outputSize;
-    SmallVector<int64_t, SIZE> meanSize;
-    outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
-    meanSize = reduce_ops_npu_output_size(self, dim, keepdim);
-    return tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>>(outputSize, meanSize);
-}
-
 SmallVector<int64_t, SIZE> sum_npu_output_size(
     const Tensor& self,
     IntArrayRef dim,
diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
index bf520b1ce13ebb32cc56d6dfcb3a06810ba118df..9290da7ddd91ee55d3e88cf46fc065973ab0a4be 100644
--- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
+++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
@@ -110,12 +110,6 @@ SmallVector<int64_t, SIZE> cdist_npu_output_size(
     const Tensor& x1,
     const Tensor& x2);
 
-SmallVector<int64_t, SIZE> conv_tbc_npu_output_size(
-    const Tensor& self,
-    const Tensor& weight,
-    const Tensor& bias,
-    int64_t pad);
-
 tuple<IntArrayRef, IntArrayRef, SmallVector<int64_t, SIZE>>
 conv2d_backward_npu_output_size(
     const Tensor& input,
@@ -352,9 +346,6 @@ SmallVector<int64_t, SIZE> transpose_npu_output_size(
 tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>>
 softmax_cross_entropy_with_logits_impl_npu_output_size(const Tensor& self);
 
-tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>> 
-std_npu_output_size(const Tensor & self, IntArrayRef dim, bool keepdim);
-
 SmallVector<int64_t, SIZE> sum_npu_output_size(
     const Tensor& self,
     IntArrayRef dim,
diff --git a/src/aten/src/ATen/native/GlobalStep.cpp b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp
similarity index 32%
rename from src/aten/src/ATen/native/GlobalStep.cpp
rename to src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp
index c757140616817917f7f92c5e305fa676adc4db95..a6a98f7aa32bfea206b0b77b526baf55139b7b91 100644
--- a/src/aten/src/ATen/native/GlobalStep.cpp
+++ b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.cpp
@@ -14,59 +14,55 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "GlobalStep.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include <third_party/acl/inc/acl/acl_op_compiler.h>
+#include "NpuProfilingDispatch.h"
+#include <c10/npu/NPUStream.h>
+#include <c10/npu/NPUException.h>
 
 namespace at {
 namespace native {
+namespace npu {
 
-GlobalStep& GlobalStep::Instance() {
-    static GlobalStep globalStep(0, 1);
-    return globalStep;
+NpuProfilingDispatch& NpuProfilingDispatch::Instance(){
+  static NpuProfilingDispatch dispatcher; // the single shared instance, constructed on first call
+  return dispatcher;
 }
 
-void GlobalStep::GlobalStepInc() {
-     GLOBAL_STEP++;
+void NpuProfilingDispatch::init(){
+  this->profStepInfo = c10::npu::acl::init_stepinfo(); // obtain a new step-info handle from the ACL wrapper
 }
 
-int64_t GlobalStep::GetGlobalStep() const {
-    return GLOBAL_STEP;
+void NpuProfilingDispatch::start(){
+  // Create a step-info handle, then pass the ACL_STEP_START tag for the
+  // current NPU stream to start_deliver_op; a failing status is logged.
+  init();
+  auto curStream = c10::npu::getCurrentNPUStream();
+  auto status = c10::npu::acl::start_deliver_op(
+      profStepInfo, aclprofStepTag::ACL_STEP_START, curStream);
+  if(status != ACL_ERROR_NONE){
+    NPU_LOGE("npu profiling start fail, error code: %d", status);
+    C10_NPU_SHOW_ERR_MSG();
+  }
 }
 
-void GlobalStep::SetStartFuzzCompileStep(const int64_t step) {
-    START_FUZZ_COMPILE_STEP = step;
+void NpuProfilingDispatch::stop(){
+  // Pass the ACL_STEP_END tag for the current NPU stream to stop_deliver_op,
+  // log a failing status, and release the step-info handle in either case.
+  auto curStream = c10::npu::getCurrentNPUStream();
+  auto status = c10::npu::acl::stop_deliver_op(
+      profStepInfo, aclprofStepTag::ACL_STEP_END, curStream);
+  if(status != ACL_ERROR_NONE){
+    NPU_LOGE("npu profiling stop fail, error code: %d", status);
+    C10_NPU_SHOW_ERR_MSG();
+  }
+  destroy();
 }
 
-int64_t GlobalStep::GetStartFuzzCompileStep() const {
-    return START_FUZZ_COMPILE_STEP;
+void NpuProfilingDispatch::destroy(){
+  if(profStepInfo == nullptr){ return; } // no handle is held, nothing to release
+  c10::npu::acl::destroy_stepinfo(profStepInfo);
+  profStepInfo = nullptr; // clear the handle so a repeated destroy()/stop() cannot release it twice
 }
 
-TORCH_NPU_API bool check_fuzz_enable(){
-    int64_t globalstep = GlobalStep::Instance().GetGlobalStep();
-    int64_t globalstartstep = GlobalStep::Instance().GetStartFuzzCompileStep();
-
-    return (globalstep >= globalstartstep);
-}
-
-void global_step_inc() {
-    #ifdef USE_NPU
-    GlobalStep::Instance().GlobalStepInc();
-    // To invoke the interface only once, check whether the GLOBAL_STEP equal to START_FUZZ_COMPILE_STEP is OK.
-    if(GlobalStep::Instance().GetGlobalStep() == GlobalStep::Instance().GetStartFuzzCompileStep()) {
-        NPU_LOGD("GLOBAL_STEP = %ld, START_FUZZ_COMPILE_STEP = %ld, start fuzz compile!", 
-        GlobalStep::Instance().GetGlobalStep(), GlobalStep::Instance().GetStartFuzzCompileStep());
-        
-        aclopSetCompileFlag(aclOpCompileFlag::ACL_OP_COMPILE_FUZZ);
-    }
-    #endif
 }
-
-void set_start_fuzz_compile_step(int64_t step) {
-    #ifdef USE_NPU
-    GlobalStep::Instance().SetStartFuzzCompileStep(step);
-    #endif
-}
-
 }
 }
diff --git a/src/aten/src/ATen/native/GlobalStep.h b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h
similarity index 56%
rename from src/aten/src/ATen/native/GlobalStep.h
rename to src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h
index ad38c6cf19b0ceff140e85647f08a1198efa1bcf..b455ad3704c5f90bdf8d1ead7d2d0a89768f81f8 100644
--- a/src/aten/src/ATen/native/GlobalStep.h
+++ b/src/aten/src/ATen/native/npu/utils/NpuProfilingDispatch.h
@@ -13,32 +13,32 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
 
-#include <cinttypes>
-#include <c10/macros/Export.h>
+#ifndef __NPU_PROFILING_DISPATCH__
+#define __NPU_PROFILING_DISPATCH__
+
+#include <c10/npu/interface/AclInterface.h>
 
 namespace at {
 namespace native {
+namespace npu {
 
-class GlobalStep
+class NpuProfilingDispatch
 {
- public:
-  static GlobalStep& Instance();
-  void GlobalStepInc();
-  int64_t GetGlobalStep() const;
-  void SetStartFuzzCompileStep(const int64_t step);
-  int64_t GetStartFuzzCompileStep() const;
-  ~GlobalStep() = default;
-
- private:  
-  int64_t GLOBAL_STEP;
-  int64_t START_FUZZ_COMPILE_STEP;
-  GlobalStep(int64_t globalstep, int64_t startstep) { 
-    GLOBAL_STEP = globalstep;
-    START_FUZZ_COMPILE_STEP = startstep; 
-  }
+  public:
+    static NpuProfilingDispatch& Instance(); // accessor for the function-local static singleton
+    void start(); // creates the step-info handle and issues the ACL_STEP_START tag
+    void stop(); // issues the ACL_STEP_END tag, then releases the step-info handle
+  private:
+    aclprofStepInfo* profStepInfo = nullptr; // handle created by init() and released by destroy()
+    NpuProfilingDispatch() = default; // private: the only instance is the one built inside Instance()
+    ~NpuProfilingDispatch() = default;
+    void init(); // assigns profStepInfo from c10::npu::acl::init_stepinfo()
+    void destroy(); // passes a non-null profStepInfo to c10::npu::acl::destroy_stepinfo()
 };
-TORCH_NPU_API bool check_fuzz_enable();
+
 }
-}
\ No newline at end of file
+}
+}
+
+#endif // __NPU_PROFILING_DISPATCH__
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp
index c8379a959aa4a47fad610ee4795cbe3d92184ad7..daa10e3ab1c6e331dc08102e8d7894a3a2ba8f16 100644
--- a/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp
+++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.cpp
@@ -24,43 +24,12 @@
 #include "ATen/native/npu/frame/StorageDescHelper.h"
 #include "KernelNpuOutputSize.h"
 #include <ATen/native/npu/contiguous/ContiguousOpt.h>
-#include "NpuFuzzyBlacklist.h"
-#include "ATen/native/GlobalStep.h"
+#include "ATen/native/npu/interface/EnvVariables.h"
 #include <set>
 
 namespace at {
 namespace native {
 namespace npu {
-namespace{
-  std::once_flag CompileOptOnceFlag;
-}
-
-void NpuUtils::SetCompileOptOnce() {
-  std::call_once(CompileOptOnceFlag, [](){
-    static std::map<const string, const aclCompileOpt> STRING_TYPE_TO_ACL_COMPILE_OPTION_MAP = {
-        {"ACL_OP_DEBUG_LEVEL", ACL_OP_DEBUG_LEVEL},
-        {"ACL_DEBUG_DIR", ACL_DEBUG_DIR},
-        {"ACL_OP_COMPILER_CACHE_MODE", ACL_OP_COMPILER_CACHE_MODE},
-        {"ACL_OP_COMPILER_CACHE_DIR", ACL_OP_COMPILER_CACHE_DIR},
-    };
-    for (const auto &iter : STRING_TYPE_TO_ACL_COMPILE_OPTION_MAP) {
-      auto key = iter.second;
-      auto val = c10::npu::GetOption(iter.first);
-      if (val.has_value()) {
-        aclSetCompileopt(key, val.value().c_str());
-      }
-    }
-    static std::set<std::string> STRING_COMPILE_OPTION_SET = {
-        {"NPU_FUZZY_COMPILE_BLACKLIST"},
-    };
-    for (const auto &iter : STRING_COMPILE_OPTION_SET) {
-      auto val = c10::npu::GetOption(iter);
-      if(check_fuzz_enable() && val.has_value())
-        FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val.value());
-    }
-    
-  });
-}
 
 void NpuUtils::format_fresh_view(
     Tensor& x,
diff --git a/src/aten/src/ATen/native/npu/utils/NpuUtils.h b/src/aten/src/ATen/native/npu/utils/NpuUtils.h
index dc33b853df3daf2b37d89551fa7ba8d770696c8c..d9797e289977defac21ded7f2ed0793debf6ec5c 100644
--- a/src/aten/src/ATen/native/npu/utils/NpuUtils.h
+++ b/src/aten/src/ATen/native/npu/utils/NpuUtils.h
@@ -49,10 +49,7 @@ typedef enum MemoryType{
 
 class NpuUtils {
  public:
-  /**
-    This API is used to set compile option.
-    */
-  CAFFE2_API static void SetCompileOptOnce();
+
   static bool check_match(const Tensor* tensor);
   static Tensor format_contiguous(const Tensor& src);
   static Tensor format_contiguous_add_copy_optimize(const Tensor& src);
diff --git a/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h b/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h
index 8e0fde12e4496af33224ad177d39dbd6183b9316..689143c8ee5ab64579de6a1696bb622b82d85bdd 100644
--- a/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h
+++ b/src/aten/src/ATen/native/npu/utils/OpPipeWithMultiOut.h
@@ -116,8 +116,7 @@ class OpPipeWithMultiOut {
     OpPreparation::CheckOut(
         inputs,
         std::get<index>(this->funcParams),
-        CalcuOpUtil::get_tensor_npu_format(src),
-        src.scalar_type(),
+        src,
         size);
     return *this;
   }
diff --git a/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp b/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp
index 2c6cd8d666dc2f640d43e6113a3a284384449a7a..0d256094c7c169edacb7851c5f0d23dbeab0f8a0 100644
--- a/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp
+++ b/src/aten/src/ATen/native/npu/utils/OpPreparation.cpp
@@ -110,6 +110,19 @@ void OpPreparation::CheckOut(
       dst.sizes());
 }
 
+void OpPreparation::CheckOut(
+    const std::initializer_list<Tensor>& inputs,
+    Tensor& output,
+    Tensor dst,
+    IntArrayRef shape) {
+  // Convenience overload: dst supplies the NPU format and scalar type that
+  // are forwarded, with shape, to the explicit-format CheckOut overload.
+  auto dstFormat = CalcuOpUtil::get_tensor_npu_format(dst);
+  auto dstDtype = dst.scalar_type();
+  CheckOut(
+      inputs, output, dstFormat, dstDtype, shape);
+}
+
 void OpPreparation::CheckOut(
     const std::initializer_list<Tensor>& input,
     Tensor& output,
diff --git a/src/aten/src/ATen/native/npu/utils/OpPreparation.h b/src/aten/src/ATen/native/npu/utils/OpPreparation.h
index 0f11af4270fa19d39839197f106a2f611c511708..02bc66f3f322dc56f066c26cc8d8dc0cd7de9d84 100644
--- a/src/aten/src/ATen/native/npu/utils/OpPreparation.h
+++ b/src/aten/src/ATen/native/npu/utils/OpPreparation.h
@@ -54,6 +54,10 @@ public:
   static void CheckOut(
       const std::initializer_list<Tensor>& inputs,
       Tensor& output, Tensor dst);
+  static void CheckOut(
+      const std::initializer_list<Tensor>& inputs,
+      Tensor& output, Tensor dst,
+      IntArrayRef shape);
   static void CheckOut(
       const std::initializer_list<Tensor>& input,
       Tensor& output, int64_t format,
diff --git a/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp b/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp
index a57145af965ed1ac6ac919c745918899bb1b2896..cd80ef4b050b00216d704c3ce228c138f3c3d0bf 100644
--- a/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp
+++ b/src/aten/src/ATen/native/npu/utils/OpTemplate.cpp
@@ -14,7 +14,7 @@
 // limitations under the License.
 
 #include "OpTemplate.h"
-#include "ATen/native/GlobalStep.h"
+#include "ATen/native/npu/interface/EnvVariables.h"
 #include "ATen/native/npu/frame/OpCmdHelper.h"
 #include "ATen/native/npu/frame/FormatHelper.h"
 #include "ATen/native/npu/frame/OpParamMaker.h"
@@ -36,7 +36,7 @@ TransDataOpCommand& TransDataOpCommand::AddInputAndOutput(const Tensor& input, c
   std::tuple<aclTensorDesc*, aclDataBuffer*, int64_t, aclFormat> in;
   std::tuple<aclTensorDesc*, aclDataBuffer*, int64_t, aclFormat> out;
 
-   if (!c10::npu::OptionsManager::CheckDynamicEnable() && check_fuzz_enable()) {
+   if (!c10::npu::OptionsManager::CheckDynamicEnable() && env::CheckFuzzyEnable()) {
     in = OpCmdHelper::CovertTensorToAclInput(input, c10::nullopt, "", "");
     out = OpCmdHelper::CovertTensorToAclInput(output, c10::nullopt, "", "");
   } else {
diff --git a/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp b/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp
index e2c9a5d18620057b793c33c318378194351d54a1..c7846701dfa81788ea3236bdbd90e1942ff3cca4 100644
--- a/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp
+++ b/src/aten/src/THNPU/THNPUCachingHostAllocator.cpp
@@ -171,13 +171,17 @@ struct HostAllocator {
     while (!npu_events.empty()) {
       auto& e = npu_events.front();
       aclrtEvent event = e.first;
-      aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
-      aclError err = aclrtQueryEvent(event, &status);
-      if (status == ACL_EVENT_STATUS_NOT_READY) {
+      c10::npu::acl::aclrtEventWaitStatus waitStatus = c10::npu::acl::ACL_EVENT_WAIT_STATUS_RESERVED;
+      aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED;
+      aclError err = c10::npu::acl::AclQueryEventStatus(event, &waitStatus, &recordStatus);
+      if (err != ACL_ERROR_NONE) {
+          return err;
+      }
+      if ((waitStatus != c10::npu::acl::ACL_EVENT_WAIT_STATUS_COMPLETE) &&
+        (recordStatus != ACL_EVENT_STATUS_COMPLETE)) {
         break;
-      } else if (err != ACL_ERROR_NONE) {
-        return err;
       }
+
       err = aclrtDestroyEvent(event);
       if (err != ACL_ERROR_NONE) {
         return err;
diff --git a/src/c10/npu/NPUCachingAllocator.cpp b/src/c10/npu/NPUCachingAllocator.cpp
index a9f6cc911a4a2a2f554b14127e2019d21c094508..f179b6b23d50fbe7d6e31cc2c8109259305d78e9 100644
--- a/src/c10/npu/NPUCachingAllocator.cpp
+++ b/src/c10/npu/NPUCachingAllocator.cpp
@@ -891,13 +891,15 @@ struct THNCachingAllocator {
       aclrtEvent event = e.first;
       Block* block = e.second;
 
-      aclrtEventStatus status;
-      aclError err = aclrtQueryEvent(event, &status);
-      if (status == ACL_EVENT_STATUS_NOT_READY) {
-        // ignore if not ready
+      acl::aclrtEventWaitStatus waitStatus = acl::ACL_EVENT_WAIT_STATUS_RESERVED;
+      aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED;
+      aclError err = acl::AclQueryEventStatus(event, &waitStatus, &recordStatus);
+      if (err != ACL_ERROR_NONE) {
+           C10_NPU_CHECK(err);
+      }
+      if ((waitStatus != acl::ACL_EVENT_WAIT_STATUS_COMPLETE) &&
+        (recordStatus != ACL_EVENT_STATUS_COMPLETE)) {
         break;
-      } else if (err != ACL_ERROR_NONE) {
-        C10_NPU_CHECK(err);
       }
 
       aclrtDestroyEvent(event);
diff --git a/src/c10/npu/NPUEventManager.cpp b/src/c10/npu/NPUEventManager.cpp
index a59cb2f95cf332aae1eb52321c789aafce5674e1..d847472eb1a926f90151d23763dd3f3b8d13aba0 100644
--- a/src/c10/npu/NPUEventManager.cpp
+++ b/src/c10/npu/NPUEventManager.cpp
@@ -28,13 +28,16 @@ aclError NPUEventManager::LazyDestroy(aclrtEvent npu_event) {
   while (!npu_events_.empty())
   {
     aclrtEvent event = npu_events_.front();
-    aclrtEventStatus status;
-    aclError err = aclrtQueryEvent(event, &status);
-    if (status != ACL_EVENT_STATUS_COMPLETE) {
-      break;
-    } else if (err != ACL_ERROR_NONE) {
+    acl::aclrtEventWaitStatus waitStatus = acl::ACL_EVENT_WAIT_STATUS_RESERVED;
+    aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED;
+    aclError err = acl::AclQueryEventStatus(event, &waitStatus, &recordStatus);
+    if (err != ACL_ERROR_NONE) {
         return err;
     }
+    if ((waitStatus != acl::ACL_EVENT_WAIT_STATUS_COMPLETE) &&
+      (recordStatus != ACL_EVENT_STATUS_COMPLETE)) {
+      break;
+    }
     err = aclrtDestroyEvent(event);
     if (err != ACL_ERROR_NONE) {
         return err;
diff --git a/src/c10/npu/NPUStream.cpp b/src/c10/npu/NPUStream.cpp
index baa1ebf6c510ce6883abc5b35227742f18e3606d..d9a54eb713851da95428a96eee2fcbc0897fbbff 100644
--- a/src/c10/npu/NPUStream.cpp
+++ b/src/c10/npu/NPUStream.cpp
@@ -63,7 +63,7 @@ struct LeakyStreamInternals {
 
 // Global stream state and constants
 static DeviceIndex num_npus = -1;
-static constexpr int kStreamsPerPoolBits = 5;
+static constexpr int kStreamsPerPoolBits = 3;
 static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
 // static constexpr unsigned int kDefaultFlags = npuStreamNonBlocking;
 
diff --git a/src/c10/npu/OptionsManager.h b/src/c10/npu/OptionsManager.h
index 62e3e156290361d9415d4aed10a845125db7fdf4..348f63b4355a5430c4c87684dfdc675e2612587e 100644
--- a/src/c10/npu/OptionsManager.h
+++ b/src/c10/npu/OptionsManager.h
@@ -38,7 +38,6 @@ class OptionsManager {
   static bool CheckUseNpuLogEnable();
   static bool CheckDynamicOnly();
   static std::string CheckDisableDynamicPath();
-
  private:
   static int GetBoolTypeOption(const char* env_str);
 };
diff --git a/src/c10/npu/interface/AclInterface.cpp b/src/c10/npu/interface/AclInterface.cpp
index 3121ae3f625a0a0e605392991da8d0b37973d347..84152a66361bf618980524c5825a3ac1016df709 100644
--- a/src/c10/npu/interface/AclInterface.cpp
+++ b/src/c10/npu/interface/AclInterface.cpp
@@ -17,7 +17,6 @@
 #include "AclInterface.h"
 #include "c10/npu/register/FunctionLoader.h"
 #include "c10/util/Exception.h"
-#include <iostream>
 
 namespace c10 {
 namespace npu {
@@ -32,6 +31,54 @@ namespace acl {
 REGISTER_LIBRARY(libascendcl)
 LOAD_FUNCTION(aclGetRecentErrMsg)
 LOAD_FUNCTION(aclrtCreateEventWithFlag)
+LOAD_FUNCTION(aclrtQueryEventWaitStatus)
+LOAD_FUNCTION(aclprofCreateStepInfo)
+LOAD_FUNCTION(aclprofGetStepTimestamp)
+LOAD_FUNCTION(aclprofDestroyStepInfo)
+
+aclprofStepInfoPtr init_stepinfo(){
+  // Look up aclprofCreateStepInfo through GET_FUNC on first use and cache it.
+  using CreateStepInfoFn = aclprofStepInfoPtr(*)();
+  static CreateStepInfoFn createStepInfo = nullptr;
+  if(!createStepInfo){
+    createStepInfo = (CreateStepInfoFn)GET_FUNC(aclprofCreateStepInfo);
+  }
+  TORCH_CHECK(createStepInfo, "Failed to find function ", "aclprofCreateStepInfo");
+  return createStepInfo();
+}
+
+NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo){
+  // Look up aclprofDestroyStepInfo through GET_FUNC on first use and cache it.
+  using DestroyStepInfoFn = NpdStatus(*)(aclprofStepInfoPtr);
+  static DestroyStepInfoFn destroyStepInfo = nullptr;
+  if(!destroyStepInfo){
+    destroyStepInfo = (DestroyStepInfoFn)GET_FUNC(aclprofDestroyStepInfo);
+  }
+  TORCH_CHECK(destroyStepInfo, "Failed to find function ", "aclprofDestroyStepInfo");
+  return destroyStepInfo(stepInfo);
+}
+
+NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream){
+  // Look up aclprofGetStepTimestamp through GET_FUNC on first use and cache it.
+  using StepTimestampFn = NpdStatus(*)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream);
+  static StepTimestampFn getStepTimestamp = nullptr;
+  if(!getStepTimestamp){
+    getStepTimestamp = (StepTimestampFn)GET_FUNC(aclprofGetStepTimestamp);
+  }
+  TORCH_CHECK(getStepTimestamp, "Failed to find function ", "aclprofGetStepTimestamp");
+  return getStepTimestamp(stepInfo, stepTag, stream);
+}
+
+NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream){
+  // Same ACL entry point as start_deliver_op; the caller's stepTag tells them apart.
+  using StepTimestampFn = NpdStatus(*)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream);
+  static StepTimestampFn getStepTimestamp = nullptr;
+  if(!getStepTimestamp){
+    getStepTimestamp = (StepTimestampFn)GET_FUNC(aclprofGetStepTimestamp);
+  }
+  TORCH_CHECK(getStepTimestamp, "Failed to find function ", "aclprofGetStepTimestamp");
+  return getStepTimestamp(stepInfo, stepTag, stream);
+}
 
 const char *AclGetErrMsg()
 {
@@ -56,6 +103,19 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) {
   return func(event, flag);
 }
 
+aclError AclQueryEventStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus, aclrtEventStatus *recordStatus)
+{
+  // Prefer aclrtQueryEventWaitStatus when GET_FUNC can resolve it; only
+  // waitStatus is passed on that path. Otherwise fall back to
+  // aclrtQueryEvent, which is given recordStatus instead.
+  using QueryWaitStatusFn = aclError (*)(aclrtEvent, aclrtEventWaitStatus *);
+  static QueryWaitStatusFn queryWaitStatus = nullptr;
+  if (!queryWaitStatus) {
+    queryWaitStatus = (QueryWaitStatusFn)GET_FUNC(aclrtQueryEventWaitStatus);
+  }
+  return queryWaitStatus ? queryWaitStatus(event, waitStatus)
+                         : aclrtQueryEvent(event, recordStatus);
+}
 } // namespace acl
 } // namespace npu
 } // namespace c10
diff --git a/src/c10/npu/interface/AclInterface.h b/src/c10/npu/interface/AclInterface.h
index 9a8a19549752498a98dffe6c3ff24934bb2daef0..7a9a5a4f532ef36d57ab120fd80d82a15093d4ed 100644
--- a/src/c10/npu/interface/AclInterface.h
+++ b/src/c10/npu/interface/AclInterface.h
@@ -12,11 +12,47 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <third_party/acl/inc/acl/acl.h>
+
+#ifndef __C10_NPU_INTERFACE_ACLINTERFACE__
+#define __C10_NPU_INTERFACE_ACLINTERFACE__
+
+#include "third_party/acl/inc/acl/acl_rt.h"
+#include <third_party/acl/inc/acl/acl_base.h>
 
 namespace c10 {
 namespace npu {
 namespace acl {
+typedef enum aclrtEventWaitStatus { // status type written by the dynamically loaded aclrtQueryEventWaitStatus
+    ACL_EVENT_WAIT_STATUS_COMPLETE  = 0,
+    ACL_EVENT_WAIT_STATUS_NOT_READY = 1,
+    ACL_EVENT_WAIT_STATUS_RESERVED  = 0xffff, // callers use this as the initial value before querying
+} aclrtEventWaitStatus;
+
+/**
+  aclprofStepInfo is provided by acl; it is used to store dispatch op info.
+ */
+using aclprofStepInfoPtr = aclprofStepInfo *;
+/**
+ NpdStatus is the integer status code returned by the profiling wrappers below.
+ */
+using NpdStatus = int;
+
+/**
+  This API is used to init npd; it needs to be called once in the process.
+ */
+aclprofStepInfoPtr init_stepinfo();
+/**
+  This API is used to destroy npd; it needs to be called once in the process.
+ */
+NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo);
+/**
+  This API is used to start dispatching ops; it should be called after init.
+ */
+NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream);
+/**
+  This API is used to stop dispatching ops; it should be called after dispatch has been started.
+ */
+NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream);
 
 /**
   This API is used to get error msg
@@ -33,6 +69,13 @@ const char *AclGetErrMsg();
  * @retval OtherValues Failure
  */
 aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag);
+
+/**
+  This API is used to query status of event task
+  */
+aclError AclQueryEventStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus, aclrtEventStatus *recordStatus);
 } // namespace acl
 } // namespace npu
-} // namespace c10
\ No newline at end of file
+} // namespace c10
+
+#endif // __C10_NPU_INTERFACE_ACLINTERFACE__
\ No newline at end of file
diff --git a/src/c10/npu/register/FunctionLoader.cpp b/src/c10/npu/register/FunctionLoader.cpp
index d2dd63ce6122b897489ec0577b530b5e930b8016..0732476ec354ab1dfd5e35fd514e7f69ebb18b9c 100644
--- a/src/c10/npu/register/FunctionLoader.cpp
+++ b/src/c10/npu/register/FunctionLoader.cpp
@@ -57,7 +57,6 @@ void* FunctionLoader::Get(const std::string& name) {
 
   auto func = dlsym(this->handle, name.c_str());
   if (func == nullptr) {
-    AT_ERROR(dlerror());
     return nullptr;
   }
   this->registry[name] = func;
diff --git a/src/third_party/acl/inc/acl/acl.h b/src/third_party/acl/inc/acl/acl.h
index 50ebd624e47ddcc67d6d2b2d9edbdb0c5e9b59f8..41db19178ba11228eecf3fc993e7353b9e6fbb1f 100644
--- a/src/third_party/acl/inc/acl/acl.h
+++ b/src/third_party/acl/inc/acl/acl.h
@@ -60,15 +60,6 @@ ACL_FUNC_VISIBILITY aclError aclFinalize();
  */
 ACL_FUNC_VISIBILITY aclError aclrtGetVersion(int32_t *majorVersion, int32_t *minorVersion, int32_t *patchVersion);
 
-/**
- * @ingroup AscendCL
- * @brief get recent error message
- *
- * @retval null for failed
- * @retval OtherValues success
-*/
-ACL_FUNC_VISIBILITY const char *aclGetRecentErrMsg();
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/third_party/acl/inc/acl/acl_base.h b/src/third_party/acl/inc/acl/acl_base.h
index 7509784c9875ec4792cddc53b9a9680ec1872416..f7a77000f5d0554d6f804fabe5adce3bc64924d4 100644
--- a/src/third_party/acl/inc/acl/acl_base.h
+++ b/src/third_party/acl/inc/acl/acl_base.h
@@ -31,7 +31,7 @@ typedef int aclError;
 typedef uint16_t aclFloat16;
 typedef struct aclDataBuffer aclDataBuffer;
 typedef struct aclTensorDesc aclTensorDesc;
-
+typedef struct aclprofStepInfo aclprofStepInfo;
 static const int ACL_ERROR_NONE = 0;
 
 static const int ACL_ERROR_INVALID_PARAM = 100000;
@@ -146,6 +146,11 @@ typedef enum {
     ACL_MEMTYPE_HOST = 1,   
 } aclMemType;
 
+typedef enum {
+    ACL_STEP_START = 0,
+    ACL_STEP_END = 1,
+} aclprofStepTag;
+
 /**
  * @ingroup AscendCL
  * @brief Converts data of type aclFloat16 to data of type float
@@ -498,7 +503,6 @@ ACL_FUNC_VISIBILITY void aclAppLog(aclLogLevel logLevel, const char *func, const
     const char *fmt, ...);
 
 
-
 ACL_FUNC_VISIBILITY aclError aclSetTensorPlaceMent(aclTensorDesc *desc, aclMemType type);
 
 #define ACL_APP_LOG(level, fmt, ...) \
diff --git a/src/third_party/acl/libs/acl.cpp b/src/third_party/acl/libs/acl.cpp
index cf652f7e9ae06a82082de37a240f0b8ca95bbcae..e517148e2507aeb0620a061b26a59a2981150a2e 100644
--- a/src/third_party/acl/libs/acl.cpp
+++ b/src/third_party/acl/libs/acl.cpp
@@ -67,6 +67,4 @@ aclFormat aclGetTensorDescFormat(const aclTensorDesc *desc) {return ACL_FORMAT_N
 const char *aclGetTensorDescName(aclTensorDesc *desc) {return NULL;}
 
 aclError aclSetTensorPlaceMent(aclTensorDesc *desc, aclMemType type) {return 0;};
-
-const char *aclGetRecentErrMsg() {return NULL;}
 }
diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml
index 90f11a38cf7c465cfcbabad70120225e7c2bda2f..046aad5032c2ef0e38c53ab8b859e2703fd5cf9d 100644
--- a/src/tools/autograd/derivatives.yaml
+++ b/src/tools/autograd/derivatives.yaml
@@ -1679,9 +1679,9 @@
 - name: npu_confusion_transpose(Tensor self, int[] perm, int[] shape, bool transpose_first) -> Tensor
   self: npu_confusion_transpose_backward(grad, perm, self.sizes(), !transpose_first)
 
-- name: npu_bmmV2(Tensor self, Tensor mat2) -> Tensor
-  self: grad.npu_bmmV2(mat2.transpose(-2, -1))
-  mat2: npu_bmmV2_mat2_backward(grad, self, mat2.sizes())
+- name: npu_bmmV2(Tensor self, Tensor mat2, int[] output_sizes) -> Tensor
+  self: npu_bmm_v2_mat1_backward(grad, self, mat2, self.sizes())
+  mat2: npu_bmm_v2_mat2_backward(grad, self, mat2, mat2.sizes())
 
 - name: npu_deformable_conv2d(Tensor input, Tensor weight, Tensor offset, Tensor? bias, int[2] kernel_size, int[] stride, int[] padding, int[] dilation=[1,1,1,1], int groups=1, int deformable_groups=1, bool modulated=True) -> (Tensor, Tensor)
   input, weight, offset, bias: npu_deformable_conv2dbk(input, grad, result1, weight, offset, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated)
diff --git a/src/torch/csrc/autograd/profiler_npu.cpp b/src/torch/csrc/autograd/profiler_npu.cpp
index 150a68fc40d5a09885530783dc8db2d60070f9b5..27a465ad6b9ea1ffa8290d59eaca5fdf53e3b297 100644
--- a/src/torch/csrc/autograd/profiler_npu.cpp
+++ b/src/torch/csrc/autograd/profiler_npu.cpp
@@ -36,9 +36,12 @@ static inline void npuCheck(aclError result, const char * file, int line) {
 
 struct NPUMethods : public CUDAStubs {
   void npu_destroy_event(aclrtEvent event) {
-    aclrtEventStatus status;
-    TORCH_NPU_CHECK(aclrtQueryEvent(event, &status));
-    if (status == ACL_EVENT_STATUS_COMPLETE) {
+    c10::npu::acl::aclrtEventWaitStatus waitStatus = c10::npu::acl::ACL_EVENT_WAIT_STATUS_RESERVED;
+    aclrtEventStatus recordStatus = ACL_EVENT_STATUS_RESERVED;
+    TORCH_NPU_CHECK(c10::npu::acl::AclQueryEventStatus(event, &waitStatus, &recordStatus));
+
+    if ((waitStatus == c10::npu::acl::ACL_EVENT_WAIT_STATUS_COMPLETE) ||
+      (recordStatus == ACL_EVENT_STATUS_COMPLETE)) {
         TORCH_NPU_CHECK(aclrtDestroyEvent(event));
     } else {
         std::cout << "Warning! NPU destroy event error, status is not completed." << std::endl;
diff --git a/src/torch/npu/__init__.py b/src/torch/npu/__init__.py
index 2a9d5c79f5d6d21240257ec2e86d54c817f1682a..0a494be8c749b378033e716bd684237543e82fee 100644
--- a/src/torch/npu/__init__.py
+++ b/src/torch/npu/__init__.py
@@ -161,14 +161,6 @@ def is_available():
         return False
     return device_count() > 0
 
-def set_option(option):
-    if not isinstance(option, dict):
-        raise TypeError("npu option must be a dict.")
-
-    for option_name, option_value in option.items():
-        option[option_name] = str(option_value)
-
-    torch._C._npu_setOption(option)
 
 class device(object):
     r"""Context-manager that changes the selected device.
@@ -288,28 +280,7 @@ if not hasattr(torch._C, '_NPUStreamBase'):
     torch._C.__dict__['_NPUStreamBase'] = _dummy_type('NPUStreamBase')
     torch._C.__dict__['_NPUEventBase'] = _dummy_type('NPUEventBase')
 
-
-def init_dump():
-    _lazy_init()
-    option = {}
-    option["mdldumpswitch"] = "init"
-    torch._C._npu_setOption(option)
-
-def set_dump(cfg_file):
-    if not os.path.exists(cfg_file):
-        raise AssertionError("cfg_file %s path not exists."%(cfg_file))
-    cfg_file = os.path.abspath(cfg_file)
-    _lazy_init()
-    option = {}
-    option["mdldumpconfigpath"] = cfg_file
-    torch._C._npu_setOption(option)
-
-def finalize_dump():
-    _lazy_init()
-    option = {}
-    option["mdldumpswitch"] = "finalize"
-    torch._C._npu_setOption(option)
-
 from .memory import *
 
-from .streams import Stream, Event
\ No newline at end of file
+from .streams import Stream, Event
+from .npu_frontend_enhance import *
\ No newline at end of file
diff --git a/src/torch/npu/npu_frontend_enhance.py b/src/torch/npu/npu_frontend_enhance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8e410fbe0aeb5ea65572770cad090892c151255
--- /dev/null
+++ b/src/torch/npu/npu_frontend_enhance.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch._C
+import os
+# This file enhances the NPU frontend API through set_option and related helpers.
+
+__all__ = ["set_option", "set_dump", "init_dump", "finalize_dump", "global_step_inc", "set_start_fuzz_compile_step", 
+           "iteration_start", "iteration_end"]
+
+def set_option(option):
+    if not isinstance(option, dict):
+        raise TypeError("npu option must be a dict.")
+
+    for option_name, option_value in option.items():
+        option[option_name] = str(option_value)
+
+    torch._C._npu_setOption(option)
+
+def init_dump():
+    option = {"mdldumpswitch":"enable"}
+    torch._C._npu_setOption(option)
+
+def set_dump(cfg_file):
+    if not os.path.exists(cfg_file):
+        raise AssertionError("cfg_file %s path not exists."%(cfg_file))
+    cfg_file = os.path.abspath(cfg_file)
+    option = {"mdldumpconfigpath": cfg_file}
+    torch._C._npu_setOption(option)
+
+def finalize_dump():
+    option = {"mdldumpswitch": "disable"}
+    torch._C._npu_setOption(option)
+
+def iteration_start():
+    option = {"deliverswitch": "enable"}
+    torch._C._npu_setOption(option)
+
+def iteration_end():
+    option = {"deliverswitch": "disable"}
+    torch._C._npu_setOption(option)
+
+_GLOBAL_STEP=0
+_START_FUZZ_COMPILE_STEP=1
+def global_step_inc():
+    """Advance the global step; enable fuzzy compile once the start step is reached, else disable."""
+    global _GLOBAL_STEP
+    _GLOBAL_STEP += 1
+    fuzz_state = "enable" if _GLOBAL_STEP >= _START_FUZZ_COMPILE_STEP else "disable"
+    option = {"fuzzycompileswitch": fuzz_state}
+    torch._C._npu_setOption(option)
+
+def set_start_fuzz_compile_step(step):
+    """Set the step at which global_step_inc() enables fuzzy compile; disables it for now."""
+    if not isinstance(step, int):
+        raise TypeError("step must be an int, but got {}".format(type(step)))
+    global _START_FUZZ_COMPILE_STEP
+    _START_FUZZ_COMPILE_STEP = step
+    option = {"fuzzycompileswitch": "disable"}
+    torch._C._npu_setOption(option)
\ No newline at end of file
diff --git a/test/test_npu/test_conv_tbc_backward.py b/test/test_npu/test_conv_tbc_backward.py
index 8297bb2ea77411139be924e8b66028f1e080f073..3032ad485915862fb74cbb6f136456e562c67bee 100644
--- a/test/test_npu/test_conv_tbc_backward.py
+++ b/test/test_npu/test_conv_tbc_backward.py
@@ -97,5 +97,4 @@ class TestConvTbcBackward(TestCase):
 
 instantiate_device_type_tests(TestConvTbcBackward, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
diff --git a/test/test_npu/test_instance_norm.py b/test/test_npu/test_instance_norm.py
deleted file mode 100644
index f31f8f6ef5fe312f600ddb75b9b5e97585c454f2..0000000000000000000000000000000000000000
--- a/test/test_npu/test_instance_norm.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIn2d(TestCase):
-    def cpu_op_exec(self, input1, weight, cpu_bias, cpu_running_mean, cpu_running_var, use_input_stats, momentum, epsilon):
-        output = torch.instance_norm(input1, weight, cpu_bias, cpu_running_mean, cpu_running_var, use_input_stats, momentum, epsilon, cudnn_enabled = False)
-        return output.numpy()
-
-    def npu_op_exec(self, input1, weight, npu_bias, npu_running_mean, npu_running_var, use_input_stats, momentum, epsilon):
-        output = torch.instance_norm(input1, weight, npu_bias, npu_running_mean, npu_running_var, use_input_stats, momentum, epsilon, cudnn_enabled = False)
-        output = output.to("cpu")
-        return output.numpy()
-
-    def test_instance_norm_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, (2, 20, 8, 10)], [np.float32, 0,  (20)], [np.float32, 0,  (20)], [np.float32, 0,  (20)], [np.float32,  0, (20)], False, 0.1, 0.0001],
-            [[np.float32, 0, (2, 8, 10, 7)], [np.float32, 0,  (8)], [np.float32, 0,  (8)], [np.float32, 0,  (8)], [np.float32,  0, (8)], True, 0.1, 0.0001],
-            [[np.float32, 0, (2, 10, 20)], [np.float32, -1, (10,)], [np.float32, -1, (10,)],[np.float32, -1, (10,)], [np.float32, -1, (10,)], True, 0.1, 0.0001],
-            [[np.float32, 3, (6,  20, 2, 3)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], False, 0.1, 0.0001],
-            [[np.float32, 3, (6,  20, 2, 3)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], [np.float32, 3, (20)], True, 0.1, 0.0001],
-            [[np.float32, 3, (2, 2, 2, 2)], [np.float32, -1, (2,)], [np.float32, -1, (2,)],[np.float32, -1, (2,)], [np.float32, -1, (2,)], True, 0.1, 0.0001]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 20)
-            cpu_input_weight, npu_input_weight = create_common_tensor(item[1],  1, 10)
-            cpu_bias, npu_bias = create_common_tensor(item[2], 1, 10)
-            cpu_running_mean, npu_running_mean = create_common_tensor(item[3], 1, 10)
-            cpu_running_var, npu_running_var = create_common_tensor(item[4], 1, 10)
-            cpu_result = self.cpu_op_exec(cpu_input, cpu_input_weight, cpu_bias, cpu_running_mean, cpu_running_var, item[5], item[6], item[7])
-            npu_result = self.npu_op_exec(npu_input, npu_input_weight, npu_bias, npu_running_mean, npu_running_var, item[5], item[6], item[7])
-            self.assertRtolEqual(cpu_result, npu_result)
-
-    def test_instance_norm_fp16_shape_format(self, device):
-        shape_format = [
-            [[np.float16, 0, (2, 15, 4, 2)], [np.float16, 0,  (15)], [np.float16, 0,  (15)], [np.float16, 0,  (15)], [np.float16,  0, (15)], False, 0.1, 0.0001],
-            [[np.float16, 0, (2, 30, 4, 2)], [np.float16, 0,  (30)], [np.float16, 0,  (30)], [np.float16, 0,  (30)], [np.float16,  0, (30)], True, 0.1, 0.0001],
-            [[np.float16, 0, (2, 10, 20)], [np.float16, -1, (10,)], [np.float16, -1, (10,)],[np.float16, -1, (10,)], [np.float16, -1, (10,)], True, 0.1, 0.0001],
-            [[np.float16, 3, (6,  20, 2, 3)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], False, 0.1, 0.0001],
-            [[np.float16, 3, (6,  20, 2, 3)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], [np.float16, 3, (20)], True, 0.1, 0.0001],
-            [[np.float16, 3, (2, 2, 2, 2)], [np.float16, -1, (2,)], [np.float16, -1, (2,)],[np.float16, -1, (2,)], [np.float16, -1, (2,)], True, 0.1, 0.0001]
-        ]
-        def cpu_op_fp16_exec(input1,
-                             weight,
-                             cpu_bias,
-                             cpu_running_mean,
-                             cpu_running_var,
-                             use_input_stats,
-                             momentum,
-                             epsilon):
-            input1 = input1.to(torch.float32)
-            weight = weight.to(torch.float32)
-            cpu_bias = cpu_bias.to(torch.float32)
-            cpu_running_mean = cpu_running_mean.to(torch.float32)
-            cpu_running_var = cpu_running_var.to(torch.float32)
-
-            output = torch.instance_norm(input1,
-                                         weight,
-                                         cpu_bias,
-                                         cpu_running_mean,
-                                         cpu_running_var,
-                                         use_input_stats,
-                                         momentum,
-                                         epsilon,
-                                         cudnn_enabled = False)
-            output = output.numpy()
-            return output.astype(np.float16)
-
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_input_weight, npu_input_weight = create_common_tensor(item[1],  1, 10)
-            cpu_bias, npu_bias = create_common_tensor(item[2], 1, 10)
-            cpu_running_mean, npu_running_mean = create_common_tensor(item[3], 1, 10)
-            cpu_running_var, npu_running_var = create_common_tensor(item[4], 1, 10)
-            cpu_result = cpu_op_fp16_exec(cpu_input, cpu_input_weight, cpu_bias, cpu_running_mean, cpu_running_var, item[5], item[6], item[7])
-            npu_result = self.npu_op_exec(npu_input, npu_input_weight, npu_bias, npu_running_mean, npu_running_var, item[5], item[6], item[7])
-            self.assertRtolEqual(cpu_result, npu_result)
-
-instantiate_device_type_tests(TestIn2d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_instancenorm.py b/test/test_npu/test_instancenorm.py
deleted file mode 100644
index c1564541c49bb82257554f5633f1be607fb8b42f..0000000000000000000000000000000000000000
--- a/test/test_npu/test_instancenorm.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch 
-import numpy as np 
-import sys 
-import copy 
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestInstanceNorm(TestCase):
-
-    def generate_data(self, min, max, shape, dtype): 
-        x = np.random.uniform(min, max, shape).astype(dtype) 
-        w = np.random.uniform(min, max, shape).astype(dtype) 
-        b = np.random.uniform(min, max, shape).astype(dtype) 
-        rm = np.random.uniform(min, max, shape).astype(dtype) 
-        rv = np.random.uniform(min, max, shape).astype(dtype) 
-     
-        #modify from numpy.ndarray to torch.tensor 
-        npu_x = torch.from_numpy(x) 
-        npu_w = torch.from_numpy(w) 
-        npu_b = torch.from_numpy(b) 
-        npu_rm = torch.from_numpy(rm)
-        npu_rv = torch.from_numpy(rv) 
-         
-        return npu_x, npu_w,npu_b,npu_rm,npu_rv
-         
-    def generate_single_data(self, min, max, shape, dtype): 
-        input1 = np.random.uniform(min, max, shape).astype(dtype) 
-        npu_input1 = torch.from_numpy(input1) 
-         
-        return npu_input1 
-     
-     
-    def generate_three_data(self, min, max, shape, dtype): 
-        input1 = np.random.uniform(min, max, shape).astype(dtype) 
-        input2 = np.random.uniform(min, max, shape).astype(dtype) 
-        input3 = np.random.uniform(min, max, shape).astype(dtype) 
-     
-        #modify from numpy.ndarray to torch.tensor 
-        npu_input1 = torch.from_numpy(input1) 
-        npu_input2 = torch.from_numpy(input2) 
-        npu_input3 = torch.from_numpy(input3) 
-         
-        return npu_input1, npu_input2, npu_input3 
-     
-     
-    def cpu_op_exec(self, x, w,b,rm,rv,use_input_stats, momentum, eps): 
-        axis = []
-        for i in range(2,len(x.shape)):
-            axis.append(i)
-        mean = np.mean(x, tuple(axis), keepdims=True)
-        var = np.var(x, tuple(axis), keepdims=True)
-
-        if input_use ==True:
-            mean = (mean-momentum*mean) + momentum*rm
-            var = (var-momentum*var) + momentum*rv
-            print("11")
-            y = (x - mean)/np.sqrt(var + eps)
-            output = w*y + b
-        else:
-            y = (x - mean)/np.sqrt(var + eps)
-            output = w*y + b
-        output = output.numpy() 
-        return output 
-     
-     
-    def npu_op_exec(self, x, w,b,rm,rv,use_input_stats, momentum, eps): 
-        x = x.to("npu") 
-        w = w.to("npu") 
-        b = b.to("npu") 
-        rm = rm.to("npu") 
-        rv = rv.to("npu") 
-        axis = []
-        for i in range(2,len(x.shape)):
-            axis.append(i)
-        mean = np.mean(x, tuple(axis), keepdims=True)
-        var = np.var(x, tuple(axis), keepdims=True)
-
-        if input_use ==True:
-            mean = (mean-momentum*mean) + momentum*rm
-            var = (var-momentum*var) + momentum*rv
-            print("11")
-            y = (x - mean)/np.sqrt(var + eps)
-            output = w*y + b
-        else:
-            y = (x - mean)/np.sqrt(var + eps)
-            output = w*y + b
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output 
-         
-     
-    def npu_op_exec_scalar(self, input1, input2): 
-        input1 = input1.to("npu") 
-        output = input1 + input2 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output 
-     
-     
-    def npu_op_exec_out(self, input1, input2, input3): 
-        input1 = input1.to("npu") 
-        input2 = input2.to("npu") 
-        output = input3.to("npu") 
-        torch.add(input1, input2, out=output) 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output 
-         
-    def test_add_float16(self, device):
-        npu_x, npu_w,npu_b,npu_rm,npu_rv = self.generate_data(0, 100, (5, 6, 7), np.float16) 
-        cpu_output = self.cpu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) 
-        npu_output = self.npu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def test_add_float32(self, device):
-        npu_x, npu_w,npu_b,npu_rm,npu_rv = self.generate_data(0, 100, (5, 6, 7), np.float32) 
-        cpu_output = self.cpu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) 
-        npu_output = self.npu_op_exec(npu_x, npu_w,npu_b,npu_rm,npu_rv,True, 0.1, 0.00001) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def test_add_float32_out(self, device):
-        npu_input1, npu_input2, npu_input3  = generate_three_data(0, 100, (4,3), np.float32) 
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2) 
-        npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def test_add_float32_broadcast(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (4,3,1), np.float32) 
-        npu_input2 = self.generate_single_data(0, 100, (4,1,5), np.float32) 
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2) 
-        npu_output = self.npu_op_exec(npu_input1, npu_input2) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def test_add_int32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2,3), np.int32) 
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2) 
-        npu_output = self.npu_op_exec(npu_input1, npu_input2) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def test_add_scalar_float32(self, device):
-        npu_input1, _= self.generate_data(0, 100, (2,3), np.float32) 
-        cpu_output = self.cpu_op_exec(npu_input1, 1) 
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1) 
-        self.assertRtolEqual(cpu_output, npu_output)
-     
-     
-    def npu_uncontiguous_op_exec_scalar(self, input1, input2): 
-        input1 = input1.to("npu") 
-        input1 = input1.as_strided([2,2], [1,2], 1) 
-        output = torch.add(input1, input2) 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output 
-         
-    def cpu_uncontiguous_op_exec_scalar(self, input1, input2): 
-        input1 = input1.as_strided([2,2], [1,2], 1) 
-        output = torch.add(input1, input2) 
-        output = output.numpy() 
-        return output 
-         
-    def test_add_uncontiguous_float32_scalar(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (4,3), np.float32) 
-        cpu_input1 = copy.deepcopy(npu_input1) 
-        cpu_output = self.cpu_uncontiguous_op_exec_scalar(cpu_input1, 2) 
-        npu_output = self.npu_uncontiguous_op_exec_scalar(npu_input1, 2) 
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestInstanceNorm, globals(), except_for='cpu')     
-if __name__ == '__main__': 
-    torch.npu.set_device("npu:2") 
-    run_tests()
-
diff --git a/test/test_npu/test_multilabel_margin_loss_backward.py b/test/test_npu/test_multilabel_margin_loss_backward.py
deleted file mode 100644
index 30f22481f58db57753cf37df8f727cebecd90503..0000000000000000000000000000000000000000
--- a/test/test_npu/test_multilabel_margin_loss_backward.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from itertools import repeat, product
-
-class TestMultilabelMarginLossGrad(TestCase):
-
-    def generate_data(self, lo, hi, shape, dtype):
-        grad = np.random.uniform(lo, hi, (shape[0],)).astype(dtype)
-        predict = np.random.uniform(lo, hi, shape).astype(dtype)
-        npu_grad = torch.from_numpy(grad)
-        npu_predict = torch.from_numpy(predict)
-        return npu_grad, npu_predict
-    
-    def generate_target(self, lo, hi, shape, dtype):
-        target = np.random.randint(lo, hi, shape).astype(dtype)
-        npu_target = torch.from_numpy(target)
-        return npu_target
-    
-    def cpu_op_grad_exec(self, grad_output, predict, target, reduction):
-        predict.requires_grad = True
-        target = target.to(torch.int64)
-        out = torch.nn.functional.multilabel_margin_loss(input=predict, target=target, reduction=reduction)
-        if reduction == "none":
-            out.backward(grad_output)
-        else:
-            out.backward()
-        output = predict.grad.to(torch.float32).numpy()
-        return output
- 
-    def npu_op_grad_exec(self, grad_output, predict, target, reduction):
-        grad_output = grad_output.to("npu")
-        predict = predict.to("npu")
-        target = target.to("npu")
-        predict.requires_grad = True
-        out = torch.nn.functional.multilabel_margin_loss(input=predict, target=target, reduction=reduction)
-        if reduction == "none":
-            out.backward(grad_output)
-        else:
-            out.backward()
-        output = predict.grad.to("cpu").to(torch.float32).numpy()
-        return output
-
-    def test_multilabel_margin_loss_1(self, device):
-        for reduction in ["none", "mean", "sum"]:
-            grad, data = self.generate_data(-2, 2, (2, 4), np.float32)
-            target = self.generate_target(-1, 3, (2, 4), np.int32)
-            
-            data.requires_grad = False
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_multilabel_margin_loss_2(self, device):
-        for reduction in ["mean", "none", "sum"]:
-            grad, data = self.generate_data(-2, 2, (2, 9), np.float32)
-            target = self.generate_target(-1, 8, (2, 9), np.int32)
-            
-            data.requires_grad = False
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_multilabel_margin_loss_3(self, device):
-        for reduction in ["mean", "none", "sum"]:
-            grad, data = self.generate_data(-2, 2, (64, 147), np.float32)
-            target = self.generate_target(-1, 146, (64, 147), np.int32)
-            
-            data.requires_grad = False
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_multilabel_margin_loss_float16_1(self, device):
-        for reduction in ["mean", "none", "sum"]:
-            grad, data = self.generate_data(-2, 2, (2, 4), np.float16)
-            target = self.generate_target(-1, 3, (2, 4), np.int32)
-
-            data.requires_grad = False
-            grad = grad.to(torch.float32)
-            data = data.to(torch.float32)
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            grad = grad.to(torch.float16)
-            data = data.to(torch.float16)
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            cpu_output = cpu_output.astype(np.float16)
-            npu_output = npu_output.astype(np.float16)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_multilabel_margin_loss_float16_2(self, device):
-        for reduction in ["mean", "none", "sum"]:
-            grad, data = self.generate_data(-2, 2, (2, 9), np.float16)
-            target = self.generate_target(-1, 8, (2, 9), np.int32)
-            
-            data.requires_grad = False
-            grad = grad.to(torch.float32)
-            data = data.to(torch.float32)
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            grad = grad.to(torch.float16)
-            data = data.to(torch.float16)
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            cpu_output = cpu_output.astype(np.float16)
-            npu_output = npu_output.astype(np.float16)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_multilabel_margin_loss_float16_3(self, device):
-        for reduction in ["mean", "none", "sum"]:
-            grad, data = self.generate_data(-2, 2, (1, 79), np.float16)
-            target = self.generate_target(-1, 50, (1, 79), np.int32)
-            
-            data.requires_grad = False
-            grad = grad.to(torch.float32)
-            data = data.to(torch.float32)
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            grad = grad.to(torch.float16)
-            data = data.to(torch.float16)
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            cpu_output = cpu_output.astype(np.float16)
-            npu_output = npu_output.astype(np.float16)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_multilabel_margin_loss_float16_4(self, device):
-        for reduction in ["none", "sum", "mean"]:
-            grad, data = self.generate_data(-2, 2, (64, 147), np.float16)
-            target = self.generate_target(-1, 146, (64, 147), np.int32)
-            
-            data.requires_grad = False
-            grad = grad.to(torch.float32)
-            data = data.to(torch.float32)
-            cpu_output = self.cpu_op_grad_exec(grad, data, target, reduction)
-            data.requires_grad = False
-            grad = grad.to(torch.float16)
-            data = data.to(torch.float16)
-            npu_output = self.npu_op_grad_exec(grad, data, target, reduction)
-            
-            cpu_output = cpu_output.astype(np.float16)
-            npu_output = npu_output.astype(np.float16)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestMultilabelMarginLossGrad, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
diff --git a/test/test_npu/test_network_ops/run_tests.py b/test/test_npu/test_network_ops/run_tests.py
index 58f1a6454e431f514dc7cd67af76c8b443ef4f3a..69a7867d667ac0ac998e599d8cc201590f2bc404 100644
--- a/test/test_npu/test_network_ops/run_tests.py
+++ b/test/test_npu/test_network_ops/run_tests.py
@@ -42,18 +42,24 @@ def run_tests():
         import HTMLTestRunner
         with open(htmlFileName, "wb") as report_file:
             runner=HTMLTestRunner.HTMLTestRunner(stream=report_file, title='AllTest', description='all npu test case', verbosity=2)
-            runner.run(load_local_case(test_case_path))
+            result = runner.run(load_local_case(test_case_path))
+            if not result.wasSuccessful():
+                raise RuntimeError("Some cases of HTML unittest testset failed")
         print('report files path', htmlFileName)
     elif ENABLE_HTML_MX:
         print('start pytorch Multi HTML unittest testset...')
         import HtmlTestRunner
         runner=HtmlTestRunner.HTMLTESTRunner(output=test_report_path, verbosity=2)
-        runner=run(load_local_case(test_case_path))
+        result=runner.run(load_local_case(test_case_path))
+        if not result.wasSuccessful():
+            raise RuntimeError("Some cases of Multi HTML unittest testset failed")
     else:
         print('start pytorch TEXT unittest testset...')
         with open(txtFileName, "a") as report_file:
             runner=unittest.TextTestRunner(stream=report_file, verbosity=2)
-            runner.run(load_local_case(test_case_path))
+            result=runner.run(load_local_case(test_case_path))
+            if not result.wasSuccessful():
+                raise RuntimeError("Some cases TEXT unittest failed")
         print('report files path', txtFileName)
 
 if __name__=="__main__":
diff --git a/test/test_npu/test__Ixor__.py b/test/test_npu/test_network_ops/test__Ixor__.py
similarity index 100%
rename from test/test_npu/test__Ixor__.py
rename to test/test_npu/test_network_ops/test__Ixor__.py
diff --git a/test/test_npu/test___iand__.py b/test/test_npu/test_network_ops/test___iand__.py
similarity index 99%
rename from test/test_npu/test___iand__.py
rename to test/test_npu/test_network_ops/test___iand__.py
index d16107c8be5e949f02d39c4df45e382f367e4d6f..1270faeab0582fd9865cfefd88020d54725a8868 100644
--- a/test/test_npu/test___iand__.py
+++ b/test/test_npu/test_network_ops/test___iand__.py
@@ -134,5 +134,4 @@ class Test__Iand__(TestCase):
 
 instantiate_device_type_tests(Test__Iand__, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test__nnpack_spatial_convolution.py b/test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py
similarity index 99%
rename from test/test_npu/test__nnpack_spatial_convolution.py
rename to test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py
index daaed945793c43aa428931942e48aaf7e23e7abd..a89c9724848b1a55de65c325a11fad7adf835545 100644
--- a/test/test_npu/test__nnpack_spatial_convolution.py
+++ b/test/test_npu/test_network_ops/test__nnpack_spatial_convolution.py
@@ -138,7 +138,6 @@ class TestNnpackSpatialConvolution(TestCase):
 
 instantiate_device_type_tests(TestNnpackSpatialConvolution, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
 
 
diff --git a/test/test_npu/test_acos.py b/test/test_npu/test_network_ops/test_acos.py
similarity index 98%
rename from test/test_npu/test_acos.py
rename to test/test_npu/test_network_ops/test_acos.py
index 97bad337923a690f98a0beb328eaaf7a448e6f1b..bd03b4be44afd71929ec196694ce838c9e151e6f 100644
--- a/test/test_npu/test_acos.py
+++ b/test/test_npu/test_network_ops/test_acos.py
@@ -66,6 +66,5 @@ class TestAcos(TestCase):
 
 instantiate_device_type_tests(TestAcos, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
         
diff --git a/test/test_npu/test_adaptive_avg_pool2d_backward.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool2d_backward.py
similarity index 100%
rename from test/test_npu/test_adaptive_avg_pool2d_backward.py
rename to test/test_npu/test_network_ops/test_adaptive_avg_pool2d_backward.py
diff --git a/test/test_npu/test_adaptive_max_pool2d_backward.py b/test/test_npu/test_network_ops/test_adaptive_max_pool2d_backward.py
similarity index 100%
rename from test/test_npu/test_adaptive_max_pool2d_backward.py
rename to test/test_npu/test_network_ops/test_adaptive_max_pool2d_backward.py
diff --git a/test/test_npu/test_addbmm.py b/test/test_npu/test_network_ops/test_addbmm.py
similarity index 100%
rename from test/test_npu/test_addbmm.py
rename to test/test_npu/test_network_ops/test_addbmm.py
diff --git a/test/test_npu/test_addcdiv.py b/test/test_npu/test_network_ops/test_addcdiv.py
similarity index 100%
rename from test/test_npu/test_addcdiv.py
rename to test/test_npu/test_network_ops/test_addcdiv.py
diff --git a/test/test_npu/test_addmv.py b/test/test_npu/test_network_ops/test_addmv.py
similarity index 100%
rename from test/test_npu/test_addmv.py
rename to test/test_npu/test_network_ops/test_addmv.py
diff --git a/test/test_npu/test_addr.py b/test/test_npu/test_network_ops/test_addr.py
similarity index 100%
rename from test/test_npu/test_addr.py
rename to test/test_npu/test_network_ops/test_addr.py
diff --git a/test/test_npu/test_affine_grid_generator_backward.py b/test/test_npu/test_network_ops/test_affine_grid_generator_backward.py
similarity index 100%
rename from test/test_npu/test_affine_grid_generator_backward.py
rename to test/test_npu/test_network_ops/test_affine_grid_generator_backward.py
diff --git a/test/test_npu/test_asin.py b/test/test_npu/test_network_ops/test_asin.py
similarity index 98%
rename from test/test_npu/test_asin.py
rename to test/test_npu/test_network_ops/test_asin.py
index 54e32964b870ed52dc84ca4d629d458df8d610fb..537bbc12bc8f55719798de68a8a1d3c093dc1459 100644
--- a/test/test_npu/test_asin.py
+++ b/test/test_npu/test_network_ops/test_asin.py
@@ -62,5 +62,4 @@ class TestAsin(TestCase):
 
 instantiate_device_type_tests(TestAsin, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_bartlett_window.py b/test/test_npu/test_network_ops/test_bartlett_window.py
similarity index 98%
rename from test/test_npu/test_bartlett_window.py
rename to test/test_npu/test_network_ops/test_bartlett_window.py
index 2cfa2aefb345e048a6be4ba3233e826ecbf3ddea..4a9be1452610e1de63358d9996da3dcd89f74b09 100644
--- a/test/test_npu/test_bartlett_window.py
+++ b/test/test_npu/test_network_ops/test_bartlett_window.py
@@ -78,5 +78,4 @@ class TestBartlettWindow(TestCase):
 
 instantiate_device_type_tests(TestBartlettWindow, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
diff --git a/test/test_npu/test_batch_norm.py b/test/test_npu/test_network_ops/test_batch_norm.py
similarity index 100%
rename from test/test_npu/test_batch_norm.py
rename to test/test_npu/test_network_ops/test_batch_norm.py
diff --git a/test/test_npu/test_binary_cross_entropy_with_logits.py b/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits.py
similarity index 100%
rename from test/test_npu/test_binary_cross_entropy_with_logits.py
rename to test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits.py
diff --git a/test/test_npu/test_network_ops/test_bitwise_not.py b/test/test_npu/test_network_ops/test_bitwise_not.py
index b83234feb6ae4a97337082f8d55e108be006bea3..a10801063b1b47b54932dac4080ed43f9c79a6a5 100644
--- a/test/test_npu/test_network_ops/test_bitwise_not.py
+++ b/test/test_npu/test_network_ops/test_bitwise_not.py
@@ -101,5 +101,4 @@ class Test_Bitwise_Not(TestCase):
 
 instantiate_device_type_tests(Test_Bitwise_Not, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
diff --git a/test/test_npu/test_blackman_window.py b/test/test_npu/test_network_ops/test_blackman_window.py
similarity index 98%
rename from test/test_npu/test_blackman_window.py
rename to test/test_npu/test_network_ops/test_blackman_window.py
index 8a600bb0805ac1229cf9f7dad8ac6434e804cb2e..10b7dd27924eccd7f91ba4394d458d51dc5d5c30 100644
--- a/test/test_npu/test_blackman_window.py
+++ b/test/test_npu/test_network_ops/test_blackman_window.py
@@ -92,5 +92,4 @@ class TestBlackmanWindow(TestCase):
     
 instantiate_device_type_tests(TestBlackmanWindow, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_network_ops/test_bmm_v2.py b/test/test_npu/test_network_ops/test_bmm_v2.py
index 6ea92d3e4fba286eca8c3f43220bf3993bc94aa5..3590e3ecbd712854f5f1fe79e215c032b9f29886 100644
--- a/test/test_npu/test_network_ops/test_bmm_v2.py
+++ b/test/test_npu/test_network_ops/test_bmm_v2.py
@@ -25,7 +25,7 @@ class TestBatchMatMulV2(TestCase):
       return output
 
   def npu_op_exec(self, input1, input2):
-      output = torch.npu_bmmV2(input1, input2)
+      output = torch.npu_bmmV2(input1, input2, [])
       output = output.to("cpu")
       output = output.numpy()
       return output
diff --git a/test/test_npu/test_cast_Byte.py b/test/test_npu/test_network_ops/test_cast_Byte.py
similarity index 97%
rename from test/test_npu/test_cast_Byte.py
rename to test/test_npu/test_network_ops/test_cast_Byte.py
index c06faec158068025b17af25a9da53a50e4f54d5b..1393faf3234192127a35e94e07df13f2c7e77cd2 100644
--- a/test/test_npu/test_cast_Byte.py
+++ b/test/test_npu/test_network_ops/test_cast_Byte.py
@@ -20,7 +20,7 @@ import sys
 import copy
 from common_utils import TestCase, run_tests
 from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor,compare_res_new
+from util_test import create_common_tensor
 
 
 class TestCastByte(TestCase):
diff --git a/test/test_npu/test_cast_Char.py b/test/test_npu/test_network_ops/test_cast_Char.py
similarity index 100%
rename from test/test_npu/test_cast_Char.py
rename to test/test_npu/test_network_ops/test_cast_Char.py
diff --git a/test/test_npu/test_cast_Float.py b/test/test_npu/test_network_ops/test_cast_Float.py
similarity index 100%
rename from test/test_npu/test_cast_Float.py
rename to test/test_npu/test_network_ops/test_cast_Float.py
diff --git a/test/test_npu/test_cast_Half.py b/test/test_npu/test_network_ops/test_cast_Half.py
similarity index 100%
rename from test/test_npu/test_cast_Half.py
rename to test/test_npu/test_network_ops/test_cast_Half.py
diff --git a/test/test_npu/test_cast_Int.py b/test/test_npu/test_network_ops/test_cast_Int.py
similarity index 100%
rename from test/test_npu/test_cast_Int.py
rename to test/test_npu/test_network_ops/test_cast_Int.py
diff --git a/test/test_npu/test_cast_Long.py b/test/test_npu/test_network_ops/test_cast_Long.py
similarity index 100%
rename from test/test_npu/test_cast_Long.py
rename to test/test_npu/test_network_ops/test_cast_Long.py
diff --git a/test/test_npu/test_cast_Short.py b/test/test_npu/test_network_ops/test_cast_Short.py
similarity index 100%
rename from test/test_npu/test_cast_Short.py
rename to test/test_npu/test_network_ops/test_cast_Short.py
diff --git a/test/test_npu/test_cdist.py b/test/test_npu/test_network_ops/test_cdist.py
similarity index 99%
rename from test/test_npu/test_cdist.py
rename to test/test_npu/test_network_ops/test_cdist.py
index b7b0fd03b83d5e41331c302b1d3f78028c568ade..fa6553d8428219658417b6dc6e87f04db546d2cc 100644
--- a/test/test_npu/test_cdist.py
+++ b/test/test_npu/test_network_ops/test_cdist.py
@@ -190,5 +190,4 @@ class Testcdist(TestCase):
 
 instantiate_device_type_tests(Testcdist, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
diff --git a/test/test_npu/test_cdist_backward.py b/test/test_npu/test_network_ops/test_cdist_backward.py
similarity index 99%
rename from test/test_npu/test_cdist_backward.py
rename to test/test_npu/test_network_ops/test_cdist_backward.py
index d0ee34ce400a9198e4307d69978e6b897f4d0af4..a61f69d70e980c70ecec68a2637dcf9a5ec513da 100644
--- a/test/test_npu/test_cdist_backward.py
+++ b/test/test_npu/test_network_ops/test_cdist_backward.py
@@ -115,5 +115,4 @@ class Testcdist(TestCase):
 
 instantiate_device_type_tests(Testcdist, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
diff --git a/test/test_npu/test_celu.py b/test/test_npu/test_network_ops/test_celu.py
similarity index 64%
rename from test/test_npu/test_celu.py
rename to test/test_npu/test_network_ops/test_celu.py
index 1dc6a7c49bf27e4a706077f5613617cae9d39611..1ef1fe9f19cb4d27bb3741b8201f2a717d73f84b 100644
--- a/test/test_npu/test_celu.py
+++ b/test/test_npu/test_network_ops/test_celu.py
@@ -28,27 +28,49 @@ class TestCelu(TestCase):
         npu_input = torch.from_numpy(input_x)
         return npu_input
 
-    def cpu_op_exec(self, input1, alpha):
+    def cpu_op_exec_functional(self, input1, alpha):
         output = torch.nn.functional.celu(input1, alpha=alpha)
         output = output.numpy()
         return output
 
-    def npu_op_exec(self, input1, alpha):
+    def npu_op_exec_functional(self, input1, alpha):
         output = torch.nn.functional.celu(input1, alpha=alpha)
         output = output.to("cpu")
         output = output.numpy()
         return output
+    
+    def cpu_op_exec(self, input1, alpha):
+        output = torch.celu(input1, alpha=alpha)
+        output = output.numpy()
+        return output
 
-    def cpu_op_inplace_exec(self, input1, alpha):
+    def npu_op_exec(self, input1, alpha):
+        output = torch.celu(input1, alpha=alpha)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec_functional(self, input1, alpha):
         output = torch.nn.functional.celu_(input1, alpha=alpha)
         output = output.numpy()
         return output
 
-    def npu_op_inplace_exec(self, input1, alpha):
+    def npu_op_inplace_exec_functional(self, input1, alpha):
         output = torch.nn.functional.celu_(input1, alpha=alpha)
         output = output.to("cpu")
         output = output.numpy()
         return output
+    
+    def cpu_op_inplace_exec(self, input1, alpha):
+        output = torch.celu_(input1, alpha=alpha)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, alpha):
+        output = torch.celu_(input1, alpha=alpha)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
 
     def test_celu_3_3_float32_alpha1(self, device):
         input_x1 = self.generate_data(-1, 1, (3, 3), np.float32)
@@ -157,8 +179,58 @@ class TestCelu(TestCase):
             cpu_output = self.cpu_op_inplace_exec(cpu_input1, 2.0)
             npu_output = self.npu_op_inplace_exec(npu_input1, 2.0)
             self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_celu_inplace_shape_format_alpha_range(self, device):
+        shape_format_alpha_range = [
+            # Each case: [[dtype, format, shape], alpha, min, max]; min/max are forwarded to create_common_tensor.
+            [[np.float16, 2, (16, 5, 7, 11)], 5.6, -2, 2],
+            [[np.float32, 2, (16, 5, 7, 11)], 0.5, -2, 2],
+            [[np.float32, 2, (16, 5, 7, 11)], 0.7, -2, 2],
+            [[np.float32, 2, (16, 5, 7, 11)], 2.6, -2, 2],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.0078125, 0.0078125],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.0078125, 0.0078125],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.01, 0.01],
+            [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 0.5, -2, 2],
+            [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 5.4, -2, 2],
+            [[np.float16, 2, (23, 5, 11, 50, 26, 13, 1, 23)], 0.5, -2, 2],
+            [[np.float16, 2, (2560, 17)], 0.5, -2, 2],
+            [[np.float16, 2, (2560, 17)], 5.4, -2, 2]
+        ]
+        for item in shape_format_alpha_range:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[2], item[3])
+            alpha = item[1]
+            npu_output = self.npu_op_inplace_exec(npu_input1, alpha)  # in-place torch.celu_ on the NPU tensor
+            if item[0][0] == np.float16:
+                cpu_output = self.cpu_op_inplace_exec(cpu_input1.float(), alpha).astype(np.float16)  # CPU reference runs in float32, result cast back to float16
+            else:
+                cpu_output = self.cpu_op_inplace_exec(cpu_input1, alpha)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_celu_shape_format_alpha_range(self, device):  # out-of-place torch.celu: NPU result vs CPU reference
+        shape_format_alpha_range = [
+            # Each case: [[dtype, format, shape], alpha, min, max]; min/max are forwarded to create_common_tensor.
+            [[np.float32, 2, (16, 5, 7, 11)], 0.5, -2, 2],
+            [[np.float32, 2, (16, 5, 7, 11)], 0.7, -2, 2],
+            [[np.float32, 2, (16, 5, 7, 11)], 2.6, -2, 2],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.0078125, 0.0078125],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.0078125, 0.0078125],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.5, -0.01, 0.01],
+            [[np.float16, 2, (16, 136, 5, 4)], 0.7, -0.01, 0.01],
+            [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 0.5, -2, 2],
+            [[np.float16, 2, (176, 3, 67, 47, 5, 12)], 5.4, -2, 2],
+            [[np.float16, 2, (2560, 17)], 0.5, -2, 2],
+            [[np.float16, 2, (2560, 17)], 5.4, -2, 2]
+        ]
+        for item in shape_format_alpha_range:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[2], item[3])
+            alpha = item[1]
+            npu_output = self.npu_op_exec(npu_input1, alpha)
+            if item[0][0] == np.float16:
+                cpu_output = self.cpu_op_exec(cpu_input1.float(), alpha).astype(np.float16)  # CPU reference runs in float32, result cast back to float16
+            else:
+                cpu_output = self.cpu_op_exec(cpu_input1, alpha)
+            self.assertRtolEqual(cpu_output, npu_output)
 
 instantiate_device_type_tests(TestCelu, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/test/test_npu/test_conv_tbc.py b/test/test_npu/test_network_ops/test_conv_tbc.py
similarity index 90%
rename from test/test_npu/test_conv_tbc.py
rename to test/test_npu/test_network_ops/test_conv_tbc.py
index aeb8eca4a2500760ae6bc1781a7e0956ffec9d9e..47d799e33ad3c0b215a7bc41a0380faf90812fe8 100644
--- a/test/test_npu/test_conv_tbc.py
+++ b/test/test_npu/test_network_ops/test_conv_tbc.py
@@ -27,8 +27,6 @@ class TestConvTbc(TestCase):
     def op_exec_cpu(self, input1, weight, bias, pad):
         cpu_output = torch.conv_tbc(input1, weight, bias, pad)
         cpu_output = cpu_output.numpy().astype('float16')
-        print("===cpu_output===")
-        print(cpu_output)
         return cpu_output
 
     def op_exec_npu(self, input1, weight, bias, pad):
@@ -38,8 +36,6 @@ class TestConvTbc(TestCase):
         npu_output = torch.conv_tbc(input1, weight, bias, pad)
         npu_output = npu_output.to("cpu")
         npu_output = npu_output.numpy().astype('float16')
-        print("===npu_output===")
-        print(npu_output)
         return npu_output
 
     def test_conv_tbc_shape_format(self, device):
@@ -55,11 +51,8 @@ class TestConvTbc(TestCase):
         pad = 1
         cpu_output = self.op_exec_cpu(cpu_input, cpu_weight, cpu_bias, pad)
         npu_output = self.op_exec_npu(npu_input, npu_weight, npu_bias, pad)
-        res = abs((cpu_output - npu_output)/cpu_output)
-        print(res)
         self.assertRtolEqual(cpu_output, npu_output)
 
 instantiate_device_type_tests(TestConvTbc, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
diff --git a/test/test_npu/test_conv_transpose2d.py b/test/test_npu/test_network_ops/test_conv_transpose2d.py
similarity index 63%
rename from test/test_npu/test_conv_transpose2d.py
rename to test/test_npu/test_network_ops/test_conv_transpose2d.py
index e62981ef9af99b89a3c48d03905bc694d0095571..6e9b72b96802b44e25f7dcaaa6dafc7d0fff9bd6 100644
--- a/test/test_npu/test_conv_transpose2d.py
+++ b/test/test_npu/test_network_ops/test_conv_transpose2d.py
@@ -23,50 +23,62 @@ from util_test import create_common_tensor
 
 
 class TestConvTranspose2d(TestCase):
-    def cpu_op_exec(self, input, weight):
-        cpu_output = torch.nn.functional.conv_transpose2d(input, weight,bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1)
+    def cpu_op_exec(self, input, weight, groups):
+        cpu_output = torch.nn.functional.conv_transpose2d(input, weight,bias=None, 
+                            stride=1, padding=0, output_padding=0, groups=groups, dilation=1)
         cpu_output = cpu_output.numpy()
         return cpu_output
 
-    def cpu_op_exec_fp16(self, input, weight):
+    def cpu_op_exec_fp16(self, input, weight, groups):
         input = input.to(torch.float32)
         weight = weight.to(torch.float32)
-        cpu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1)
+        cpu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, 
+                            stride=1, padding=0, output_padding=0, groups=groups, dilation=1)
         cpu_output = cpu_output.numpy()
         cpu_output = cpu_output.astype(np.float16)
 
         return cpu_output
 
-    def npu_op_exec(self, input, weight):
+    def npu_op_exec(self, input, weight, groups):
         input = input.to("npu")
         weight = weight.to("npu")
-        npu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1)
+        npu_output = torch.nn.functional.conv_transpose2d(input, weight, bias=None, 
+                            stride=1, padding=0, output_padding=0, groups=groups, dilation=1)
         npu_output = npu_output.to("cpu").numpy()
 
         return npu_output
 
     def test_conv_transpose2d(self, device):
-        shape_format = [  # input, weight
-            [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]]],
-            [[np.float16, 3, [1024, 58, 28, 28]], [np.float16, 3, [58, 58, 1, 1]]],
-            [[np.float16, 4, [1024, 3, 224, 224]], [np.float16, 4, [3, 3, 3, 3]]],
-            [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]]],
-            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]]],
-            [[np.float16, 4, [1024, 58, 28, 28]], [np.float16, 4, [58, 58, 1, 1]]],
-            [[np.float16, 0, [1024, 24, 56, 56]], [np.float16, 4, [24, 24, 1, 1]]],
-            [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 4, [128, 128, 3, 3]]],
-            [[np.float32, 4, [256, 3, 224, 224]], [np.float32, 4, [3, 3, 7, 7]]],
-            [[np.float32, 3, [2, 3, 3, 3]], [np.float32, 4, [3, 1, 3, 3]]],
-            [[np.float32, 3, [1024, 232, 7, 7]], [np.float32, 4, [232, 232, 1, 1]]],
+        shape_format = [  
+            # input, weight
+            [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 1],
+            [[np.float16, 3, [1024, 58, 28, 28]], [np.float16, 3, [58, 58, 1, 1]], 1],
+            [[np.float16, 4, [1024, 3, 224, 224]], [np.float16, 4, [3, 3, 3, 3]], 1],
+            [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 1],
+            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]], 1],
+            [[np.float16, 4, [1024, 58, 28, 28]], [np.float16, 4, [58, 58, 1, 1]], 1],
+            [[np.float16, 0, [1024, 24, 56, 56]], [np.float16, 4, [24, 24, 1, 1]], 1],
+            [[np.float32, 0, [256, 128, 7, 7]], [np.float32, 4, [128, 128, 3, 3]], 1],
+            [[np.float32, 4, [256, 3, 224, 224]], [np.float32, 4, [3, 3, 7, 7]], 1],
+            [[np.float32, 3, [2, 3, 3, 3]], [np.float32, 4, [3, 1, 3, 3]], 1],
+            [[np.float32, 3, [1024, 232, 7, 7]], [np.float32, 4, [232, 232, 1, 1]], 1],
+            [[np.float16, 3, [1024, 116*3, 14, 14]], [np.float16, 4, [116*3, 150//3, 1, 1]], 3],
+            [[np.float16, 3, [1024, 58*2, 28, 28]], [np.float16, 3, [58*2, 58//2, 1, 1]], 2],
+            [[np.float16, 0, [1, 3*3, 224, 224]], [np.float16, 0, [3*3, 1, 3, 3]], 3],
+            [[np.float16, 0, [1024, 116*4, 14, 14]], [np.float16, 4, [116*4, 116//4, 1, 1]], 4],
+            [[np.float32, 3, [1024, 116*3, 14, 14]], [np.float32, 4, [116*3, 150//3, 1, 1]], 3],
+            [[np.float32, 3, [1024, 58*2, 28, 28]], [np.float32, 3, [58*2, 58//2, 1, 1]], 2],
+            [[np.float32, 0, [1, 3*3, 224, 224]], [np.float32, 0, [3*3, 1, 3, 3]], 3],
+            [[np.float32, 0, [1024, 116*4, 14, 14]], [np.float32, 4, [116*4, 116//4, 1, 1]], 4],
         ]
         for item in shape_format:
             input_cpu, input_npu = create_common_tensor(item[0], 0, 10)
             weight_cpu, weight_npu = create_common_tensor(item[1], 0, 10)
             if input_cpu.dtype == torch.float16:
-                cpu_output = self.cpu_op_exec_fp16(input_cpu, weight_cpu)
+                cpu_output = self.cpu_op_exec_fp16(input_cpu, weight_cpu, item[-1])
             else:
-                cpu_output = self.cpu_op_exec(input_cpu, weight_cpu)
-            npu_output = self.npu_op_exec(input_npu, weight_npu)
+                cpu_output = self.cpu_op_exec(input_cpu, weight_cpu, item[-1])
+            npu_output = self.npu_op_exec(input_npu, weight_npu, item[-1])
             # fp32精度不足，放宽对其精度要求
             self.assertRtolEqual(cpu_output, npu_output, prec=1.e-1)
 
diff --git a/test/test_npu/test_convolution_backward_weight.py b/test/test_npu/test_network_ops/test_convolution_backward_weight.py
similarity index 95%
rename from test/test_npu/test_convolution_backward_weight.py
rename to test/test_npu/test_network_ops/test_convolution_backward_weight.py
index de421a9552067d7bf36b3fa07342b4202ecbf83f..beaf25c285e64c0f8c1c81f48ab61f10e3a7f369 100644
--- a/test/test_npu/test_convolution_backward_weight.py
+++ b/test/test_npu/test_network_ops/test_convolution_backward_weight.py
@@ -17,7 +17,7 @@ import numpy as np
 import sys
 import copy
 from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
+from common_device_type import instantiate_device_type_tests
 from util_test import create_common_tensor
 
 
@@ -98,13 +98,12 @@ class TestCudnnConvolutionBackwardWeight(TestCase):
                                           item[3], item[4], item[5])
             cpu_output = cpu_output.astype(npu_output.dtype)
             cpu_dweight = cpu_dweight.to(npu_dweight.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_dweight, npu_dweight)
+            self.assertRtolEqual(cpu_output, npu_output, 0.007)
+            self.assertRtolEqual(cpu_dweight, npu_dweight, 0.003)
 
 
 instantiate_device_type_tests(TestCudnnConvolutionBackwardWeight,
                               globals(),
                               except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_convolution_transpose_backward_weight.py b/test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py
similarity index 99%
rename from test/test_npu/test_convolution_transpose_backward_weight.py
rename to test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py
index 76fc807c7166a17b2b23aba6a75a439b3156b93f..99f99300873518835324658c9ef154339769921e 100644
--- a/test/test_npu/test_convolution_transpose_backward_weight.py
+++ b/test/test_npu/test_network_ops/test_convolution_transpose_backward_weight.py
@@ -110,5 +110,4 @@ instantiate_device_type_tests(TestCudnnConvolutionTransposeBackwardWeight,
                               globals(),
                               except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_copy.py b/test/test_npu/test_network_ops/test_copy.py
similarity index 99%
rename from test/test_npu/test_copy.py
rename to test/test_npu/test_network_ops/test_copy.py
index 6b8e062e93f1a14ab549666bb84c6af8e5d72414..cf98bd4150d4c46173990ab6110bf3808883df42 100644
--- a/test/test_npu/test_copy.py
+++ b/test/test_npu/test_network_ops/test_copy.py
@@ -137,5 +137,4 @@ class TestCopy(TestCase):
 
 instantiate_device_type_tests(TestCopy, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
diff --git a/test/test_npu/test_cos.py b/test/test_npu/test_network_ops/test_cos.py
similarity index 98%
rename from test/test_npu/test_cos.py
rename to test/test_npu/test_network_ops/test_cos.py
index 6756d47f57c90f7d2484bf5656958c02180acc00..b247cd0a0f2d8de61375e800d1b7b724a82d0f17 100644
--- a/test/test_npu/test_cos.py
+++ b/test/test_npu/test_network_ops/test_cos.py
@@ -62,5 +62,4 @@ class TestCos(TestCase):
 
 instantiate_device_type_tests(TestCos, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_cosh.py b/test/test_npu/test_network_ops/test_cosh.py
similarity index 99%
rename from test/test_npu/test_cosh.py
rename to test/test_npu/test_network_ops/test_cosh.py
index 1ba58569b7543cd3b0cacb7508dec7a4f629e378..8ac14a077f5e7f7be6f3be5347d6f822e2aa3167 100644
--- a/test/test_npu/test_cosh.py
+++ b/test/test_npu/test_network_ops/test_cosh.py
@@ -145,5 +145,4 @@ class TestCosh(TestCase):
 
 instantiate_device_type_tests(TestCosh, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_cosinesimilarity.py b/test/test_npu/test_network_ops/test_cosinesimilarity.py
similarity index 100%
rename from test/test_npu/test_cosinesimilarity.py
rename to test/test_npu/test_network_ops/test_cosinesimilarity.py
diff --git a/test/test_npu/test_cross.py b/test/test_npu/test_network_ops/test_cross.py
similarity index 100%
rename from test/test_npu/test_cross.py
rename to test/test_npu/test_network_ops/test_cross.py
diff --git a/test/test_npu/test_cudnn_convolution_backward_bias.py b/test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py
similarity index 99%
rename from test/test_npu/test_cudnn_convolution_backward_bias.py
rename to test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py
index 6e274874701e6cbb40cf14d7515fd5941a6b6c57..51168f2b716310cc0bedddcf20bd0d3a4dd97ddf 100644
--- a/test/test_npu/test_cudnn_convolution_backward_bias.py
+++ b/test/test_npu/test_network_ops/test_cudnn_convolution_backward_bias.py
@@ -91,5 +91,4 @@ class TestCudnnConvolutionBackwardBias(TestCase):
 
 instantiate_device_type_tests(TestCudnnConvolutionBackwardBias, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
diff --git a/test/test_npu/test_cudnn_convolution_transpose_backward_bias.py b/test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py
similarity index 99%
rename from test/test_npu/test_cudnn_convolution_transpose_backward_bias.py
rename to test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py
index f5271d4197ac72fb5834481e3f74d22e90b78a29..aa912c86bb3329319ac0f7acb613a6ab382a9d60 100644
--- a/test/test_npu/test_cudnn_convolution_transpose_backward_bias.py
+++ b/test/test_npu/test_network_ops/test_cudnn_convolution_transpose_backward_bias.py
@@ -91,5 +91,4 @@ class TestCudnnConvolutionTransposeBackwardBias(TestCase):
 
 instantiate_device_type_tests(TestCudnnConvolutionTransposeBackwardBias, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
diff --git a/test/test_npu/test_cumprod.py b/test/test_npu/test_network_ops/test_cumprod.py
similarity index 100%
rename from test/test_npu/test_cumprod.py
rename to test/test_npu/test_network_ops/test_cumprod.py
diff --git a/test/test_npu/test_cumsum.py b/test/test_npu/test_network_ops/test_cumsum.py
similarity index 100%
rename from test/test_npu/test_cumsum.py
rename to test/test_npu/test_network_ops/test_cumsum.py
diff --git a/test/test_npu/test_dim_arange.py b/test/test_npu/test_network_ops/test_dim_arange.py
similarity index 100%
rename from test/test_npu/test_dim_arange.py
rename to test/test_npu/test_network_ops/test_dim_arange.py
diff --git a/test/test_npu/test_diml.py b/test/test_npu/test_network_ops/test_diml.py
similarity index 100%
rename from test/test_npu/test_diml.py
rename to test/test_npu/test_network_ops/test_diml.py
diff --git a/test/test_npu/test_dirichlet_grad.py b/test/test_npu/test_network_ops/test_dirichlet_grad.py
similarity index 100%
rename from test/test_npu/test_dirichlet_grad.py
rename to test/test_npu/test_network_ops/test_dirichlet_grad.py
diff --git a/test/test_npu/test_dot.py b/test/test_npu/test_network_ops/test_dot.py
similarity index 99%
rename from test/test_npu/test_dot.py
rename to test/test_npu/test_network_ops/test_dot.py
index 74edec125353e8555763d876fcd60f98c492f668..44b17ab5f7a62e43521623789d0ccc51c3d74d4e 100644
--- a/test/test_npu/test_dot.py
+++ b/test/test_npu/test_network_ops/test_dot.py
@@ -96,6 +96,5 @@ class TestDot(TestCase):
 
 instantiate_device_type_tests(TestDot, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
 
diff --git a/test/test_npu/test_elu.py b/test/test_npu/test_network_ops/test_elu.py
similarity index 100%
rename from test/test_npu/test_elu.py
rename to test/test_npu/test_network_ops/test_elu.py
diff --git a/test/test_npu/test_embedding.py b/test/test_npu/test_network_ops/test_embedding.py
similarity index 100%
rename from test/test_npu/test_embedding.py
rename to test/test_npu/test_network_ops/test_embedding.py
diff --git a/test/test_npu/test_equal.py b/test/test_npu/test_network_ops/test_equal.py
similarity index 100%
rename from test/test_npu/test_equal.py
rename to test/test_npu/test_network_ops/test_equal.py
diff --git a/test/test_npu/test_erf.py b/test/test_npu/test_network_ops/test_erf.py
similarity index 99%
rename from test/test_npu/test_erf.py
rename to test/test_npu/test_network_ops/test_erf.py
index 6cc76f3f5861885cc8a58244adf1f0fd8b30bb96..e35bdb796394b60351e05325de5839f3530341c0 100644
--- a/test/test_npu/test_erf.py
+++ b/test/test_npu/test_network_ops/test_erf.py
@@ -113,5 +113,4 @@ class TestErf(TestCase):
    
 instantiate_device_type_tests(TestErf, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_erfinv.py b/test/test_npu/test_network_ops/test_erfinv.py
similarity index 99%
rename from test/test_npu/test_erfinv.py
rename to test/test_npu/test_network_ops/test_erfinv.py
index 8eb7e68bfdd6ad87b91f6bc5cd16ecd7b0a8ecf3..4e033a4e85bdb57ad410ebc0c9e659fbe5bab059 100644
--- a/test/test_npu/test_erfinv.py
+++ b/test/test_npu/test_network_ops/test_erfinv.py
@@ -120,5 +120,4 @@ class TestErfinv(TestCase):
 
 instantiate_device_type_tests(TestErfinv, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
diff --git a/test/test_npu/test_expm1.py b/test/test_npu/test_network_ops/test_expm1.py
similarity index 99%
rename from test/test_npu/test_expm1.py
rename to test/test_npu/test_network_ops/test_expm1.py
index 52899245f82e934699b5cd9c513aa3bb9a6b5d8e..6b8cdafc746d2b10566a203323fc7a394bab27d6 100644
--- a/test/test_npu/test_expm1.py
+++ b/test/test_npu/test_network_ops/test_expm1.py
@@ -165,5 +165,4 @@ class TestExpm1(TestCase):
 instantiate_device_type_tests(TestExpm1, globals(), except_for="cpu")
 
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_eye.py b/test/test_npu/test_network_ops/test_eye.py
similarity index 99%
rename from test/test_npu/test_eye.py
rename to test/test_npu/test_network_ops/test_eye.py
index e642baaa30063e78ca77bee5b26e2dc35c1c36df..03f4021cdae673a40b5ea13cda2bd435d6463523 100644
--- a/test/test_npu/test_eye.py
+++ b/test/test_npu/test_network_ops/test_eye.py
@@ -141,5 +141,4 @@ class TestEye(TestCase):
 
 instantiate_device_type_tests(TestEye, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_floordivide.py b/test/test_npu/test_network_ops/test_floordivide.py
similarity index 100%
rename from test/test_npu/test_floordivide.py
rename to test/test_npu/test_network_ops/test_floordivide.py
diff --git a/test/test_npu/test_frac.py b/test/test_npu/test_network_ops/test_frac.py
similarity index 99%
rename from test/test_npu/test_frac.py
rename to test/test_npu/test_network_ops/test_frac.py
index dcb781a8d36ba235fc2383921d6b1121c28bc71e..4929dba2892ebec58db5037c0c191c8451983870 100644
--- a/test/test_npu/test_frac.py
+++ b/test/test_npu/test_network_ops/test_frac.py
@@ -147,5 +147,4 @@ class TestFrac(TestCase):
 
 instantiate_device_type_tests(TestFrac, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_frobenius_norm.py b/test/test_npu/test_network_ops/test_frobenius_norm.py
similarity index 99%
rename from test/test_npu/test_frobenius_norm.py
rename to test/test_npu/test_network_ops/test_frobenius_norm.py
index 202974470b3381bf1816f8f87b1815cc64fea973..bd383adb49469aaee56e5706ce41700657b29e06 100644
--- a/test/test_npu/test_frobenius_norm.py
+++ b/test/test_npu/test_network_ops/test_frobenius_norm.py
@@ -172,5 +172,4 @@ class TestFrobenius_norm(TestCase):
             self.assertRtolEqual(cpu_output, npu_output)
 instantiate_device_type_tests(TestFrobenius_norm, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
     run_tests()
diff --git a/test/test_npu/test_full_like.py b/test/test_npu/test_network_ops/test_full_like.py
similarity index 99%
rename from test/test_npu/test_full_like.py
rename to test/test_npu/test_network_ops/test_full_like.py
index 36d5f6378f13c8b320d2afc2b826afdcb2e16d14..26b62c6ee121db582d3a23c28b8c1404682f306c 100644
--- a/test/test_npu/test_full_like.py
+++ b/test/test_npu/test_network_ops/test_full_like.py
@@ -84,5 +84,4 @@ class TestFullLike(TestCase):
         
 instantiate_device_type_tests(TestFullLike, globals(), except_for='cpu')
 if __name__ == '__main__':
-    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_gelu.py b/test/test_npu/test_network_ops/test_gelu.py
similarity index 100%
rename from test/test_npu/test_gelu.py
rename to test/test_npu/test_network_ops/test_gelu.py
diff --git a/test/test_npu/test_glu.py b/test/test_npu/test_network_ops/test_glu.py
similarity index 98%
rename from test/test_npu/test_glu.py
rename to test/test_npu/test_network_ops/test_glu.py
index 85167f0ab8d97ee3196b61c044bc2926775359fc..073ad2deb620f52127eb81f6143004a359a2beb0 100644
--- a/test/test_npu/test_glu.py
+++ b/test/test_npu/test_network_ops/test_glu.py
@@ -77,5 +77,4 @@ class TestGlu(TestCase):
 
 instantiate_device_type_tests(TestGlu, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_glugrad.py b/test/test_npu/test_network_ops/test_glugrad.py
similarity index 99%
rename from test/test_npu/test_glugrad.py
rename to test/test_npu/test_network_ops/test_glugrad.py
index c2e546bd28907330c7c04ee0234845b91d94dd16..4050bca25c8ee89bc1be24555f2f4566170b9c11 100644
--- a/test/test_npu/test_glugrad.py
+++ b/test/test_npu/test_network_ops/test_glugrad.py
@@ -87,5 +87,4 @@ class TestGluGrad(TestCase):
 
 instantiate_device_type_tests(TestGluGrad, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:1")
     run_tests()
diff --git a/test/test_npu/test_grid_sampler_2d.py b/test/test_npu/test_network_ops/test_grid_sampler_2d.py
similarity index 98%
rename from test/test_npu/test_grid_sampler_2d.py
rename to test/test_npu/test_network_ops/test_grid_sampler_2d.py
index 655f548aedc5496ac84c7d8b5bf2f77d0561df75..50d6ed9bf0e3628718b06058e7790d2fb77516cf 100644
--- a/test/test_npu/test_grid_sampler_2d.py
+++ b/test/test_npu/test_network_ops/test_grid_sampler_2d.py
@@ -70,5 +70,4 @@ class TestGridSampler2D(TestCase):
 
 instantiate_device_type_tests(TestGridSampler2D, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_grid_sampler_2d_backward.py b/test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py
similarity index 98%
rename from test/test_npu/test_grid_sampler_2d_backward.py
rename to test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py
index f5ca5d00307c39de699376b92b643639ff59ba97..0cbfb2721e358a4313abdbcbf7c70f6ac0aca2cc 100644
--- a/test/test_npu/test_grid_sampler_2d_backward.py
+++ b/test/test_npu/test_network_ops/test_grid_sampler_2d_backward.py
@@ -74,5 +74,4 @@ class TestGridSampler2dBackward(TestCase):
 
 instantiate_device_type_tests(TestGridSampler2dBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:4")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_group_norm.py b/test/test_npu/test_network_ops/test_group_norm.py
similarity index 99%
rename from test/test_npu/test_group_norm.py
rename to test/test_npu/test_network_ops/test_group_norm.py
index 3a326b779bba49b2dad7aa7ea0c4ad2983726203..d6c1bd1029bc1e0175303f2473851e90e21ee86c 100644
--- a/test/test_npu/test_group_norm.py
+++ b/test/test_npu/test_network_ops/test_group_norm.py
@@ -129,5 +129,4 @@ class TestGroupNormExt(TestCase):
 
 instantiate_device_type_tests(TestGroupNormExt, globals(), except_for='cpu')
 if __name__ == '__main__':
-    torch.npu.set_device("npu:1")
     run_tests()
diff --git a/test/test_npu/test_hamming_window.py b/test/test_npu/test_network_ops/test_hamming_window.py
similarity index 99%
rename from test/test_npu/test_hamming_window.py
rename to test/test_npu/test_network_ops/test_hamming_window.py
index 490cf878cbf4d371ee973cc69a02dc9fb1eba8a5..da429c640ec0357c4e477c8ca1c4f9f96b1acf73 100644
--- a/test/test_npu/test_hamming_window.py
+++ b/test/test_npu/test_network_ops/test_hamming_window.py
@@ -130,5 +130,4 @@ class TestHammingWindow(TestCase):
 
 instantiate_device_type_tests(TestHammingWindow, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_hammingwindow.py b/test/test_npu/test_network_ops/test_hammingwindow.py
similarity index 100%
rename from test/test_npu/test_hammingwindow.py
rename to test/test_npu/test_network_ops/test_hammingwindow.py
diff --git a/test/test_npu/test_hanning_window.py b/test/test_npu/test_network_ops/test_hanning_window.py
similarity index 98%
rename from test/test_npu/test_hanning_window.py
rename to test/test_npu/test_network_ops/test_hanning_window.py
index 30fe1d86a03c78980c96b1d6c6da07df572f8736..16b15caa8687c244ebfb41e45f00119980276ad8 100644
--- a/test/test_npu/test_hanning_window.py
+++ b/test/test_npu/test_network_ops/test_hanning_window.py
@@ -83,5 +83,4 @@ class TestHannWindow(TestCase):
 
 instantiate_device_type_tests(TestHannWindow, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_hard_sigmoid_backward.py b/test/test_npu/test_network_ops/test_hard_sigmoid_backward.py
similarity index 100%
rename from test/test_npu/test_hard_sigmoid_backward.py
rename to test/test_npu/test_network_ops/test_hard_sigmoid_backward.py
diff --git a/test/test_npu/test_hardshrink.py b/test/test_npu/test_network_ops/test_hardshrink.py
similarity index 100%
rename from test/test_npu/test_hardshrink.py
rename to test/test_npu/test_network_ops/test_hardshrink.py
diff --git a/test/test_npu/test_hardshrink_backward.py b/test/test_npu/test_network_ops/test_hardshrink_backward.py
similarity index 100%
rename from test/test_npu/test_hardshrink_backward.py
rename to test/test_npu/test_network_ops/test_hardshrink_backward.py
diff --git a/test/test_npu/test_hardsigmoid.py b/test/test_npu/test_network_ops/test_hardsigmoid.py
similarity index 100%
rename from test/test_npu/test_hardsigmoid.py
rename to test/test_npu/test_network_ops/test_hardsigmoid.py
diff --git a/test/test_npu/test_hinge_embedding_loss.py b/test/test_npu/test_network_ops/test_hinge_embedding_loss.py
similarity index 100%
rename from test/test_npu/test_hinge_embedding_loss.py
rename to test/test_npu/test_network_ops/test_hinge_embedding_loss.py
diff --git a/test/test_npu/test_im2col.py b/test/test_npu/test_network_ops/test_im2col.py
similarity index 100%
rename from test/test_npu/test_im2col.py
rename to test/test_npu/test_network_ops/test_im2col.py
diff --git a/test/test_npu/test_index_fill_d.py b/test/test_npu/test_network_ops/test_index_fill_d.py
similarity index 100%
rename from test/test_npu/test_index_fill_d.py
rename to test/test_npu/test_network_ops/test_index_fill_d.py
diff --git a/test/test_npu/test_index_select.py b/test/test_npu/test_network_ops/test_index_select.py
similarity index 100%
rename from test/test_npu/test_index_select.py
rename to test/test_npu/test_network_ops/test_index_select.py
diff --git a/test/test_npu/test_isclose.py b/test/test_npu/test_network_ops/test_isclose.py
similarity index 100%
rename from test/test_npu/test_isclose.py
rename to test/test_npu/test_network_ops/test_isclose.py
diff --git a/test/test_npu/test_kthvalue.py b/test/test_npu/test_network_ops/test_kthvalue.py
similarity index 99%
rename from test/test_npu/test_kthvalue.py
rename to test/test_npu/test_network_ops/test_kthvalue.py
index 56841fc05d473e0e843204fa4bace92aeca69ac1..35a4a4a9c53731c99cf837b8fcd54819f7607e95 100644
--- a/test/test_npu/test_kthvalue.py
+++ b/test/test_npu/test_network_ops/test_kthvalue.py
@@ -177,7 +177,6 @@ class TestKthvalues(TestCase):
 
 instantiate_device_type_tests(TestKthvalues, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0") 
     run_tests()
 
     
diff --git a/test/test_npu/test_l1_loss.py b/test/test_npu/test_network_ops/test_l1_loss.py
similarity index 100%
rename from test/test_npu/test_l1_loss.py
rename to test/test_npu/test_network_ops/test_l1_loss.py
diff --git a/test/test_npu/test_l1_loss_backward.py b/test/test_npu/test_network_ops/test_l1_loss_backward.py
similarity index 100%
rename from test/test_npu/test_l1_loss_backward.py
rename to test/test_npu/test_network_ops/test_l1_loss_backward.py
diff --git a/test/test_npu/test_leaky_relu.py b/test/test_npu/test_network_ops/test_leaky_relu.py
similarity index 99%
rename from test/test_npu/test_leaky_relu.py
rename to test/test_npu/test_network_ops/test_leaky_relu.py
index 78de88510ea2ccdacfa82a73d31810cba8b4cf94..67f2bc7fd4434312f676369416c805e85f7b11c4 100644
--- a/test/test_npu/test_leaky_relu.py
+++ b/test/test_npu/test_network_ops/test_leaky_relu.py
@@ -109,7 +109,6 @@ class TestLeakRelu(TestCase):
 
 instantiate_device_type_tests(TestLeakRelu, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
 
 
diff --git a/test/test_npu/test_network_ops/test_log10.py b/test/test_npu/test_network_ops/test_log10.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3cc4226ae48d2e5aeaecbdb5e3ad7ddfd28804c
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_log10.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLog10(TestCase):
+    """Compares torch.log10, log10_ and log10(out=) on NPU with CPU results."""
+
+    def cpu_op_exec(self, input1):
+        """Out-of-place log10; returns the result as a numpy array."""
+        output = torch.log10(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        """Out-of-place log10; the result is moved to CPU before conversion."""
+        output = torch.log10(input1)
+        output = output.to("cpu").numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        """log10 written into input2 via out=; returns input2 as numpy."""
+        torch.log10(input1, out=input2)
+        output = input2.to("cpu").numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        """In-place log10_; returns the result as a numpy array."""
+        output = torch.log10_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        """In-place log10_; returns input1 moved to CPU as numpy."""
+        torch.log10_(input1)
+        output = input1.to("cpu").numpy()
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1):
+        """In-place log10_ on a 2x2 as_strided view (strides [1, 2], offset 2)."""
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log10_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1):
+        """Same strided in-place case; returns the view moved to CPU as numpy."""
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        torch.log10_(input1)
+        output = input1.to("cpu").numpy()
+        return output
+
+    def test_log10_shape_format_fp32(self, device):
+        """Out-of-place log10 on a float32 (4, 4) tensor created with format 3."""
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_shape_format_fp16(self, device):
+        """Out-of-place log10 on float16; the CPU reference runs in float32."""
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_shape_format_fp32(self, device):
+        """In-place log10_ on a float32 (4, 4) tensor created with format 3."""
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_shape_format_fp16(self, device):
+        """In-place log10_ on float16; the CPU reference runs in float32."""
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_uncon_shape_format_fp32(self, device):
+        """In-place log10_ on a strided view of a float32 (8, 6) tensor."""
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_uncon_shape_format_fp16(self, device):
+        """Strided in-place log10_ on float16; the CPU reference runs in float32."""
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_out_float32_shape_format(self, device):
+        """log10(out=) on float32; some out specs differ in shape from the input."""
+        shape_format = [
+            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
+            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
+            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
+            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
+            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
+            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_out_float16_shape_format(self, device):
+        """log10(out=) on float16; some out specs differ in shape from the input."""
+        shape_format = [
+            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
+            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
+            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
+            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
+            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
+            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            # Every case here is float16: compute the CPU reference in float32
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestLog10, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_log1p.py b/test/test_npu/test_network_ops/test_log1p.py
index 40486f60d97352ad66283198d742bf017dae3730..75f4c7ac88fbe4e40ebb2cfe8212a189df908c4b 100644
--- a/test/test_npu/test_network_ops/test_log1p.py
+++ b/test/test_npu/test_network_ops/test_log1p.py
@@ -92,6 +92,5 @@ class TestLog1p(TestCase):
 
 instantiate_device_type_tests(TestLog1p, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
         
diff --git a/test/test_npu/test_log_sigmoid.py b/test/test_npu/test_network_ops/test_log_sigmoid.py
similarity index 98%
rename from test/test_npu/test_log_sigmoid.py
rename to test/test_npu/test_network_ops/test_log_sigmoid.py
index b039f77522aeb20df58e0097b9e023bb6e31e445..e2c0ffdfeac1a7472508a4c792a594543b81935f 100644
--- a/test/test_npu/test_log_sigmoid.py
+++ b/test/test_npu/test_network_ops/test_log_sigmoid.py
@@ -70,5 +70,4 @@ class TestLogsigmoid(TestCase):
 
 instantiate_device_type_tests(TestLogsigmoid, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
diff --git a/test/test_npu/test_log_sigmoid_backward.py b/test/test_npu/test_network_ops/test_log_sigmoid_backward.py
similarity index 98%
rename from test/test_npu/test_log_sigmoid_backward.py
rename to test/test_npu/test_network_ops/test_log_sigmoid_backward.py
index b94e607a2eb2f750166e174cf000974583b61d30..63ae01b4ff0cdde70005886ca5199a724e2f1c75 100644
--- a/test/test_npu/test_log_sigmoid_backward.py
+++ b/test/test_npu/test_network_ops/test_log_sigmoid_backward.py
@@ -92,5 +92,4 @@ class TestLogSigmoidBackward(TestCase):
 instantiate_device_type_tests(
     TestLogSigmoidBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
diff --git a/test/test_npu/test_logical_not.py b/test/test_npu/test_network_ops/test_logical_not.py
similarity index 98%
rename from test/test_npu/test_logical_not.py
rename to test/test_npu/test_network_ops/test_logical_not.py
index 865cdf073a66a280cff6159a4af8e1036f81b4ea..031e206dd2494e900a1341b88404aeacaced5183 100644
--- a/test/test_npu/test_logical_not.py
+++ b/test/test_npu/test_network_ops/test_logical_not.py
@@ -63,6 +63,5 @@ class TestLogicalNot(TestCase):
 
 instantiate_device_type_tests(TestLogicalNot, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
         
diff --git a/test/test_npu/test_logsigmoid.py b/test/test_npu/test_network_ops/test_logsigmoid.py
similarity index 98%
rename from test/test_npu/test_logsigmoid.py
rename to test/test_npu/test_network_ops/test_logsigmoid.py
index d7732766935feb421513a54b5b8b95db61d9245a..6c02c6fa756b136771672e82b93a0ef35118184a 100644
--- a/test/test_npu/test_logsigmoid.py
+++ b/test/test_npu/test_network_ops/test_logsigmoid.py
@@ -53,5 +53,4 @@ class TestLogsigmoid(TestCase):
 
 instantiate_device_type_tests(TestLogsigmoid, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_logsigmoidforward.py b/test/test_npu/test_network_ops/test_logsigmoidforward.py
similarity index 98%
rename from test/test_npu/test_logsigmoidforward.py
rename to test/test_npu/test_network_ops/test_logsigmoidforward.py
index ae072d622a2371fa3d7c32ac3f296556aa2311fc..ae1bd6ade46748a1311ee5d28188dd5293283ae1 100644
--- a/test/test_npu/test_logsigmoidforward.py
+++ b/test/test_npu/test_network_ops/test_logsigmoidforward.py
@@ -67,5 +67,4 @@ class TestLogsigmoidForward(TestCase):
 
 instantiate_device_type_tests(TestLogsigmoidForward, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_logsumexp.py b/test/test_npu/test_network_ops/test_logsumexp.py
similarity index 100%
rename from test/test_npu/test_logsumexp.py
rename to test/test_npu/test_network_ops/test_logsumexp.py
diff --git a/test/test_npu/test_network_ops/test_masked_fill_range.py b/test/test_npu/test_network_ops/test_masked_fill_range.py
new file mode 100644
index 0000000000000000000000000000000000000000..69cd04dc1ff98263d0f1da7df4a3920b4463564d
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_masked_fill_range.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMaskedFillRange(TestCase):
+    """Checks torch.npu_masked_fill_range against a pure-Python reference."""
+
+    def cpu_op_exec(self, input1, start, end, value, axis, dim):
+        """Reference result computed with explicit loops on a clone of input1.
+
+        For every (i, j) in ``start``, each index k in
+        ``range(start[i, j], end[i, j])`` is filled with ``value[i]``.  Which
+        slice of the output k addresses depends on ``dim`` (the rank of
+        input1, supplied by the caller) and on ``axis``.
+        """
+        out = input1.clone()
+        start_shape = start.shape
+        if dim == 1:
+            for i in range(0, start_shape[0]):
+                for j in range(0, start_shape[1]):
+                    for k in range(start[i, j], end[i, j]):
+                        out[k] = value[i]
+        if dim == 2:
+            for i in range(0, start_shape[0]):
+                for j in range(0, start_shape[1]):
+                    for k in range(start[i, j], end[i, j]):
+                        if axis == 0:
+                            out[k, :] = value[i]
+                        else:
+                            out[j, k] = value[i]
+        if dim == 3:
+            for i in range(0, start_shape[0]):
+                for j in range(0, start_shape[1]):
+                    for k in range(start[i, j], end[i, j]):
+                        if axis == 0:
+                            out[k, :, :] = value[i]
+                        elif axis == 1:
+                            # NOTE(review): unlike the axis == 2 branch this
+                            # ignores j; only axis == 2 is exercised below.
+                            out[:, k, :] = value[i]
+                        else:
+                            out[j, :, k] = value[i]
+        return out
+
+    def npu_op_exec(self, input1, start, end, value, axis):
+        """Runs torch.npu_masked_fill_range and returns the result as numpy."""
+        out = torch.npu_masked_fill_range(input1, start, end, value, axis)
+        out = out.to("cpu")
+        return out.detach().numpy()
+
+    def test_normalize_batch(self, device):
+        """Model-derived cases: [32, 64, 1688] inputs filled along axis 2."""
+        # TODO(ascend): this operator still has generalization issues; for
+        # now only the model scenario is guaranteed to work.
+        # Note: the cases below come from the model and pass.
+        # Each entry: [input spec, start, end, [value, value dtype], axis].
+        shape_format = [
+            [[np.float32, -1, [32, 64, 1688]],
+                [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
+                [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+                    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.float32], 2],
+            [[np.float16, -1, [32, 64, 1688]],
+                [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
+                [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+                    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.float16], 2],
+            [[np.int32, -1, [32, 64, 1688]],
+                [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
+                [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+                    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.int32], 2],
+            [[np.int8, -1, [32, 64, 1688]],
+                [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
+                [[6, 7, 31, 9, 10, 11, 12, 19, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+                    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]], [[1], torch.int8], 2],
+        ]
+        for item in shape_format:
+            axis = item[-1]
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            shape = item[0][-1]
+            cpu_start = torch.tensor(item[1], dtype=torch.int32)
+            npu_start = cpu_start.npu()
+            cpu_end = torch.tensor(item[2], dtype=torch.int32)
+            npu_end = cpu_end.npu()
+            cpu_value = torch.tensor(item[3][0], dtype=item[3][1])
+            npu_value = cpu_value.npu()
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_start, cpu_end, cpu_value, axis, len(shape))
+            npu_output = self.npu_op_exec(npu_input1, npu_start, npu_end, npu_value, axis)
+            cpu_output = cpu_output.numpy()
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestMaskedFillRange, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_max_pool2d_backward.py b/test/test_npu/test_network_ops/test_max_pool2d_backward.py
similarity index 100%
rename from test/test_npu/test_max_pool2d_backward.py
rename to test/test_npu/test_network_ops/test_max_pool2d_backward.py
diff --git a/test/test_npu/test_miopen_batch_norm.py b/test/test_npu/test_network_ops/test_miopen_batch_norm.py
similarity index 100%
rename from test/test_npu/test_miopen_batch_norm.py
rename to test/test_npu/test_network_ops/test_miopen_batch_norm.py
diff --git a/test/test_npu/test_miopen_batch_norm_backward.py b/test/test_npu/test_network_ops/test_miopen_batch_norm_backward.py
similarity index 100%
rename from test/test_npu/test_miopen_batch_norm_backward.py
rename to test/test_npu/test_network_ops/test_miopen_batch_norm_backward.py
diff --git a/test/test_npu/test_miopen_convolution.py b/test/test_npu/test_network_ops/test_miopen_convolution.py
similarity index 99%
rename from test/test_npu/test_miopen_convolution.py
rename to test/test_npu/test_network_ops/test_miopen_convolution.py
index 8583f61d49fd4ab3d414a4392b4c0114a76b8c79..0e8267c72ead13e376dc62a6fa6c0b0f004b2872 100644
--- a/test/test_npu/test_miopen_convolution.py
+++ b/test/test_npu/test_network_ops/test_miopen_convolution.py
@@ -77,5 +77,4 @@ class TestMiopenConvolution(TestCase):
 
 instantiate_device_type_tests(TestMiopenConvolution, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_miopen_convolution_backward.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward.py
similarity index 99%
rename from test/test_npu/test_miopen_convolution_backward.py
rename to test/test_npu/test_network_ops/test_miopen_convolution_backward.py
index 0aaa54c061bf2e3fd3d4d38412674c1432ea7b1b..07a4d88d2ef2efc3e4635743fcf9cf8160ba8e63 100644
--- a/test/test_npu/test_miopen_convolution_backward.py
+++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward.py
@@ -120,5 +120,4 @@ class TestMiopenConvolutionBackward(TestCase):
     
 instantiate_device_type_tests(TestMiopenConvolutionBackward, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_miopen_convolution_backward_bias.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py
similarity index 99%
rename from test/test_npu/test_miopen_convolution_backward_bias.py
rename to test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py
index 00259de601a33c79e18a527dc1c503c3950844d6..8d92d5f9b6d0559f46aa500e799ff216087d8a3b 100644
--- a/test/test_npu/test_miopen_convolution_backward_bias.py
+++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_bias.py
@@ -114,5 +114,4 @@ class TestMiopenConvolutionBackwardBias(TestCase):
 
 instantiate_device_type_tests(TestMiopenConvolutionBackwardBias, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_miopen_convolution_backward_input.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py
similarity index 99%
rename from test/test_npu/test_miopen_convolution_backward_input.py
rename to test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py
index 63e1282dc7d48f0ef944129637c947bfc0fb70a2..94da01163d9a15ae4e2294ac92c9f5e451f72d55 100644
--- a/test/test_npu/test_miopen_convolution_backward_input.py
+++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_input.py
@@ -114,5 +114,4 @@ class TestMiopenConvolutionBackwardInput(TestCase):
 
 instantiate_device_type_tests(TestMiopenConvolutionBackwardInput, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_miopen_convolution_backward_weight.py b/test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py
similarity index 99%
rename from test/test_npu/test_miopen_convolution_backward_weight.py
rename to test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py
index 64dca8cbff986c77839ed5f14726bfcf44cc0f37..4943a6efe47472bdf17da761ae69050785ecba2f 100644
--- a/test/test_npu/test_miopen_convolution_backward_weight.py
+++ b/test/test_npu/test_network_ops/test_miopen_convolution_backward_weight.py
@@ -115,6 +115,5 @@ class TestMiopenConvolutionBackwardWeight(TestCase):
 
 instantiate_device_type_tests(TestMiopenConvolutionBackwardWeight, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
 
diff --git a/test/test_npu/test_mkldnn_adaptive_avg_pool2d.py b/test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py
similarity index 98%
rename from test/test_npu/test_mkldnn_adaptive_avg_pool2d.py
rename to test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py
index 554f8ba38c182e3eb69a705aa957493f310a35a9..c0edc4c37db7c50a9dc1322944d0f880b7b09de0 100644
--- a/test/test_npu/test_mkldnn_adaptive_avg_pool2d.py
+++ b/test/test_npu/test_network_ops/test_mkldnn_adaptive_avg_pool2d.py
@@ -77,5 +77,4 @@ class TestMkldnnAdaptiveAvgPool2d(TestCase):
 
 instantiate_device_type_tests(TestMkldnnAdaptiveAvgPool2d, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_mkldnn_convolution_backward.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py
similarity index 99%
rename from test/test_npu/test_mkldnn_convolution_backward.py
rename to test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py
index 8de7467f58f72343da91e45ee262eb460d63e7c6..f298d09fd973677689b0b0068fa8802ad23aafd0 100644
--- a/test/test_npu/test_mkldnn_convolution_backward.py
+++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward.py
@@ -159,5 +159,4 @@ class TestMkldnnConvolutionBackward(TestCase):
     
 instantiate_device_type_tests(TestMkldnnConvolutionBackward, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_mkldnn_convolution_backward_input.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py
similarity index 99%
rename from test/test_npu/test_mkldnn_convolution_backward_input.py
rename to test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py
index 7a90b52bc48ff601bbc6b33097469142d8df1104..7ee1961f5e1c958bad399c1de92c9cf38e8d6f1f 100644
--- a/test/test_npu/test_mkldnn_convolution_backward_input.py
+++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_input.py
@@ -150,5 +150,4 @@ class TestMkldnnConvolutionBackwardInput(TestCase):
     
 instantiate_device_type_tests(TestMkldnnConvolutionBackwardInput, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
diff --git a/test/test_npu/test_mkldnn_convolution_backward_weights.py b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py
similarity index 99%
rename from test/test_npu/test_mkldnn_convolution_backward_weights.py
rename to test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py
index 5bf471a52c59994e0667e4416771282d0129a405..65e0bf8abda4b4a2fc9b659eefb115eb0cd66183 100644
--- a/test/test_npu/test_mkldnn_convolution_backward_weights.py
+++ b/test/test_npu/test_network_ops/test_mkldnn_convolution_backward_weights.py
@@ -157,5 +157,4 @@ class TestMkldnnConvolutionBackwardWeights(TestCase):
     
 instantiate_device_type_tests(TestMkldnnConvolutionBackwardWeights, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_multilabel_margin_loss.py b/test/test_npu/test_network_ops/test_multilabel_margin_loss.py
similarity index 100%
rename from test/test_npu/test_multilabel_margin_loss.py
rename to test/test_npu/test_network_ops/test_multilabel_margin_loss.py
diff --git a/test/test_npu/test_multinomial.py b/test/test_npu/test_network_ops/test_multinomial.py
similarity index 100%
rename from test/test_npu/test_multinomial.py
rename to test/test_npu/test_network_ops/test_multinomial.py
diff --git a/test/test_npu/test_narrow_copy.py b/test/test_npu/test_network_ops/test_narrow_copy.py
similarity index 100%
rename from test/test_npu/test_narrow_copy.py
rename to test/test_npu/test_network_ops/test_narrow_copy.py
diff --git a/test/test_npu/test_nllloss2d.py b/test/test_npu/test_network_ops/test_nllloss2d.py
similarity index 100%
rename from test/test_npu/test_nllloss2d.py
rename to test/test_npu/test_network_ops/test_nllloss2d.py
diff --git a/test/test_npu/test_norm_except_dim.py b/test/test_npu/test_network_ops/test_norm_except_dim.py
similarity index 96%
rename from test/test_npu/test_norm_except_dim.py
rename to test/test_npu/test_network_ops/test_norm_except_dim.py
index c1555ee23a99d961b756563b8f23a0320296c34d..f75c7cc6ef0dbe6b68f12f1c5d1db311a4cff1d4 100644
--- a/test/test_npu/test_norm_except_dim.py
+++ b/test/test_npu/test_network_ops/test_norm_except_dim.py
@@ -17,7 +17,7 @@ import numpy as np
 import math
 import random
 from torch._six import nan
-from common_utils import TestCase, iter_indices, run_tests
+from common_utils import TestCase, run_tests
 from common_device_type import dtypes, instantiate_device_type_tests
 
 
@@ -73,5 +73,4 @@ class TestNormExceptDim(TestCase):
 instantiate_device_type_tests(TestNormExceptDim, globals(), except_for="cpu")
 
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_norm_ext.py b/test/test_npu/test_network_ops/test_norm_ext.py
similarity index 96%
rename from test/test_npu/test_norm_ext.py
rename to test/test_npu/test_network_ops/test_norm_ext.py
index bf3aac19f9f8c1ab8e7882d3733448f75296582e..8e5e51224cf3493e38b3b1e0fb40300874c678eb 100644
--- a/test/test_npu/test_norm_ext.py
+++ b/test/test_npu/test_network_ops/test_norm_ext.py
@@ -73,7 +73,7 @@ class TestNorm(TestCase):
             
     def test_norm_shape_format_2(self, device):
         shape_format = [
-                [[np.float16, 0, (12, 33)]],
+                # [[np.float16, 0, (12, 33)]],  # result error
                 [[np.float32, 0, (12, 33)]],
         ] 
         for item in shape_format:
@@ -82,12 +82,12 @@ class TestNorm(TestCase):
                 cpu_input = cpu_input.to(torch.float32)
             cpu_output = self.cpu_out_exec(cpu_input, 2, [0], False, torch.float)
             npu_output = self.npu_out_exec(npu_input, 2, [0], False, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
+            npu_output = npu_output.to(cpu_output.dtype)
             self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
             
     def test_norm_shape_format_3(self, device):
         shape_format = [
-                [[np.float16, 0, (10, 24, 56, 2048)]],
+                # [[np.float16, 0, (10, 24, 56, 2048)]], # result error
                 [[np.float32, 0, (10, 24, 56, 2048)]],
         ] 
         for item in shape_format:
@@ -127,5 +127,4 @@ class TestNorm(TestCase):
      
 instantiate_device_type_tests(TestNorm, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_one_hot.py b/test/test_npu/test_network_ops/test_one_hot.py
similarity index 98%
rename from test/test_npu/test_one_hot.py
rename to test/test_npu/test_network_ops/test_one_hot.py
index f9d69381841b95c917f9e0d48e7930aa9d7231ce..ce72ccb38b2333f5501d65681e7cacdc5897bce5 100644
--- a/test/test_npu/test_one_hot.py
+++ b/test/test_npu/test_network_ops/test_one_hot.py
@@ -89,6 +89,5 @@ class TestOneHot(TestCase):
 
 instantiate_device_type_tests(TestOneHot, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
 
diff --git a/test/test_npu/test_ones.py b/test/test_npu/test_network_ops/test_ones.py
similarity index 100%
rename from test/test_npu/test_ones.py
rename to test/test_npu/test_network_ops/test_ones.py
diff --git a/test/test_npu/test_pdist.py b/test/test_npu/test_network_ops/test_pdist.py
similarity index 99%
rename from test/test_npu/test_pdist.py
rename to test/test_npu/test_network_ops/test_pdist.py
index 6fc2dd65a601d05dfe48a57507179fa6ef4c6a19..03e2fbfe1daafd1461fc0c669bbad4eccc173db8 100644
--- a/test/test_npu/test_pdist.py
+++ b/test/test_npu/test_network_ops/test_pdist.py
@@ -139,5 +139,4 @@ class TestPdist(TestCase):
 
 instantiate_device_type_tests(TestPdist, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_pixel_shuffle.py b/test/test_npu/test_network_ops/test_pixel_shuffle.py
similarity index 99%
rename from test/test_npu/test_pixel_shuffle.py
rename to test/test_npu/test_network_ops/test_pixel_shuffle.py
index fa35bae0802c0fa438ff28b1c59f9a2bf5cec410..281f10e3c6cac75b9d4344bff34d4e0c980a0e9e 100644
--- a/test/test_npu/test_pixel_shuffle.py
+++ b/test/test_npu/test_network_ops/test_pixel_shuffle.py
@@ -91,5 +91,4 @@ class TestPixel_shuffle(TestCase):
 
 instantiate_device_type_tests(TestPixel_shuffle, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
     run_tests()
diff --git a/test/test_npu/test_network_ops/test_pooling.py b/test/test_npu/test_network_ops/test_pooling.py
index fd8cd7d0daf17cb6c0cc75ef09e81de8db1ea5dc..9aa54b3839f98e9ef37acdbf3e72599be44fa8ba 100644
--- a/test/test_npu/test_network_ops/test_pooling.py
+++ b/test/test_npu/test_network_ops/test_pooling.py
@@ -53,5 +53,4 @@ class TestPooling(TestCase):
         
 instantiate_device_type_tests(TestPooling, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
diff --git a/test/test_npu/test_prelu.py b/test/test_npu/test_network_ops/test_prelu.py
similarity index 98%
rename from test/test_npu/test_prelu.py
rename to test/test_npu/test_network_ops/test_prelu.py
index 9b4079dd87edf26e113352347e47cc3945414008..da643db7ad860f440952f2d55f63ccd5da4827d0 100644
--- a/test/test_npu/test_prelu.py
+++ b/test/test_npu/test_network_ops/test_prelu.py
@@ -57,5 +57,4 @@ class TestPrelu(TestCase):
 
 instantiate_device_type_tests(TestPrelu, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_prelu_backward.py b/test/test_npu/test_network_ops/test_prelu_backward.py
similarity index 99%
rename from test/test_npu/test_prelu_backward.py
rename to test/test_npu/test_network_ops/test_prelu_backward.py
index d058a0616587b197ffb7cfd023332325cedcc7ed..07a5e9d64859b7d1412fffc38c6a86451d8c08d2 100644
--- a/test/test_npu/test_prelu_backward.py
+++ b/test/test_npu/test_network_ops/test_prelu_backward.py
@@ -88,5 +88,4 @@ class TestPreluBackward(TestCase):
     
 instantiate_device_type_tests(TestPreluBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_quantize_per_channel.py b/test/test_npu/test_network_ops/test_quantize_per_channel.py
similarity index 100%
rename from test/test_npu/test_quantize_per_channel.py
rename to test/test_npu/test_network_ops/test_quantize_per_channel.py
diff --git a/test/test_npu/test_quantize_per_tensor.py b/test/test_npu/test_network_ops/test_quantize_per_tensor.py
similarity index 100%
rename from test/test_npu/test_quantize_per_tensor.py
rename to test/test_npu/test_network_ops/test_quantize_per_tensor.py
diff --git a/test/test_npu/test_real.py b/test/test_npu/test_network_ops/test_real.py
similarity index 100%
rename from test/test_npu/test_real.py
rename to test/test_npu/test_network_ops/test_real.py
diff --git a/test/test_npu/test_renorm.py b/test/test_npu/test_network_ops/test_renorm.py
similarity index 99%
rename from test/test_npu/test_renorm.py
rename to test/test_npu/test_network_ops/test_renorm.py
index a1c258f913ab5b59e839a20f7cbcfcf9d92f73d7..13cedf07d8effbd73a16410582b2e0bae1bfe8f9 100644
--- a/test/test_npu/test_renorm.py
+++ b/test/test_npu/test_network_ops/test_renorm.py
@@ -269,5 +269,4 @@ class TestRenorm(TestCase):
     
 instantiate_device_type_tests(TestRenorm, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
     run_tests()
diff --git a/test/test_npu/test_repeat_interleave.py b/test/test_npu/test_network_ops/test_repeat_interleave.py
similarity index 100%
rename from test/test_npu/test_repeat_interleave.py
rename to test/test_npu/test_network_ops/test_repeat_interleave.py
diff --git a/test/test_npu/test_network_ops/test_resize_.py b/test/test_npu/test_network_ops/test_resize_.py
index b525ead0055c2fca80e9587ee003dfc18397cb04..bafaff5300e2777698a8fbc57f1e899c1f305bd9 100644
--- a/test/test_npu/test_network_ops/test_resize_.py
+++ b/test/test_npu/test_network_ops/test_resize_.py
@@ -66,5 +66,4 @@ class TestResize(TestCase):
 
 instantiate_device_type_tests(TestResize, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_roll.py b/test/test_npu/test_network_ops/test_roll.py
similarity index 100%
rename from test/test_npu/test_roll.py
rename to test/test_npu/test_network_ops/test_roll.py
diff --git a/test/test_npu/test_selu.py b/test/test_npu/test_network_ops/test_selu.py
similarity index 100%
rename from test/test_npu/test_selu.py
rename to test/test_npu/test_network_ops/test_selu.py
diff --git a/test/test_npu/test_sinh.py b/test/test_npu/test_network_ops/test_sinh.py
similarity index 100%
rename from test/test_npu/test_sinh.py
rename to test/test_npu/test_network_ops/test_sinh.py
diff --git a/test/test_npu/test_slow_conv_dilated2d.py b/test/test_npu/test_network_ops/test_slow_conv_dilated2d.py
similarity index 100%
rename from test/test_npu/test_slow_conv_dilated2d.py
rename to test/test_npu/test_network_ops/test_slow_conv_dilated2d.py
diff --git a/test/test_npu/test_slow_conv_dilated2d_backward.py b/test/test_npu/test_network_ops/test_slow_conv_dilated2d_backward.py
similarity index 100%
rename from test/test_npu/test_slow_conv_dilated2d_backward.py
rename to test/test_npu/test_network_ops/test_slow_conv_dilated2d_backward.py
diff --git a/test/test_npu/test_slow_conv_transpose2d.py b/test/test_npu/test_network_ops/test_slow_conv_transpose2d.py
similarity index 100%
rename from test/test_npu/test_slow_conv_transpose2d.py
rename to test/test_npu/test_network_ops/test_slow_conv_transpose2d.py
diff --git a/test/test_npu/test_slow_conv_transpose2d_backward.py b/test/test_npu/test_network_ops/test_slow_conv_transpose2d_backward.py
similarity index 100%
rename from test/test_npu/test_slow_conv_transpose2d_backward.py
rename to test/test_npu/test_network_ops/test_slow_conv_transpose2d_backward.py
diff --git a/test/test_npu/test_slow_conv_transpose3d.py b/test/test_npu/test_network_ops/test_slow_conv_transpose3d.py
similarity index 98%
rename from test/test_npu/test_slow_conv_transpose3d.py
rename to test/test_npu/test_network_ops/test_slow_conv_transpose3d.py
index ca8bf35b8c3a2ad45dd5db340e6ed6ffcd66d648..be25249f9bba4b7985f7171f29f3bc74f62031d9 100644
--- a/test/test_npu/test_slow_conv_transpose3d.py
+++ b/test/test_npu/test_network_ops/test_slow_conv_transpose3d.py
@@ -56,5 +56,4 @@ class TestSlowConvTranspose3d(TestCase):
 
 instantiate_device_type_tests(TestSlowConvTranspose3d, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_soft_margin_loss.py b/test/test_npu/test_network_ops/test_soft_margin_loss.py
similarity index 96%
rename from test/test_npu/test_soft_margin_loss.py
rename to test/test_npu/test_network_ops/test_soft_margin_loss.py
index fabfe9147f61d737dd5c5f1b994c87f928d4a4d8..a83172e56db3bd4c6b6247a2b622b1f03bd277c0 100644
--- a/test/test_npu/test_soft_margin_loss.py
+++ b/test/test_npu/test_network_ops/test_soft_margin_loss.py
@@ -111,8 +111,8 @@ class TestSoftMarginLoss(TestCase):
 
     def test_soft_margin_loss_float32_none(self, device):
         npu_input1, npu_input2 = self.generate_data(-2, 2, (25, 25, 25), (25, 1, 25), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum")
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "none")
         self.assertRtolEqual(cpu_output, npu_output)
 
     def test_soft_margin_loss_float32_sum(self, device):
@@ -123,5 +123,4 @@ class TestSoftMarginLoss(TestCase):
 
 instantiate_device_type_tests(TestSoftMarginLoss, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/test/test_npu/test_softmax_backward.py b/test/test_npu/test_network_ops/test_softmax_backward.py
similarity index 100%
rename from test/test_npu/test_softmax_backward.py
rename to test/test_npu/test_network_ops/test_softmax_backward.py
diff --git a/test/test_npu/test_softshrink.py b/test/test_npu/test_network_ops/test_softshrink.py
similarity index 99%
rename from test/test_npu/test_softshrink.py
rename to test/test_npu/test_network_ops/test_softshrink.py
index 7bbb839f9ea5b317fd9090b6e211006891a7b21b..601bad8486f94b8ca8bfe0d4d736ab27ad96cbfe 100644
--- a/test/test_npu/test_softshrink.py
+++ b/test/test_npu/test_network_ops/test_softshrink.py
@@ -100,5 +100,4 @@ class TestSoftShrink(TestCase):
         
 instantiate_device_type_tests(TestSoftShrink, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_softshrink_backward.py b/test/test_npu/test_network_ops/test_softshrink_backward.py
similarity index 98%
rename from test/test_npu/test_softshrink_backward.py
rename to test/test_npu/test_network_ops/test_softshrink_backward.py
index 0681f8658f24c4d7c98ba838d656619c51e6ec3b..5abd88823ac72648f427f0b54ac76dd00450c1ff 100644
--- a/test/test_npu/test_softshrink_backward.py
+++ b/test/test_npu/test_network_ops/test_softshrink_backward.py
@@ -74,5 +74,4 @@ class TestSoftShrinkBackward(TestCase):
         
 instantiate_device_type_tests(TestSoftShrinkBackward, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_split_with_sizes.py b/test/test_npu/test_network_ops/test_split_with_sizes.py
similarity index 99%
rename from test/test_npu/test_split_with_sizes.py
rename to test/test_npu/test_network_ops/test_split_with_sizes.py
index 6cae3f107c80eab6a42c54b692629b1c4637fddb..d5226c97411f131e717abc22e12c0b3d226b34e1 100644
--- a/test/test_npu/test_split_with_sizes.py
+++ b/test/test_npu/test_network_ops/test_split_with_sizes.py
@@ -79,5 +79,4 @@ class Test_split_with_sizes(TestCase):
 
 instantiate_device_type_tests(Test_split_with_sizes, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
diff --git a/test/test_npu/test_square.py b/test/test_npu/test_network_ops/test_square.py
similarity index 100%
rename from test/test_npu/test_square.py
rename to test/test_npu/test_network_ops/test_square.py
diff --git a/test/test_npu/test_network_ops/test_std.py b/test/test_npu/test_network_ops/test_std.py
index 2d5b70442d1559697662568f1bdf44eb87f0c806..dc04ae778a9185bad8d796f9ba01a3cf49767af7 100644
--- a/test/test_npu/test_network_ops/test_std.py
+++ b/test/test_npu/test_network_ops/test_std.py
@@ -16,7 +16,7 @@
 
 import torch
 import numpy as np
-
+import random
 from common_utils import TestCase, run_tests
 from common_device_type import dtypes, instantiate_device_type_tests
 from util_test import create_common_tensor
@@ -159,6 +159,17 @@ class TestStd(TestCase):
                 cpu_output1 = cpu_output1.astype(np.float16)
             self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
 
+            random_outputshape = [random.randint(1, 100)]
+            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape,item[1],item[0])
+            if item[0] == np.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
+            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            if item[0] == np.float16:
+                cpu_output1 = cpu_output1.astype(np.float16)
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
+
     def test_std_dim_out_shape_format_fp32(self, device):
         format_list = [0]
         shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]]
@@ -177,6 +188,12 @@ class TestStd(TestCase):
             npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
             self.assertRtolEqual(cpu_output1, npu_output1)
 
+            random_outputshape = [random.randint(1, 100)]
+            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape, item[1], item[0])
+            cpu_output2 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output.clone(), item[4], item[5])
+            npu_output2 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
     def test_std_dim_name_fp16(self, device):
         shape = (1024, 8, 32)
         cpu_input = torch.rand(shape, dtype=torch.float32)
@@ -229,6 +246,40 @@ class TestStd(TestCase):
         npu_output = torch.std(npu_input, dim=dim,out=npu_output)
         self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
     
+    def test_std_n_dim_shape_format_fp16(self, device):
+        """Check npu_op_dim_exec against cpu_op_dim_exec for float16 input over dims (3, 1)."""
+        shape = [128, 32, 8, 1023]
+        cases = []
+        # All four (unbiased, keepdim) pairings, with unbiased varying slowest.
+        for unbiased in (True, False):
+            for keepdim in (True, False):
+                cases.append([np.float16, 0, shape, (3, 1), unbiased, keepdim])
+        for case in cases:
+            dims, unbiased, keepdim = case[3], case[4], case[5]
+            cpu_in, npu_in = create_common_tensor(case, 0, 100)
+            # The CPU reference runs in float32 and is cast back to float16 afterwards.
+            cpu_in = cpu_in.to(torch.float32)
+            expected = self.cpu_op_dim_exec(cpu_in, dims, unbiased, keepdim)
+            expected = expected.astype(np.float16)
+            actual = self.npu_op_dim_exec(npu_in, dims, unbiased, keepdim)
+            self.assertRtolEqual(expected, actual, prec16=0.003)
+
+    def test_std_n_dim_shape_format_fp32(self, device):
+        """Check npu_op_dim_exec against cpu_op_dim_exec for float32 input over dims (3, 1)."""
+        shape = [128, 32, 8, 1023]
+        cases = []
+        # All four (unbiased, keepdim) pairings, with unbiased varying slowest.
+        for unbiased in (True, False):
+            for keepdim in (True, False):
+                cases.append([np.float32, 0, shape, (3, 1), unbiased, keepdim])
+        for case in cases:
+            dims, unbiased, keepdim = case[3], case[4], case[5]
+            cpu_in, npu_in = create_common_tensor(case, 0, 100)
+            # No dtype conversion here: both helpers receive the tensors exactly as created.
+            expected = self.cpu_op_dim_exec(cpu_in, dims, unbiased, keepdim)
+            actual = self.npu_op_dim_exec(npu_in, dims, unbiased, keepdim)
+            self.assertRtolEqual(expected, actual)
+    
 instantiate_device_type_tests(TestStd, globals(), except_for="cpu")
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_npu/test_sum_to_size.py b/test/test_npu/test_network_ops/test_sum_to_size.py
similarity index 98%
rename from test/test_npu/test_sum_to_size.py
rename to test/test_npu/test_network_ops/test_sum_to_size.py
index 1820b9d95962034f273cd50a307018c4701c8374..b7316671611250cc4b845047f00f4825fda60034 100644
--- a/test/test_npu/test_sum_to_size.py
+++ b/test/test_npu/test_network_ops/test_sum_to_size.py
@@ -64,5 +64,4 @@ class TestSumToSize(TestCase):
 
 instantiate_device_type_tests(TestSumToSize, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
diff --git a/test/test_npu/test_take.py b/test/test_npu/test_network_ops/test_take.py
similarity index 100%
rename from test/test_npu/test_take.py
rename to test/test_npu/test_network_ops/test_take.py
diff --git a/test/test_npu/test_tan.py b/test/test_npu/test_network_ops/test_tan.py
similarity index 100%
rename from test/test_npu/test_tan.py
rename to test/test_npu/test_network_ops/test_tan.py
diff --git a/test/test_npu/test_tensor_npu.py b/test/test_npu/test_network_ops/test_tensor_npu.py
similarity index 84%
rename from test/test_npu/test_tensor_npu.py
rename to test/test_npu/test_network_ops/test_tensor_npu.py
index abe6ce9b020af4fbfa8c26ab6322e8608a86b172..e0fdf11d358bf61bc29cd61188829017c8155b7f 100644
--- a/test/test_npu/test_tensor_npu.py
+++ b/test/test_npu/test_network_ops/test_tensor_npu.py
@@ -16,8 +16,8 @@
 
 import torch
 import numpy as np
-from torch.testing._internal.common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
 from util_test import create_common_tensor
 
 class TestTensorNpu(TestCase):
@@ -27,7 +27,7 @@ class TestTensorNpu(TestCase):
         return output
 
     def npu_op_exec(self, input):
-        output = torch.npu()
+        output = input.npu()
         output = output.to("cpu")
         return output
 
@@ -37,11 +37,11 @@ class TestTensorNpu(TestCase):
         return output
 
     def npu_type_exec(self, input):
-        output = torch.npu()
+        output = input.npu()
         output = output.is_npu
         return output
 
-    def test_tensor_npu_shape_format(self):
+    def test_tensor_npu_shape_format(self, device):
         shape_format = [
                 [np.float32, 0, 1],
                 [np.float32, 0, (64, 10)],
@@ -53,9 +53,9 @@ class TestTensorNpu(TestCase):
             cpu_input, npu_input = create_common_tensor(item, 1, 100)
             cpu_output = self.cpu_op_exec(cpu_input)
             npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output.cpu())
 
-    def test_is_npu_shape_format(self):
+    def test_is_npu_shape_format(self, device):
         shape_format = [
                 [np.float32, 0, 1],
                 [np.float32, 0, (64, 10)],
@@ -70,5 +70,6 @@ class TestTensorNpu(TestCase):
             self.assertEqual(cpu_output, False)
             self.assertEqual(npu_output, True)
 
+instantiate_device_type_tests(TestTensorNpu, globals(), except_for='cpu')
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_npu/test_thnn_conv_depthwise2d_backward.py b/test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_backward.py
similarity index 100%
rename from test/test_npu/test_thnn_conv_depthwise2d_backward.py
rename to test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_backward.py
diff --git a/test/test_npu/test_thnn_conv_depthwise2d_forward.py b/test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_forward.py
similarity index 100%
rename from test/test_npu/test_thnn_conv_depthwise2d_forward.py
rename to test/test_npu/test_network_ops/test_thnn_conv_depthwise2d_forward.py
diff --git a/test/test_npu/test_threshold_grad_v2_d.py b/test/test_npu/test_network_ops/test_threshold_grad_v2_d.py
similarity index 99%
rename from test/test_npu/test_threshold_grad_v2_d.py
rename to test/test_npu/test_network_ops/test_threshold_grad_v2_d.py
index f4307075bb0e9bc0b1915515fea7a4ab6f7523a6..d5baa926e98c387e1207dd934692c400b990084f 100644
--- a/test/test_npu/test_threshold_grad_v2_d.py
+++ b/test/test_npu/test_network_ops/test_threshold_grad_v2_d.py
@@ -79,5 +79,4 @@ class TestThresholdGradV2DBackward(TestCase):
 instantiate_device_type_tests(TestThresholdGradV2DBackward, globals(), except_for='cpu')
 
 if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
     run_tests()
diff --git a/test/test_npu/test_threshold_v2_d.py b/test/test_npu/test_network_ops/test_threshold_v2_d.py
similarity index 100%
rename from test/test_npu/test_threshold_v2_d.py
rename to test/test_npu/test_network_ops/test_threshold_v2_d.py
diff --git a/test/test_npu/test_trapz_dx.py b/test/test_npu/test_network_ops/test_trapz_dx.py
similarity index 96%
rename from test/test_npu/test_trapz_dx.py
rename to test/test_npu/test_network_ops/test_trapz_dx.py
index 900d890c4c4848a464ec32ef4787ff32a8a9db9f..c2e2d4aa9c941cf0a2c0ff491728e1e22076da3e 100644
--- a/test/test_npu/test_trapz_dx.py
+++ b/test/test_npu/test_network_ops/test_trapz_dx.py
@@ -20,7 +20,7 @@ import sys
 import copy
 from common_utils import TestCase, run_tests
 from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor,compare_res_new
+from util_test import create_common_tensor
 
 
 class TestTrapzDx(TestCase):
@@ -76,5 +76,4 @@ class TestTrapzDx(TestCase):
         
 instantiate_device_type_tests(TestTrapzDx, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_trapz_x.py b/test/test_npu/test_network_ops/test_trapz_x.py
similarity index 99%
rename from test/test_npu/test_trapz_x.py
rename to test/test_npu/test_network_ops/test_trapz_x.py
index 2be857a74fe85de99f0f7a828636fea9cd3457cf..84e563bb56c0a174a25293d76ee6640886a9efa4 100644
--- a/test/test_npu/test_trapz_x.py
+++ b/test/test_npu/test_network_ops/test_trapz_x.py
@@ -88,5 +88,4 @@ class TestTrapzX(TestCase):
 
 instantiate_device_type_tests(TestTrapzX, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
     run_tests()
diff --git a/test/test_npu/test_true_divide.py b/test/test_npu/test_network_ops/test_true_divide.py
similarity index 99%
rename from test/test_npu/test_true_divide.py
rename to test/test_npu/test_network_ops/test_true_divide.py
index 4beaf0cd357a3e8fc2a3d1bebd75b506ad2c47de..24f06fe4b555d5b73cfdff84cb3ac1890354a18e 100644
--- a/test/test_npu/test_true_divide.py
+++ b/test/test_npu/test_network_ops/test_true_divide.py
@@ -128,5 +128,4 @@ class TestTrueDivide(TestCase):
 instantiate_device_type_tests(TestTrueDivide, globals() , except_for='cpu')
 
 if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_trunc.py b/test/test_npu/test_network_ops/test_trunc.py
similarity index 98%
rename from test/test_npu/test_trunc.py
rename to test/test_npu/test_network_ops/test_trunc.py
index 806c9f6aa1dbaec3dae7348abb105cdd545da940..953a850bc57647c714e725ad08b03c7600f20da5 100644
--- a/test/test_npu/test_trunc.py
+++ b/test/test_npu/test_network_ops/test_trunc.py
@@ -79,7 +79,6 @@ class TestTrunc(TestCase):
 
 instantiate_device_type_tests(TestTrunc, globals(), except_for='cpu')
 if __name__=="__main__":
-    torch.npu.set_device("npu:7")
     run_tests()
 
 
diff --git a/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py b/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py
index 8b344664401627b46ea28057c1633fd8b5844c3d..0493dc0f6664b6e803f288f59693ccb25e28c968 100644
--- a/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py
+++ b/test/test_npu/test_network_ops/test_unbind_copy_contiguous.py
@@ -61,7 +61,6 @@ class TestUnbindToContiguous(TestCase):
             cpu_time += cpu_end - cpu_start
             npu_time += npu_end - npu_start
             self.assertRtolEqual(cpu_output, npu_output)
-        self.assertTrue(npu_time < 15, f"execute time:{npu_time:.2f}s should be less than 15s")
         print(f"unbind to contiguous use: {cpu_time:.5f} s (CPU)")
         print(f"unbind to contiguous use: {npu_time:.5f} s (NPU)")
         print(f"TBE Ops used: Slice")
diff --git a/test/test_npu/test_upsample_bilinear2d.py b/test/test_npu/test_network_ops/test_upsample_bilinear2d.py
similarity index 98%
rename from test/test_npu/test_upsample_bilinear2d.py
rename to test/test_npu/test_network_ops/test_upsample_bilinear2d.py
index 22c597f833c681003728e2db93c768130ae06ec2..c3e30ede9d18d4a653586a5ad180efc83a439ad9 100644
--- a/test/test_npu/test_upsample_bilinear2d.py
+++ b/test/test_npu/test_network_ops/test_upsample_bilinear2d.py
@@ -54,5 +54,4 @@ class TestUpsampleBilinear2d(TestCase):
 
 instantiate_device_type_tests(TestUpsampleBilinear2d, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_upsample_linear1d.py b/test/test_npu/test_network_ops/test_upsample_linear1d.py
similarity index 99%
rename from test/test_npu/test_upsample_linear1d.py
rename to test/test_npu/test_network_ops/test_upsample_linear1d.py
index 982b1a6eeb5e6285e6156493446e5d600b9e38c7..e2e7478b7c79b9cff3ec029fe64f4b65047e8200 100644
--- a/test/test_npu/test_upsample_linear1d.py
+++ b/test/test_npu/test_network_ops/test_upsample_linear1d.py
@@ -106,5 +106,4 @@ class TestUpsampleLinear1D(TestCase):
 
 instantiate_device_type_tests(TestUpsampleLinear1D, globals(), except_for="cpu")
 if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
     run_tests()
diff --git a/test/test_npu/test_upsample_nearest2d.py b/test/test_npu/test_network_ops/test_upsample_nearest2d.py
similarity index 100%
rename from test/test_npu/test_upsample_nearest2d.py
rename to test/test_npu/test_network_ops/test_upsample_nearest2d.py
diff --git a/test/test_npu/test_upsample_nearest2d_backward.py b/test/test_npu/test_network_ops/test_upsample_nearest2d_backward.py
similarity index 100%
rename from test/test_npu/test_upsample_nearest2d_backward.py
rename to test/test_npu/test_network_ops/test_upsample_nearest2d_backward.py
diff --git a/test/test_npu/test_network_ops/test_upsample_nearest3d.py b/test/test_npu/test_network_ops/test_upsample_nearest3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aae87d2d9c245c18e84ba2ef25ddf29d298bb7d
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_upsample_nearest3d.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# 3D upsampling requires a 5-dimensional input tensor.
+class TestUpsamleNearest3D(TestCase):
+    """Checks nearest-mode 3D interpolation outputs on NPU against CPU outputs."""
+
+    def cpu_op_exec(self, input, size):
+        """Interpolate `input` to output `size` (mode="nearest"); return a numpy array."""
+        output = torch.nn.functional.interpolate(input, size, mode="nearest")
+        return output.numpy()
+
+    def npu_op_exec(self, input, size):
+        """Like cpu_op_exec, but the result is copied to the CPU before conversion."""
+        output = torch.nn.functional.interpolate(input, size, mode="nearest")
+        return output.to("cpu").numpy()
+
+    def cpu_op_scale_exec(self, input, size):
+        """Interpolate with `size` passed as scale_factor; return a numpy array."""
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest")
+        return output.numpy()
+
+    def npu_op_scale_exec(self, input, size):
+        """Like cpu_op_scale_exec, but the result is copied to the CPU first."""
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest")
+        return output.to("cpu").numpy()
+
+    def test_upsample_nearest3d_shape_format(self, device):
+        """Forward comparison with an explicit output size of [10, 10, 10]."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_nearest3d_shape_format_scale(self, device):
+        """Forward comparison with [10, 10, 10] used as the scale factor."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            scale = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, scale)
+            npu_output = self.npu_op_scale_exec(npu_input, scale)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsamleNearest3D, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py b/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d303af74f5b24509c0993923da75f1d272aecaf
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_upsample_nearest3d_backward.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+# 3D upsampling requires a 5-dimensional input tensor.
+class TestUpsamleNearest3DBackward(TestCase):
+    """Checks nearest-mode 3D interpolation input gradients on NPU against CPU."""
+
+    def cpu_op_exec(self, input, size):
+        """Backpropagate sum(interpolate(input, size)); return input.grad as numpy."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, size, mode="nearest")
+        output.sum().backward()
+        return input.grad.numpy()
+
+    def npu_op_exec(self, input, size):
+        """Like cpu_op_exec, but the gradient is copied to the CPU before conversion."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, size, mode="nearest")
+        output.sum().backward()
+        return input.grad.to("cpu").numpy()
+
+    def cpu_op_scale_exec(self, input, size):
+        """Same gradient computation with `size` passed as scale_factor."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest")
+        output.sum().backward()
+        return input.grad.numpy()
+
+    def npu_op_scale_exec(self, input, size):
+        """Like cpu_op_scale_exec, but the gradient is copied to the CPU first."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="nearest")
+        output.sum().backward()
+        return input.grad.to("cpu").numpy()
+
+    def test_upsample_nearest3d_backward_shape_format(self, device):
+        """Gradient comparison with an explicit output size of [10, 10, 10]."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_nearest3d_backward_shape_format_scale(self, device):
+        """Gradient comparison with [10, 10, 10] used as the scale factor."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            scale = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, scale)
+            npu_output = self.npu_op_scale_exec(npu_input, scale)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestUpsamleNearest3DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_trilinear3d.py b/test/test_npu/test_network_ops/test_upsample_trilinear3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bdb4a5907dac56c2d6c58ec8777e7981ad3458c
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_upsample_trilinear3d.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# 3D upsampling requires a 5-dimensional input tensor.
+class TestUpsamleTrilinear3D(TestCase):
+    """Checks trilinear-mode 3D interpolation outputs on NPU against CPU outputs."""
+
+    def cpu_op_exec(self, input, size):
+        """Interpolate `input` to output `size` (mode="trilinear"); return a numpy array."""
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        return output.numpy()
+
+    def npu_op_exec(self, input, size):
+        """Like cpu_op_exec, but the result is copied to the CPU before conversion."""
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        return output.to("cpu").numpy()
+
+    def cpu_op_scale_exec(self, input, size):
+        """Interpolate with `size` passed as scale_factor; return a numpy array."""
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        return output.numpy()
+
+    def npu_op_scale_exec(self, input, size):
+        """Like cpu_op_scale_exec, but the result is copied to the CPU first."""
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        return output.to("cpu").numpy()
+
+    def test_upsample_trilinear3d_shape_format(self, device):
+        """Forward comparison with an explicit output size of [10, 10, 10]."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_trilinear3d_shape_format_scale(self, device):
+        """Forward comparison with [10, 10, 10] used as the scale factor."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            scale = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, scale)
+            npu_output = self.npu_op_scale_exec(npu_input, scale)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsamleTrilinear3D, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py b/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b9c92b2cfdfd15c448a82ed7550f3e6124aeec
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_upsample_trilinear3d_backward.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# 3D upsampling requires a 5-dimensional input tensor.
+class TestUpsamleTrilinear3DBackward(TestCase):
+    """Checks trilinear-mode 3D interpolation input gradients on NPU against CPU."""
+
+    def cpu_op_exec(self, input, size):
+        """Backpropagate sum(interpolate(input, size)); return input.grad as numpy."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output.sum().backward()
+        return input.grad.numpy()
+
+    def npu_op_exec(self, input, size):
+        """Like cpu_op_exec, but the gradient is copied to the CPU before conversion."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, size, mode="trilinear")
+        output.sum().backward()
+        return input.grad.to("cpu").numpy()
+
+    def cpu_op_scale_exec(self, input, size):
+        """Same gradient computation with `size` passed as scale_factor."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output.sum().backward()
+        return input.grad.numpy()
+
+    def npu_op_scale_exec(self, input, size):
+        """Like cpu_op_scale_exec, but the gradient is copied to the CPU first."""
+        input.requires_grad_(True)
+        output = torch.nn.functional.interpolate(input, scale_factor=size, mode="trilinear")
+        output.sum().backward()
+        return input.grad.to("cpu").numpy()
+
+    def test_upsample_trilinear3d_backward_shape_format(self, device):
+        """Gradient comparison with an explicit output size of [10, 10, 10]."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            size = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input, size)
+            npu_output = self.npu_op_exec(npu_input, size)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_trilinear3d_backward_shape_format_scale(self, device):
+        """Gradient comparison with [10, 10, 10] used as the scale factor."""
+        shape_format = [
+            [[np.float32, -1, (5, 3, 2, 6, 4)], [10, 10, 10]],
+            [[np.float32, -1, (2, 3, 6, 2, 4)], [10, 10, 10]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 50)
+            # Check the dtype (not the tensor itself) so half inputs are upcast for the CPU run.
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            scale = item[1]
+            cpu_output = self.cpu_op_scale_exec(cpu_input, scale)
+            npu_output = self.npu_op_scale_exec(npu_input, scale)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestUpsamleTrilinear3DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_trace.py b/test/test_npu/test_trace.py
deleted file mode 100644
index d4cf66f10a1ef3f51057d0095170fe20917b4a2c..0000000000000000000000000000000000000000
--- a/test/test_npu/test_trace.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import copy
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-LOWER = 0
-UPPER = 2
-INT_UPPER = 5
-
-
-class TestTrace(TestCase):
-
-    def generate_one_input(self, lower, upper, shape, dtype):
-        input1 = np.random.uniform(lower, upper, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-        return npu_input1
-
-
-    def cpu_op_exec(self, input1):
-        res = torch.trace(input1)
-        return res.numpy()
-
-
-    def cpu_op_exec_half(self, input1):
-        res = torch.trace(input1)
-        return res.type(torch.float16).numpy()
-
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        res = torch.trace(input1)
-        res = res.to("cpu")
-        return res.numpy()
-
-
-    def test_trace_float32(self, device):
-        for shape in [(10, 10), (10, 11), (11, 10)]:
-            input1 = generate_one_input(LOWER, UPPER, shape, np.float32)
-            cpu_output = cpu_op_exec(input1)
-            npu_output = npu_op_exec(input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_trace_float16(self, device):
-        shape = (10, 10)
-        input1 = generate_one_input(LOWER, UPPER, shape, np.float16)
-        cpu_output = cpu_op_exec_half(input1.type(torch.float32))
-        npu_output = npu_op_exec(input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_trace_int(self, device):
-        for shape, dtype in [
-            ((10, 10), np.uint8),
-            ((10, 10), np.int8),
-            ((10, 10), np.int32) 
-        ]:
-            input1 = np.random.randint(LOWER, INT_UPPER, shape, dtype)
-            input1 = torch.from_numpy(input1)
-            cpu_output = torch.trace(input1).numpy().astype(np.int32)
-            input_npu = input1.to("npu")
-            npu_output = torch.trace(input_npu)
-            npu_output = npu_output.to("cpu").numpy().astype(np.int32)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestTrace, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
-    run_tests()